Full Code of huggingface/candle for AI

main 38e7202145a8 cached

1049 files

8.7 MB

2.3M tokens

10803 symbols

1 request

Download .txt

Showing preview only (9,266K chars total). Download the full file or copy to clipboard to get everything.

Repository: huggingface/candle
Branch: main
Commit: 38e7202145a8
Files: 1049
Total size: 8.7 MB

Directory structure:
gitextract_tey_ubja/

├── .cargo/
│   └── config.toml
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── ci_cuda.yaml
│       ├── maturin.yml
│       ├── python.yml
│       ├── rust-ci.yml
│       └── trufflehog.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── Makefile
├── README.md
├── candle-book/
│   ├── .gitignore
│   ├── CONTRIBUTING.md
│   ├── Cargo.toml
│   ├── book.toml
│   └── src/
│       ├── README.md
│       ├── SUMMARY.md
│       ├── advanced/
│       │   └── mkl.md
│       ├── apps/
│       │   ├── README.md
│       │   ├── desktop.md
│       │   ├── rest.md
│       │   └── wasm.md
│       ├── chapter_1.md
│       ├── cuda/
│       │   ├── README.md
│       │   ├── porting.md
│       │   └── writing.md
│       ├── error_manage.md
│       ├── guide/
│       │   ├── cheatsheet.md
│       │   ├── hello_world.md
│       │   ├── installation.md
│       │   └── mnist/
│       │       ├── intro.md
│       │       ├── modeling.md
│       │       ├── saving_loading.md
│       │       └── training.md
│       ├── inference/
│       │   ├── cuda/
│       │   │   ├── README.md
│       │   │   ├── porting.md
│       │   │   └── writing.md
│       │   ├── hub.md
│       │   └── inference.md
│       ├── lib.rs
│       ├── simplified.rs
│       ├── tracing.md
│       └── training/
│           ├── finetuning.md
│           ├── mnist.md
│           ├── serialization.md
│           ├── simplified.md
│           └── training.md
├── candle-core/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── README.md
│   ├── benches/
│   │   ├── bench_main.rs
│   │   └── benchmarks/
│   │       ├── affine.rs
│   │       ├── binary.rs
│   │       ├── broadcast.rs
│   │       ├── conv_transpose2d.rs
│   │       ├── copy.rs
│   │       ├── matmul.rs
│   │       ├── mod.rs
│   │       ├── qmatmul.rs
│   │       ├── random.rs
│   │       ├── reduce.rs
│   │       ├── unary.rs
│   │       └── where_cond.rs
│   ├── examples/
│   │   ├── basics.rs
│   │   ├── cuda_basics.rs
│   │   ├── cuda_sum_benchmark.rs
│   │   └── metal_basics.rs
│   ├── src/
│   │   ├── accelerate.rs
│   │   ├── backend.rs
│   │   ├── backprop.rs
│   │   ├── conv.rs
│   │   ├── convert.rs
│   │   ├── cpu/
│   │   │   ├── avx.rs
│   │   │   ├── erf.rs
│   │   │   ├── kernels.rs
│   │   │   ├── mod.rs
│   │   │   ├── neon.rs
│   │   │   └── simd128.rs
│   │   ├── cpu_backend/
│   │   │   ├── conv2d.rs
│   │   │   ├── mod.rs
│   │   │   └── utils.rs
│   │   ├── cuda_backend/
│   │   │   ├── cudnn.rs
│   │   │   ├── device.rs
│   │   │   ├── error.rs
│   │   │   ├── mod.rs
│   │   │   └── utils.rs
│   │   ├── custom_op.rs
│   │   ├── device.rs
│   │   ├── display.rs
│   │   ├── dtype.rs
│   │   ├── dummy_cuda_backend.rs
│   │   ├── dummy_dtype.rs
│   │   ├── dummy_metal_backend.rs
│   │   ├── error.rs
│   │   ├── indexer.rs
│   │   ├── layout.rs
│   │   ├── lib.rs
│   │   ├── metal_backend/
│   │   │   ├── device.rs
│   │   │   └── mod.rs
│   │   ├── mkl.rs
│   │   ├── npy.rs
│   │   ├── op.rs
│   │   ├── pickle.rs
│   │   ├── quantized/
│   │   │   ├── avx.rs
│   │   │   ├── cuda.rs
│   │   │   ├── dummy_cuda.rs
│   │   │   ├── dummy_metal.rs
│   │   │   ├── ggml_file.rs
│   │   │   ├── gguf_file.rs
│   │   │   ├── imatrix_file.rs
│   │   │   ├── k_quants.rs
│   │   │   ├── metal.rs
│   │   │   ├── mod.rs
│   │   │   ├── neon.rs
│   │   │   ├── simd128.rs
│   │   │   ├── tokenizer.rs
│   │   │   └── utils.rs
│   │   ├── safetensors.rs
│   │   ├── scalar.rs
│   │   ├── shape.rs
│   │   ├── sort.rs
│   │   ├── storage.rs
│   │   ├── streaming.rs
│   │   ├── strided_index.rs
│   │   ├── tensor.rs
│   │   ├── tensor_cat.rs
│   │   ├── test_utils.rs
│   │   ├── utils.rs
│   │   └── variable.rs
│   └── tests/
│       ├── bilinear_tests.rs
│       ├── conv_tests.rs
│       ├── custom_op_tests.rs
│       ├── display_tests.rs
│       ├── fortran_tensor_3d.pth
│       ├── grad_tests.rs
│       ├── indexing_tests.rs
│       ├── layout_tests.rs
│       ├── matmul_tests.rs
│       ├── npy.py
│       ├── pool_tests.rs
│       ├── pth.py
│       ├── pth_tests.rs
│       ├── quantized_tests.rs
│       ├── serialization_tests.rs
│       ├── tensor_tests.rs
│       ├── test.npy
│       ├── test.pt
│       └── test_with_key.pt
├── candle-datasets/
│   ├── Cargo.toml
│   ├── README.md
│   └── src/
│       ├── batcher.rs
│       ├── hub.rs
│       ├── lib.rs
│       ├── nlp/
│       │   ├── mod.rs
│       │   └── tinystories.rs
│       └── vision/
│           ├── cifar.rs
│           ├── fashion_mnist.rs
│           ├── mnist.rs
│           └── mod.rs
├── candle-examples/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   ├── buildtime_downloader.rs
│   ├── examples/
│   │   ├── based/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── beit/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── bert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── bert_single_file_binary/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── bigcode/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── blip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── chatglm/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── chinese_clip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── clip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── codegeex4-9b/
│   │   │   ├── README.org
│   │   │   └── main.rs
│   │   ├── colpali/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── convmixer/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── convnext/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── csm/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── voices.safetensors
│   │   ├── custom-ops/
│   │   │   ├── README.md
│   │   │   ├── cuda_kernels.rs
│   │   │   ├── kernels/
│   │   │   │   ├── layernorm_kernels.cu
│   │   │   │   └── reduction_utils.cuh
│   │   │   └── main.rs
│   │   ├── debertav2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── deepseekv2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── depth_anything_v2/
│   │   │   ├── README.md
│   │   │   ├── color_map.rs
│   │   │   └── main.rs
│   │   ├── dinov2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── dinov2reg4/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── distilbert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── efficientnet/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── efficientvit/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── encodec/
│   │   │   ├── README.md
│   │   │   ├── audio_io.rs
│   │   │   ├── jfk-codes.safetensors
│   │   │   └── main.rs
│   │   ├── eva2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── falcon/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── fastvit/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── flux/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── t5_tokenizer.py
│   │   ├── gemma/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── gguf-tokenizer.rs
│   │   ├── glm4/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── granite/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── granitemoehybrid/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── gte-qwen/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── helium/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── hiera/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── jina-bert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── llama/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── llama2-c/
│   │   │   ├── main.rs
│   │   │   └── training.rs
│   │   ├── llama_multiprocess/
│   │   │   ├── main.rs
│   │   │   └── model.rs
│   │   ├── llava/
│   │   │   ├── constants.rs
│   │   │   ├── conversation.rs
│   │   │   ├── image_processor.rs
│   │   │   ├── main.rs
│   │   │   └── readme.md
│   │   ├── mamba/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mamba-minimal/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── model.rs
│   │   ├── mamba2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── marian-mt/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── python/
│   │   │       ├── convert_slow_tokenizer.py
│   │   │       └── requirements.txt
│   │   ├── metavoice/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mimi/
│   │   │   ├── README.md
│   │   │   ├── audio_io.rs
│   │   │   └── main.rs
│   │   ├── mistral/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mixtral/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mnist-training/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mobileclip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mobilenetv4/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mobileone/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── modernbert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── moondream/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── musicgen/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── musicgen_model.rs
│   │   ├── nomic-bert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── nvembed_v2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── olmo/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── onnx/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── onnx-llm/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── onnx_basics.rs
│   │   ├── orpheus/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── paddleocr-vl/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── paligemma/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── parler-tts/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── phi/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── pixtral/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-gemma/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-glm4/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-lfm2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-phi/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-qwen2-instruct/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-qwen3/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-qwen3-moe/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-t5/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── qwen/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── recurrent-gemma/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── reinforcement-learning/
│   │   │   ├── README.md
│   │   │   ├── atari_wrappers.py
│   │   │   ├── ddpg.rs
│   │   │   ├── dqn.rs
│   │   │   ├── gym_env.rs
│   │   │   ├── main.rs
│   │   │   ├── policy_gradient.rs
│   │   │   └── vec_gym_env.rs
│   │   ├── replit-code/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── repvgg/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── resnet/
│   │   │   ├── README.md
│   │   │   ├── export_models.py
│   │   │   └── main.rs
│   │   ├── rwkv/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── segformer/
│   │   │   ├── README.md
│   │   │   ├── assets/
│   │   │   │   └── labels.json
│   │   │   └── main.rs
│   │   ├── segment-anything/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── siglip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── silero-vad/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── smollm3/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── snac/
│   │   │   ├── audio_io.rs
│   │   │   └── main.rs
│   │   ├── splade/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── stable-diffusion/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── stable-diffusion-3/
│   │   │   ├── README.md
│   │   │   ├── clip.rs
│   │   │   ├── main.rs
│   │   │   ├── sampling.rs
│   │   │   └── vae.rs
│   │   ├── stable-lm/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── starcoder2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── stella-en-v5/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── t5/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── trocr/
│   │   │   ├── image_processor.rs
│   │   │   ├── main.rs
│   │   │   └── readme.md
│   │   ├── vgg/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── vit/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── voxtral/
│   │   │   ├── README.md
│   │   │   ├── download.rs
│   │   │   ├── main.rs
│   │   │   ├── melfilters128.bytes
│   │   │   └── model.rs
│   │   ├── whisper/
│   │   │   ├── README.md
│   │   │   ├── extract_weights.py
│   │   │   ├── main.rs
│   │   │   ├── melfilters.bytes
│   │   │   ├── melfilters128.bytes
│   │   │   └── multilingual.rs
│   │   ├── whisper-microphone/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── multilingual.rs
│   │   ├── wuerstchen/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── xlm-roberta/
│   │   │   ├── Readme.md
│   │   │   └── main.rs
│   │   ├── yi/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── yolo-v3/
│   │   │   ├── README.md
│   │   │   ├── darknet.rs
│   │   │   ├── extract-weights.py
│   │   │   ├── main.rs
│   │   │   └── yolo-v3.cfg
│   │   ├── yolo-v8/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── model.rs
│   │   └── z_image/
│   │       ├── README.md
│   │       └── main.rs
│   └── src/
│       ├── audio.rs
│       ├── bs1770.rs
│       ├── chat_template.rs
│       ├── coco_classes.rs
│       ├── imagenet.rs
│       ├── lib.rs
│       ├── token_output_stream.rs
│       └── wav.rs
├── candle-flash-attn/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   ├── kernels/
│   │   ├── alibi.h
│   │   ├── block_info.h
│   │   ├── dropout.h
│   │   ├── error.h
│   │   ├── flash.h
│   │   ├── flash_api.cu
│   │   ├── flash_fwd_hdim128_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim128_bf16_sm80.cu
│   │   ├── flash_fwd_hdim128_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim128_fp16_sm80.cu
│   │   ├── flash_fwd_hdim160_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim160_bf16_sm80.cu
│   │   ├── flash_fwd_hdim160_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim160_fp16_sm80.cu
│   │   ├── flash_fwd_hdim192_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim192_bf16_sm80.cu
│   │   ├── flash_fwd_hdim192_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim192_fp16_sm80.cu
│   │   ├── flash_fwd_hdim224_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim224_bf16_sm80.cu
│   │   ├── flash_fwd_hdim224_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim224_fp16_sm80.cu
│   │   ├── flash_fwd_hdim256_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim256_bf16_sm80.cu
│   │   ├── flash_fwd_hdim256_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim256_fp16_sm80.cu
│   │   ├── flash_fwd_hdim32_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim32_bf16_sm80.cu
│   │   ├── flash_fwd_hdim32_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim32_fp16_sm80.cu
│   │   ├── flash_fwd_hdim64_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim64_bf16_sm80.cu
│   │   ├── flash_fwd_hdim64_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim64_fp16_sm80.cu
│   │   ├── flash_fwd_hdim96_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim96_bf16_sm80.cu
│   │   ├── flash_fwd_hdim96_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim96_fp16_sm80.cu
│   │   ├── flash_fwd_kernel.h
│   │   ├── flash_fwd_launch_template.h
│   │   ├── hardware_info.h
│   │   ├── kernel_helpers.h
│   │   ├── kernel_traits.h
│   │   ├── kernel_traits_sm90.h
│   │   ├── kernels.h
│   │   ├── mask.h
│   │   ├── philox.cuh
│   │   ├── rotary.h
│   │   ├── softmax.h
│   │   ├── static_switch.h
│   │   └── utils.h
│   ├── src/
│   │   ├── ffi.rs
│   │   └── lib.rs
│   └── tests/
│       └── flash_attn_tests.rs
├── candle-flash-attn-v3/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   ├── hkernel/
│   │   ├── combine.h
│   │   ├── copy_paged_sm90_tma.hpp
│   │   ├── copy_paged_sm90_tma_cutlass35.hpp
│   │   ├── copy_paged_sm90_tma_cutlass36.hpp
│   │   ├── epilogue_fwd_sm90_tma.hpp
│   │   ├── flash.h
│   │   ├── flash_api.cpp
│   │   ├── flash_api.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_sm90.cu
│   │   ├── flash_fwd_kernel.h
│   │   ├── flash_fwd_launch_template.h
│   │   ├── kernel_traits.h
│   │   ├── mainloop_fwd_sm90_tma_gmma_ws.hpp
│   │   ├── named_barrier.hpp
│   │   ├── seq_len.h
│   │   ├── softmax.h
│   │   ├── static_switch.h
│   │   ├── tile_scheduler.hpp
│   │   └── utils.h
│   ├── src/
│   │   ├── ffi.rs
│   │   └── lib.rs
│   └── tests/
│       └── flash_attn_tests.rs
├── candle-kernels/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   └── src/
│       ├── affine.cu
│       ├── binary.cu
│       ├── binary_op_macros.cuh
│       ├── cast.cu
│       ├── compatibility.cuh
│       ├── conv.cu
│       ├── cuda_utils.cuh
│       ├── ffi.rs
│       ├── fill.cu
│       ├── indexing.cu
│       ├── lib.rs
│       ├── moe/
│       │   ├── gguf.cuh
│       │   ├── moe_gguf.cu
│       │   ├── moe_utils.cuh
│       │   ├── moe_wmma.cu
│       │   └── moe_wmma_gguf.cu
│       ├── ptx.rs
│       ├── quantized.cu
│       ├── reduce.cu
│       ├── sort.cu
│       ├── ternary.cu
│       └── unary.cu
├── candle-metal-kernels/
│   ├── Cargo.toml
│   ├── README.md
│   ├── examples/
│   │   └── metal_benchmarks.rs
│   └── src/
│       ├── err.rs
│       ├── kernel.rs
│       ├── kernels/
│       │   ├── affine.rs
│       │   ├── binary.rs
│       │   ├── cast.rs
│       │   ├── convolution.rs
│       │   ├── fill.rs
│       │   ├── indexing.rs
│       │   ├── macros.rs
│       │   ├── mlx_gemm.rs
│       │   ├── mod.rs
│       │   ├── quantized.rs
│       │   ├── random.rs
│       │   ├── reduce.rs
│       │   ├── sdpa.rs
│       │   ├── sort.rs
│       │   ├── ternary.rs
│       │   └── unary.rs
│       ├── lib.rs
│       ├── metal/
│       │   ├── buffer.rs
│       │   ├── command_buffer.rs
│       │   ├── commands.rs
│       │   ├── compute_pipeline.rs
│       │   ├── device.rs
│       │   ├── encoder.rs
│       │   ├── library.rs
│       │   └── mod.rs
│       ├── metal_src/
│       │   ├── affine.metal
│       │   ├── binary.metal
│       │   ├── cast.metal
│       │   ├── conv.metal
│       │   ├── fill.metal
│       │   ├── indexing.metal
│       │   ├── mlx_gemm.metal
│       │   ├── mlx_sort.metal
│       │   ├── quantized.metal
│       │   ├── random.metal
│       │   ├── reduce.metal
│       │   ├── scaled_dot_product_attention.metal
│       │   ├── sort.metal
│       │   ├── ternary.metal
│       │   ├── unary.metal
│       │   └── utils.metal
│       ├── source.rs
│       ├── tests.rs
│       └── utils.rs
├── candle-nn/
│   ├── Cargo.toml
│   ├── README.md
│   ├── benches/
│   │   ├── bench_main.rs
│   │   └── benchmarks/
│   │       ├── conv.rs
│   │       ├── mod.rs
│   │       ├── norm.rs
│   │       └── softmax.rs
│   ├── examples/
│   │   ├── basic_optimizer.rs
│   │   └── cpu_benchmarks.rs
│   ├── src/
│   │   ├── activation.rs
│   │   ├── batch_norm.rs
│   │   ├── conv.rs
│   │   ├── cpu_flash_attention.rs
│   │   ├── embedding.rs
│   │   ├── encoding.rs
│   │   ├── func.rs
│   │   ├── group_norm.rs
│   │   ├── init.rs
│   │   ├── kv_cache.rs
│   │   ├── layer_norm.rs
│   │   ├── lib.rs
│   │   ├── linear.rs
│   │   ├── loss.rs
│   │   ├── moe.rs
│   │   ├── ops.rs
│   │   ├── optim.rs
│   │   ├── rnn.rs
│   │   ├── rotary_emb.rs
│   │   ├── sampling.rs
│   │   ├── sequential.rs
│   │   ├── var_builder.rs
│   │   └── var_map.rs
│   └── tests/
│       ├── batch_norm.rs
│       ├── cpu_flash_attn.rs
│       ├── group_norm.rs
│       ├── kv_cache.rs
│       ├── layer_norm.rs
│       ├── loss.rs
│       ├── one_hot.rs
│       ├── ops.rs
│       ├── optim.rs
│       ├── rnn.rs
│       └── sdpa.rs
├── candle-onnx/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   ├── src/
│   │   ├── eval.rs
│   │   ├── lib.rs
│   │   └── onnx.proto3
│   └── tests/
│       └── ops.rs
├── candle-pyo3/
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── README.md
│   ├── _additional_typing/
│   │   ├── README.md
│   │   └── __init__.py
│   ├── build.rs
│   ├── e5.py
│   ├── py_src/
│   │   └── candle/
│   │       ├── __init__.py
│   │       ├── __init__.pyi
│   │       ├── functional/
│   │       │   ├── __init__.py
│   │       │   └── __init__.pyi
│   │       ├── models/
│   │       │   ├── bert.py
│   │       │   └── llama.py
│   │       ├── nn/
│   │       │   ├── __init__.py
│   │       │   ├── __init__.pyi
│   │       │   ├── container.py
│   │       │   ├── linear.py
│   │       │   ├── module.py
│   │       │   ├── normalization.py
│   │       │   └── sparse.py
│   │       ├── onnx/
│   │       │   ├── __init__.py
│   │       │   └── __init__.pyi
│   │       ├── testing/
│   │       │   └── __init__.py
│   │       ├── typing/
│   │       │   └── __init__.py
│   │       └── utils/
│   │           ├── __init__.py
│   │           └── __init__.pyi
│   ├── pyproject.toml
│   ├── quant-llama.py
│   ├── src/
│   │   ├── lib.rs
│   │   ├── onnx.rs
│   │   ├── shape.rs
│   │   └── utils.rs
│   ├── stub.py
│   ├── test.py
│   ├── test_pytorch.py
│   └── tests/
│       ├── __init__.py
│       ├── bindings/
│       │   ├── test_linear.py
│       │   ├── test_module.py
│       │   └── test_testing.py
│       └── native/
│           ├── test_shape.py
│           ├── test_tensor.py
│           └── test_utils.py
├── candle-transformers/
│   ├── Cargo.toml
│   ├── README.md
│   ├── src/
│   │   ├── fused_moe.rs
│   │   ├── generation/
│   │   │   └── mod.rs
│   │   ├── lib.rs
│   │   ├── models/
│   │   │   ├── based.rs
│   │   │   ├── beit.rs
│   │   │   ├── bert.rs
│   │   │   ├── bigcode.rs
│   │   │   ├── blip.rs
│   │   │   ├── blip_text.rs
│   │   │   ├── chatglm.rs
│   │   │   ├── chinese_clip/
│   │   │   │   ├── mod.rs
│   │   │   │   ├── text_model.rs
│   │   │   │   └── vision_model.rs
│   │   │   ├── clip/
│   │   │   │   ├── mod.rs
│   │   │   │   ├── text_model.rs
│   │   │   │   └── vision_model.rs
│   │   │   ├── codegeex4_9b.rs
│   │   │   ├── colpali.rs
│   │   │   ├── convmixer.rs
│   │   │   ├── convnext.rs
│   │   │   ├── csm.rs
│   │   │   ├── dac.rs
│   │   │   ├── debertav2.rs
│   │   │   ├── deepseek2.rs
│   │   │   ├── depth_anything_v2.rs
│   │   │   ├── dinov2.rs
│   │   │   ├── dinov2reg4.rs
│   │   │   ├── distilbert.rs
│   │   │   ├── efficientnet.rs
│   │   │   ├── efficientvit.rs
│   │   │   ├── encodec.rs
│   │   │   ├── eva2.rs
│   │   │   ├── falcon.rs
│   │   │   ├── fastvit.rs
│   │   │   ├── flux/
│   │   │   │   ├── autoencoder.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── model.rs
│   │   │   │   ├── quantized_model.rs
│   │   │   │   └── sampling.rs
│   │   │   ├── gemma.rs
│   │   │   ├── gemma2.rs
│   │   │   ├── gemma3.rs
│   │   │   ├── glm4.rs
│   │   │   ├── glm4_new.rs
│   │   │   ├── granite.rs
│   │   │   ├── granitemoehybrid.rs
│   │   │   ├── helium.rs
│   │   │   ├── hiera.rs
│   │   │   ├── jina_bert.rs
│   │   │   ├── llama.rs
│   │   │   ├── llama2_c.rs
│   │   │   ├── llama2_c_weights.rs
│   │   │   ├── llava/
│   │   │   │   ├── config.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── utils.rs
│   │   │   ├── mamba.rs
│   │   │   ├── mamba2.rs
│   │   │   ├── marian.rs
│   │   │   ├── metavoice.rs
│   │   │   ├── mimi/
│   │   │   │   ├── conv.rs
│   │   │   │   ├── encodec.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── quantization.rs
│   │   │   │   ├── seanet.rs
│   │   │   │   └── transformer.rs
│   │   │   ├── mistral.rs
│   │   │   ├── mixformer.rs
│   │   │   ├── mixtral.rs
│   │   │   ├── mmdit/
│   │   │   │   ├── blocks.rs
│   │   │   │   ├── embedding.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── model.rs
│   │   │   │   └── projections.rs
│   │   │   ├── mobileclip.rs
│   │   │   ├── mobilenetv4.rs
│   │   │   ├── mobileone.rs
│   │   │   ├── mod.rs
│   │   │   ├── modernbert.rs
│   │   │   ├── moondream.rs
│   │   │   ├── mpt.rs
│   │   │   ├── nomic_bert.rs
│   │   │   ├── nvembed_v2/
│   │   │   │   ├── embedding.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── model.rs
│   │   │   ├── olmo.rs
│   │   │   ├── olmo2.rs
│   │   │   ├── openclip/
│   │   │   │   ├── mod.rs
│   │   │   │   └── text_model.rs
│   │   │   ├── paddleocr_vl/
│   │   │   │   ├── config.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── text.rs
│   │   │   │   └── vision.rs
│   │   │   ├── paligemma.rs
│   │   │   ├── parler_tts.rs
│   │   │   ├── persimmon.rs
│   │   │   ├── phi.rs
│   │   │   ├── phi3.rs
│   │   │   ├── pixtral/
│   │   │   │   ├── llava.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── vision_model.rs
│   │   │   ├── quantized_blip.rs
│   │   │   ├── quantized_blip_text.rs
│   │   │   ├── quantized_gemma3.rs
│   │   │   ├── quantized_glm4.rs
│   │   │   ├── quantized_lfm2.rs
│   │   │   ├── quantized_llama.rs
│   │   │   ├── quantized_llama2_c.rs
│   │   │   ├── quantized_metavoice.rs
│   │   │   ├── quantized_mistral.rs
│   │   │   ├── quantized_mixformer.rs
│   │   │   ├── quantized_moondream.rs
│   │   │   ├── quantized_mpt.rs
│   │   │   ├── quantized_phi.rs
│   │   │   ├── quantized_phi3.rs
│   │   │   ├── quantized_qwen2.rs
│   │   │   ├── quantized_qwen3.rs
│   │   │   ├── quantized_qwen3_moe.rs
│   │   │   ├── quantized_recurrent_gemma.rs
│   │   │   ├── quantized_rwkv_v5.rs
│   │   │   ├── quantized_rwkv_v6.rs
│   │   │   ├── quantized_stable_lm.rs
│   │   │   ├── quantized_t5.rs
│   │   │   ├── qwen2.rs
│   │   │   ├── qwen2_moe.rs
│   │   │   ├── qwen3.rs
│   │   │   ├── qwen3_moe.rs
│   │   │   ├── qwen3_vl/
│   │   │   │   ├── config.rs
│   │   │   │   ├── conv3d_temporal_2.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── text.rs
│   │   │   │   └── vision.rs
│   │   │   ├── recurrent_gemma.rs
│   │   │   ├── repvgg.rs
│   │   │   ├── resnet.rs
│   │   │   ├── rwkv_v5.rs
│   │   │   ├── rwkv_v6.rs
│   │   │   ├── rwkv_v7.rs
│   │   │   ├── segformer.rs
│   │   │   ├── segment_anything/
│   │   │   │   ├── image_encoder.rs
│   │   │   │   ├── mask_decoder.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── prompt_encoder.rs
│   │   │   │   ├── sam.rs
│   │   │   │   ├── tiny_vit.rs
│   │   │   │   └── transformer.rs
│   │   │   ├── siglip.rs
│   │   │   ├── smol/
│   │   │   │   ├── README.md
│   │   │   │   ├── mod.rs
│   │   │   │   ├── quantized_smollm3.rs
│   │   │   │   └── smollm3.rs
│   │   │   ├── snac.rs
│   │   │   ├── stable_diffusion/
│   │   │   │   ├── attention.rs
│   │   │   │   ├── clip.rs
│   │   │   │   ├── ddim.rs
│   │   │   │   ├── ddpm.rs
│   │   │   │   ├── embeddings.rs
│   │   │   │   ├── euler_ancestral_discrete.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── resnet.rs
│   │   │   │   ├── schedulers.rs
│   │   │   │   ├── unet_2d.rs
│   │   │   │   ├── unet_2d_blocks.rs
│   │   │   │   ├── uni_pc.rs
│   │   │   │   ├── utils.rs
│   │   │   │   └── vae.rs
│   │   │   ├── stable_lm.rs
│   │   │   ├── starcoder2.rs
│   │   │   ├── stella_en_v5.rs
│   │   │   ├── t5.rs
│   │   │   ├── trocr.rs
│   │   │   ├── vgg.rs
│   │   │   ├── vit.rs
│   │   │   ├── voxtral/
│   │   │   │   ├── audio.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── model.rs
│   │   │   │   └── voxtral_llama.rs
│   │   │   ├── whisper/
│   │   │   │   ├── audio.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── model.rs
│   │   │   │   └── quantized_model.rs
│   │   │   ├── with_tracing.rs
│   │   │   ├── wuerstchen/
│   │   │   │   ├── attention_processor.rs
│   │   │   │   ├── common.rs
│   │   │   │   ├── ddpm.rs
│   │   │   │   ├── diffnext.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── paella_vq.rs
│   │   │   │   └── prior.rs
│   │   │   ├── xlm_roberta.rs
│   │   │   ├── yi.rs
│   │   │   └── z_image/
│   │   │       ├── mod.rs
│   │   │       ├── preprocess.rs
│   │   │       ├── sampling.rs
│   │   │       ├── scheduler.rs
│   │   │       ├── text_encoder.rs
│   │   │       ├── transformer.rs
│   │   │       └── vae.rs
│   │   ├── object_detection.rs
│   │   ├── pipelines/
│   │   │   ├── mod.rs
│   │   │   └── text_generation.rs
│   │   ├── quantized_nn.rs
│   │   ├── quantized_var_builder.rs
│   │   └── utils.rs
│   └── tests/
│       ├── generation_tests.rs
│       └── nms_tests.rs
├── candle-ug/
│   ├── Cargo.toml
│   └── src/
│       └── lib.rs
├── candle-wasm-examples/
│   ├── bert/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── bertWorker.js
│   │   ├── build-lib.sh
│   │   ├── lib-example.html
│   │   ├── src/
│   │   │   ├── bin/
│   │   │   │   └── m.rs
│   │   │   └── lib.rs
│   │   └── utils.js
│   ├── blip/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── blipWorker.js
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   └── src/
│   │       ├── bin/
│   │       │   └── m.rs
│   │       ├── lib.rs
│   │       └── token_output_stream.rs
│   ├── chat-template/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   └── src/
│   │       └── lib.rs
│   ├── llama2-c/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   ├── lib-example.html
│   │   ├── llama2cWorker.js
│   │   └── src/
│   │       ├── app.rs
│   │       ├── bin/
│   │       │   ├── app.rs
│   │       │   ├── m.rs
│   │       │   └── worker.rs
│   │       ├── lib.rs
│   │       ├── model.rs
│   │       └── worker.rs
│   ├── moondream/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── code.js
│   │   ├── index.html
│   │   ├── moondreamWorker.js
│   │   └── src/
│   │       ├── bin/
│   │       │   └── m.rs
│   │       └── lib.rs
│   ├── phi/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   ├── phiWorker.js
│   │   └── src/
│   │       ├── bin/
│   │       │   └── m.rs
│   │       └── lib.rs
│   ├── quant-qwen3/
│   │   ├── .cargo/
│   │   │   └── config.toml
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── index.html
│   │   ├── serve.py
│   │   └── src/
│   │       ├── lib.rs
│   │       ├── m.rs
│   │       └── profiler.rs
│   ├── segment-anything/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── lib-example.html
│   │   ├── samWorker.js
│   │   └── src/
│   │       ├── bin/
│   │       │   └── m.rs
│   │       └── lib.rs
│   ├── t5/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── T5ModelConditionalGeneration.js
│   │   ├── T5ModelEncoderWorker.js
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   ├── src/
│   │   │   ├── bin/
│   │   │   │   ├── m-quantized.rs
│   │   │   │   └── m.rs
│   │   │   └── lib.rs
│   │   └── utils.js
│   ├── whisper/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   ├── lib-example.html
│   │   ├── main.js
│   │   ├── src/
│   │   │   ├── app.rs
│   │   │   ├── audio.rs
│   │   │   ├── bin/
│   │   │   │   ├── app.rs
│   │   │   │   ├── m.rs
│   │   │   │   └── worker.rs
│   │   │   ├── languages.rs
│   │   │   ├── lib.rs
│   │   │   └── worker.rs
│   │   └── whisperWorker.js
│   └── yolo/
│       ├── Cargo.toml
│       ├── README.md
│       ├── build-lib.sh
│       ├── index.html
│       ├── lib-example.html
│       ├── src/
│       │   ├── app.rs
│       │   ├── bin/
│       │   │   ├── app.rs
│       │   │   ├── m.rs
│       │   │   └── worker.rs
│       │   ├── coco_classes.rs
│       │   ├── lib.rs
│       │   ├── model.rs
│       │   └── worker.rs
│       └── yoloWorker.js
├── candle-wasm-tests/
│   ├── Cargo.toml
│   ├── README.md
│   ├── src/
│   │   └── lib.rs
│   ├── tests/
│   │   └── quantized_tests.rs
│   └── webdriver.json
└── tensor-tools/
    ├── Cargo.toml
    └── src/
        └── main.rs

================================================
FILE CONTENTS
================================================

================================================
FILE: .cargo/config.toml
================================================
[build]
rustflags = ["-C", "target-cpu=native"]

[target.wasm32-unknown-unknown]
rustflags = ["-C", "target-feature=+simd128", "--cfg", 'getrandom_backend="wasm_js"']

[target.x86_64-apple-darwin]
rustflags = ["-C", "target-feature=-avx,-avx2"]

================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
  - package-ecosystem: "cargo"
    directory: "/"
    schedule:
      interval: "weekly"
    open-pull-requests-limit: 5


================================================
FILE: .github/workflows/ci_cuda.yaml
================================================
name: CI / cuda

on:
  workflow_dispatch:
  pull_request:

jobs:
  test-cuda:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    runs-on:
      group: aws-g5-4xlarge-cache
    container:
      image: nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04
    if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
    permissions:
      contents: write
      packages: write
      # This is used to complete the identity challenge
      # with sigstore/fulcio when running outside of PRs.
      id-token: write
      security-events: write
    env:
      CUDA_COMPUTE_CAP: 86
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Install dependencies
        run: apt update && apt install curl build-essential libssl-dev protobuf-compiler pkg-config -y
      - name: Install Rust Stable
        uses: dtolnay/rust-toolchain@stable
      - uses: Swatinem/rust-cache@v2
      - name: Test (cuda)
        run: cargo test --features cuda


================================================
FILE: .github/workflows/maturin.yml
================================================
name: PyO3-Wheels

on:
  push:
    branches:
      - main
    tags:
      - '*'
    paths:
      - candle-pyo3/**
  pull_request:
    paths:
      - candle-pyo3/**
  workflow_dispatch:

permissions:
  contents: read

env:
  PROTOC_VERSION: '25.0'
  FEATURES_FLAG: '--features onnx'

jobs:
  linux:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        target: [x86_64, x86, aarch64, s390x, ppc64le]
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: '3.13'
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.target }}
          args: --release --out dist --find-interpreter
          sccache: 'true'
          manylinux: auto
          working-directory: ./candle-pyo3
      - name: Upload wheels
        uses: actions/upload-artifact@v6
        with:
          name: wheels-linux-${{ matrix.target }}
          path: ./candle-pyo3/dist

  windows:
    runs-on: windows-latest
    strategy:
      matrix:
        target: [x64, x86]
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: '3.13'
          architecture: ${{ matrix.target }}
      - name: Install Protoc
        uses: arduino/setup-protoc@v3
        with:
          version: ${{ env.PROTOC_VERSION }}
          repo-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.target }}
          args: --release --out dist --find-interpreter ${{ env.FEATURES_FLAG }}
          sccache: 'true'
          working-directory: ./candle-pyo3
      - name: Upload wheels
        uses: actions/upload-artifact@v6
        with:
          name: wheels-windows-${{ matrix.target }}
          path: ./candle-pyo3/dist

  macos:
    runs-on: macos-latest
    strategy:
      matrix:
        target: [x86_64, aarch64]
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: '3.13'
      - name: Install Protoc
        uses: arduino/setup-protoc@v3
        with:
            version: ${{ env.PROTOC_VERSION }}
            repo-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.target }}
          args: --release --out dist --find-interpreter ${{ env.FEATURES_FLAG }}
          sccache: 'true'
          working-directory: ./candle-pyo3
      - name: Upload wheels
        uses: actions/upload-artifact@v6
        with:
          name: wheels-macos-${{ matrix.target }}
          path: ./candle-pyo3/dist

  sdist:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - name: Install Protoc
        uses: arduino/setup-protoc@v2
        with:
          version: ${{ env.PROTOC_VERSION }}
          repo-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out dist
          working-directory: ./candle-pyo3
      - name: Upload sdist
        uses: actions/upload-artifact@v6
        with:
          name: wheels-sdist
          path: ./candle-pyo3/dist



================================================
FILE: .github/workflows/python.yml
================================================
name: PyO3-CI

on:
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - candle-pyo3/**
  pull_request:
    paths:
      - candle-pyo3/**

jobs:
  build_and_test:
    name: Check everything builds & tests
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest] # For now, only test on Linux
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Install Python
        uses: actions/setup-python@v6
        with:
          python-version: 3.13
          architecture: "x64"

      - name: Cache Cargo Registry
        uses: actions/cache@v5
        with:
          path: ~/.cargo/registry
          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}

      - name: Install Protoc
        uses: arduino/setup-protoc@v2
        with:
          version: "25.0"
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Install
        working-directory: ./candle-pyo3
        run: |
          python -m venv .env
          source .env/bin/activate
          pip install -U pip
          pip install pytest maturin black
          python -m maturin develop -r --features onnx

      - name: Check style
        working-directory: ./candle-pyo3
        run: |
          source .env/bin/activate
          python stub.py --check
          black --check .

      - name: Run tests
        working-directory: ./candle-pyo3
        run: |
          source .env/bin/activate
          python -m pytest -s -v tests


================================================
FILE: .github/workflows/rust-ci.yml
================================================
on:
  push:
    branches:
      - main
  pull_request:

name: Continuous integration

jobs:
  check:
    name: Check
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, ubuntu-24.04, windows-latest, macOS-latest, ubuntu-24.04-arm]
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: "3.13"
      - name: Remove cargo config (macOS ring crate fix)
        if: runner.os == 'macOS'
        run: rm -f .cargo/config.toml
      - uses: dtolnay/rust-toolchain@stable

      - name: Run macos with metal
        if: matrix.os == 'macOS-latest' 
        run: cargo check --workspace --features metal

      - name: Run normal cpu
        if: matrix.os == 'ubuntu-latest' || matrix.os == 'windows-latest'
        run: cargo check --workspace

      - name: Run with avx2
        if: matrix.os == 'ubuntu-24.04'
        run: |
          export RUSTFLAGS="-C target-feature=avx2"
          cargo check --workspace 

      - name: Run with arm neon
        if: matrix.os == 'ubuntu-24.04-arm'
        run: |
          export RUSTFLAGS="-C target-feature=neon"
          cargo check --workspace 

  test:
    name: Test Suite
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macOS-latest]
    steps:
      - name: Free disk space (Linux)
        if: runner.os == 'Linux'
        run: |
          sudo rm -rf /opt/hostedtoolcache
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          df -h
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: "3.13"
      - name: Remove cargo config (macOS ring crate fix)
        if: runner.os == 'macOS'
        run: rm -f .cargo/config.toml
      - uses: dtolnay/rust-toolchain@stable
      - name: Install lld (Linux only)
        if: runner.os == 'Linux'
        run: sudo apt-get update && sudo apt-get install -y lld
      - name: Run tests (with lld on Linux)
        if: runner.os == 'Linux'
        env:
          RUSTFLAGS: "-C link-arg=-fuse-ld=lld"
        run: cargo test --workspace
      - name: Run tests (Windows & macOS)
        if: runner.os != 'Linux'
        run: cargo test --workspace

  fmt:
    name: Rustfmt
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt
      - run: cargo fmt --all -- --check

  clippy:
    name: Clippy
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: clippy
      - run: cargo clippy --workspace --tests --examples --benches -- -D warnings
 

================================================
FILE: .github/workflows/trufflehog.yml
================================================
on:
  push:

name: Secret Leaks

jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@main


================================================
FILE: .gitignore
================================================
# Generated by Cargo
# will have compiled files and executables
debug/
data/
dist/
target/

# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock

# editor config
.helix
.vscode
.zed

# These are backup files generated by rustfmt
**/*.rs.bk

# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

*tokenizer*.json
*.npz

perf.data
flamegraph.svg
*.dylib
*.so
*.swp
*.swo
trace-*.json

candle-wasm-examples/*/build
candle-wasm-examples/*/*.bin
candle-wasm-examples/*/*.jpeg
candle-wasm-examples/*/audios/*.wav
candle-wasm-examples/**/*.safetensors
candle-wasm-examples/**/*.gguf
candle-wasm-examples/*/package-lock.json
candle-wasm-examples/**/config*.json
.DS_Store
.idea/*
__pycache__
out.safetensors
out.wav
bria.mp3
bria.safetensors
bria.wav


================================================
FILE: .pre-commit-config.yaml
================================================
repos:
  - repo: https://github.com/Narsil/pre-commit-rust
    rev: 2eed6366172ef2a5186e8785ec0e67243d7d73d0
    hooks:
      - id: fmt
        name: "Rust (fmt)"
      - id: clippy
        name: "Rust (clippy)"
        args:
          [
            "--tests",
            "--examples",
            "--",
            "-Dwarnings",
          ]


================================================
FILE: CHANGELOG.md
================================================
# Changelog
This documents the main changes to the `candle` crate.

## v0.3.1 - Unreleased

### Added

### Modified

## v0.3.0 - 2023-10-01

### Added

- Added the Mistral 7b v0.1 model
  [983](https://github.com/huggingface/candle/pull/983).
- Quantized version of the Mistral model
  [1009](https://github.com/huggingface/candle/pull/1009).
- Add the gelu-erf op and activation function
  [969](https://github.com/huggingface/candle/pull/969).
- Add the mixformer/phi-v1.5 model
  [930](https://github.com/huggingface/candle/pull/930).
- Add the slice-scatter op
  [927](https://github.com/huggingface/candle/pull/927).
- Add the Wuerstchen diffusion model
  [911](https://github.com/huggingface/candle/pull/911).

### Modified

- Support for simd128 intrinsics in some quantized vecdots
  [982](https://github.com/huggingface/candle/pull/982).
- Optimize the index-select cuda kernel
  [976](https://github.com/huggingface/candle/pull/976).
- Self-contained safetensor wrappers
  [946](https://github.com/huggingface/candle/pull/946).

## v0.2.2 - 2023-09-18

### Added
- Support for `top_p` sampling
  [819](https://github.com/huggingface/candle/pull/819).
- T5 model including decoding
  [864](https://github.com/huggingface/candle/pull/864).
- 1-d upsampling
  [839](https://github.com/huggingface/candle/pull/839).

### Modified
- Bugfix for conv2d
  [820](https://github.com/huggingface/candle/pull/820).
- Support tensor based indexing using `.i`
  [842](https://github.com/huggingface/candle/pull/842).

## v0.2.1 - 2023-09-11

### Added
- Add some RNNs (GRU and LSTM) in `candle-nn`
  [674](https://github.com/huggingface/candle/pull/674),
  [688](https://github.com/huggingface/candle/pull/688).
- gguf v2 support
  [725](https://github.com/huggingface/candle/pull/725).
- Quantized llama example in Python using the pyo3 api
  [716](https://github.com/huggingface/candle/pull/716).
- `candle-nn` layer for conv2d-transposed
  [760](https://github.com/huggingface/candle/pull/760).
- Add the Segment-Anything Model (SAM) as an example
  [773](https://github.com/huggingface/candle/pull/773).
- TinyViT backbone for the segment anything example
  [787](https://github.com/huggingface/candle/pull/787).
- Shape with holes support
  [770](https://github.com/huggingface/candle/pull/770).

### Modified
- Dilations are now supported in conv-transpose2d.
  [671](https://github.com/huggingface/candle/pull/671).
- Interactive mode for the quantized model
  [690](https://github.com/huggingface/candle/pull/690).
- Faster softmax operation
  [747](https://github.com/huggingface/candle/pull/747).
- Faster convolution operations on CPU and CUDA via im2col
  [802](https://github.com/huggingface/candle/pull/802).
- Moving some models to a more central location
  [796](https://github.com/huggingface/candle/pull/796).

## v0.2.0 - 2023-08-30

### Added
- Add the powf op
  [664](https://github.com/huggingface/candle/pull/664).
- Stable Diffusion XL support
  [647](https://github.com/huggingface/candle/pull/647).
- Add the conv-transpose2d op
  [635](https://github.com/huggingface/candle/pull/635).
- Refactor the VarBuilder api
  [627](https://github.com/huggingface/candle/pull/627).
- Add some quantization command
  [625](https://github.com/huggingface/candle/pull/625).
- Support more quantized types, e.g. Q2K, Q4K, Q5K...
  [586](https://github.com/huggingface/candle/pull/586).
- Add pose estimation to the yolo example
  [589](https://github.com/huggingface/candle/pull/589).
- Api to write GGUF files
  [585](https://github.com/huggingface/candle/pull/585).
- Support more quantization types
  [580](https://github.com/huggingface/candle/pull/580).
- Add EfficientNet as an example Computer Vision model
  [572](https://github.com/huggingface/candle/pull/572).
- Add a group parameter to convolutions
  [566](https://github.com/huggingface/candle/pull/566).
- New dtype: int64
  [563](https://github.com/huggingface/candle/pull/563).
- Handling of the GGUF file format.
  [559](https://github.com/huggingface/candle/pull/559).

## v0.1.2 - 2023-08-21


================================================
FILE: Cargo.toml
================================================
[workspace]
members = [
    "candle-core",
    "candle-datasets",
    "candle-examples",
    "candle-nn",
    "candle-pyo3",
    "candle-transformers",
    "candle-ug",
    "candle-wasm-examples/*",
    "candle-wasm-tests",
    "tensor-tools",
]
exclude = [
    "candle-book",
    "candle-flash-attn",
    "candle-flash-attn-v3",
    "candle-kernels",
    "candle-metal-kernels",
    "candle-onnx",
]
resolver = "2"

[workspace.package]
version = "0.9.2"
edition = "2021"
description = "Minimalist ML framework."
repository = "https://github.com/huggingface/candle"
keywords = ["blas", "tensor", "machine-learning"]
categories = ["science"]
license = "MIT OR Apache-2.0"

[workspace.dependencies]
ab_glyph = "0.2.23"
accelerate-src = { version = "0.3.2" }
anyhow = { version = "1", features = ["backtrace"] }
byteorder = "1.4.3"
candle = { path = "./candle-core", package = "candle-core", version = "0.9.2" }
candle-datasets = { path = "./candle-datasets", version = "0.9.2" }
candle-flash-attn = { path = "./candle-flash-attn", version = "0.9.2" }
candle-flash-attn-v3 = { path = "./candle-flash-attn-v3", version = "0.9.2" }
candle-kernels = { path = "./candle-kernels", version = "0.9.2" }
candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.9.2" }
candle-nn = { path = "./candle-nn", version = "0.9.2" }
candle-onnx = { path = "./candle-onnx", version = "0.9.2" }
candle-transformers = { path = "./candle-transformers", version = "0.9.2" }
candle-ug = { path = "./candle-ug", version = "0.9.2" }
clap = { version = "4.2.4", features = ["derive"] }
criterion = { version = "0.8", default-features = false }
cudarc = { version = "0.19.1", features = [
    "std",
    "cublas",
    "cublaslt",
    "curand",
    "driver",
    "nvrtc",
    "f16",
    "f8",
    "cuda-version-from-build-system",
    "dynamic-linking",
], default-features = false }
fancy-regex = "0.17.0"
gemm = { version = "0.19.0", features = ["wasm-simd128-enable"] }
hf-hub = "0.4.1"
half = { version = "2.5.0", features = [
    "num-traits",
    "use-intrinsics",
    "rand_distr",
] }
float8 = { version = "0.7.0", features = ["num-traits", "rand_distr"] }
hound = "3.5.1"
image = { version = "0.25.2", default-features = false, features = [
    "jpeg",
    "png",
] }
imageproc = { version = "0.26.0", features = [
    "text",
], default-features = false }
intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
libc = { version = "0.2.147" }
libm = { version = "0.2.15" }
log = "0.4"
memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
num_cpus = "1.15.0"
num-traits = "0.2.15"
parquet = "57"
rand = "0.9.0"
rand_distr = "0.5.1"
rayon = "1.7.0"
safetensors = "0.7.0"
serde = { version = "1.0.171", features = ["derive"] }
serde_plain = "1.0.2"
serde_json = "1.0.99"
thiserror = "2"
tokenizers = { version = "0.22.0", default-features = false }
tracing = "0.1.37"
tracing-chrome = "0.7.1"
tracing-subscriber = "0.3.7"
ug = "0.5.0"
ug-cuda = "0.5.0"
ug-metal = "0.5.0"
yoke = { version = "0.8.1", features = ["derive"] }
zip = { version = "7.2.0", default-features = false }
objc2-metal = { version = "0.3.1" }
objc2-foundation = { version = "0.3.1" }

[profile.release-with-debug]
inherits = "release"
debug = true


================================================
FILE: LICENSE-APACHE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: LICENSE-MIT
================================================
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.


================================================
FILE: Makefile
================================================
.PHONY: clean-ptx clean test

clean-ptx:
	find target -name "*.ptx" -type f -delete
	echo "" > candle-kernels/src/lib.rs
	touch candle-kernels/build.rs
	touch candle-examples/build.rs
	touch candle-flash-attn/build.rs

clean:
	cargo clean

test:
	cargo test

all: test


================================================
FILE: README.md
================================================
# candle
[![discord server](https://dcbadge.limes.pink/api/server/hugging-face-879548962464493619)](https://discord.gg/hugging-face-879548962464493619)
[![Latest version](https://img.shields.io/crates/v/candle-core.svg)](https://crates.io/crates/candle-core)
[![Documentation](https://docs.rs/candle-core/badge.svg)](https://docs.rs/candle-core)
[![License](https://img.shields.io/badge/license-MIT-blue?style=flat-square)](https://github.com/huggingface/candle/blob/main/LICENSE-MIT)
[![License](https://img.shields.io/badge/license-Apache%202.0-blue?style=flat-square)](https://github.com/huggingface/candle/blob/main/LICENSE-APACHE)

Candle is a minimalist ML framework for Rust with a focus on performance (including GPU support) 
and ease of use. Try our online demos: 
[whisper](https://huggingface.co/spaces/lmz/candle-whisper),
[LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2),
[T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
[yolo](https://huggingface.co/spaces/lmz/candle-yolo),
[Segment
Anything](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).

## Get started

Make sure that you have [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) correctly installed as described in [**Installation**](https://huggingface.github.io/candle/guide/installation.html).

Let's see how to run a simple matrix multiplication.
Write the following to your `myapp/src/main.rs` file:
```rust
use candle_core::{Device, Tensor};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let device = Device::Cpu;

    let a = Tensor::randn(0f32, 1., (2, 3), &device)?;
    let b = Tensor::randn(0f32, 1., (3, 4), &device)?;

    let c = a.matmul(&b)?;
    println!("{c}");
    Ok(())
}
```

`cargo run` should display a tensor of shape `Tensor[[2, 4], f32]`.


Having installed `candle` with Cuda support, simply define the `device` to be on GPU:

```diff
- let device = Device::Cpu;
+ let device = Device::new_cuda(0)?;
```

For more advanced examples, please have a look at the following section.

## Check out our examples

These online demos run entirely in your browser:
- [yolo](https://huggingface.co/spaces/lmz/candle-yolo): pose estimation and
  object recognition.
- [whisper](https://huggingface.co/spaces/lmz/candle-whisper): speech recognition.
- [LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2): text generation.
- [T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm): text generation.
- [Phi-1.5, and Phi-2](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm): text generation.
- [Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm): Image segmentation.
- [BLIP](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning): image captioning.

We also provide some command line based examples using state of the art models:

- [LLaMA v1, v2, and v3](./candle-examples/examples/llama/): general LLM, includes
  the SOLAR-10.7B variant.
- [Falcon](./candle-examples/examples/falcon/): general LLM.
- [Codegeex4](./candle-examples/examples/codegeex4-9b/): Code completion, code interpreter, web search, function calling, repository-level code Q&A.
- [GLM4](./candle-examples/examples/glm4/): Open Multilingual Multimodal Chat LMs by THUDM
- [Gemma v1 and v2](./candle-examples/examples/gemma/): 2b and 7b+/9b general LLMs from Google Deepmind.
- [RecurrentGemma](./candle-examples/examples/recurrent-gemma/): 2b and 7b
  Griffin based models from Google that mix attention with a RNN like state.
- [Phi-1, Phi-1.5, Phi-2, and Phi-3](./candle-examples/examples/phi/): 1.3b,
  2.7b, and 3.8b general LLMs with performance on par with 7b models.
- [StableLM-3B-4E1T](./candle-examples/examples/stable-lm/): a 3b general LLM
  pre-trained on 1T tokens of English and code datasets. Also supports
  StableLM-2, a 1.6b LLM trained on 2T tokens, as well as the code variants.
- [Mamba](./candle-examples/examples/mamba/): an inference only
  implementation of the Mamba state space model.
- [Mistral7b-v0.1](./candle-examples/examples/mistral/): a 7b general LLM with
  better performance than all publicly available 13b models as of 2023-09-28.
- [Mixtral8x7b-v0.1](./candle-examples/examples/mixtral/): a sparse mixture of
  experts 8x7b general LLM with better performance than a Llama 2 70B model with
  much faster inference.
- [StarCoder](./candle-examples/examples/bigcode/) and
  [StarCoder2](./candle-examples/examples/starcoder2/): LLM specialized to code generation.
- [Qwen1.5](./candle-examples/examples/qwen/): Bilingual (English/Chinese) LLMs.
- [RWKV v5 and v6](./candle-examples/examples/rwkv/): An RNN with transformer level LLM
  performance.
- [Replit-code-v1.5](./candle-examples/examples/replit-code/): a 3.3b LLM specialized for code completion.
- [Yi-6B / Yi-34B](./candle-examples/examples/yi/): two bilingual
  (English/Chinese) general LLMs with 6b and 34b parameters.
- [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
  the LLaMA model using the same quantization techniques as
  [llama.cpp](https://github.com/ggerganov/llama.cpp).
- [Quantized Qwen3 MoE](./candle-examples/examples/quantized-qwen3-moe/): support gguf quantized models of Qwen3 MoE models.

<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/quantized/assets/aoc.gif" width="600">
  
- [Stable Diffusion](./candle-examples/examples/stable-diffusion/): text to
  image generative model, support for the 1.5, 2.1, SDXL 1.0 and Turbo versions.

<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg" width="200">

- [Wuerstchen](./candle-examples/examples/wuerstchen/): another text to
  image generative model.

<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/wuerstchen/assets/cat.jpg" width="200">

- [yolo-v3](./candle-examples/examples/yolo-v3/) and
  [yolo-v8](./candle-examples/examples/yolo-v8/): object detection and pose
  estimation models.

<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/yolo-v8/assets/bike.od.jpg" width="200"><img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/yolo-v8/assets/bike.pose.jpg" width="200">
- [segment-anything](./candle-examples/examples/segment-anything/): image
  segmentation model with prompt.

<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/segment-anything/assets/sam_merged.jpg" width="200">

- [SegFormer](./candle-examples/examples/segformer/): transformer based semantic segmentation model.
- [Whisper](./candle-examples/examples/whisper/): speech recognition model.
- [EnCodec](./candle-examples/examples/encodec/): high-quality audio compression
  model using residual vector quantization.
- [MetaVoice](./candle-examples/examples/metavoice/): foundational model for
  text-to-speech.
- [Parler-TTS](./candle-examples/examples/parler-tts/): large text-to-speech
  model.
- [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/),
  [JinaBert](./candle-examples/examples/jina-bert/) : useful for sentence embeddings.
- [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
  using self-supervision (can be used for imagenet classification, depth
  evaluation, segmentation).
- [VGG](./candle-examples/examples/vgg/),
  [RepVGG](./candle-examples/examples/repvgg): computer vision models.
- [BLIP](./candle-examples/examples/blip/): image to text model, can be used to
  generate captions for an image.
- [CLIP](./candle-examples/examples/clip/): multi-modal vision and language
  model.
- [TrOCR](./candle-examples/examples/trocr/): a transformer OCR model, with
  dedicated submodels for hand-writing and printed recognition.
- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
  model, generates the translated text from the input text.
- [Moondream](./candle-examples/examples/moondream/): tiny computer-vision model 
  that can answer real-world questions about images.

Run them using commands like:
```
cargo run --example quantized --release
```

In order to use **CUDA** add `--features cuda` to the example command line. If
you have cuDNN installed, use `--features cudnn` for even more speedups.

There are also some wasm examples for whisper and
[llama2.c](https://github.com/karpathy/llama2.c). You can either build them with
`trunk` or try them online:
[whisper](https://huggingface.co/spaces/lmz/candle-whisper),
[llama2](https://huggingface.co/spaces/lmz/candle-llama2),
[T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
[Phi-1.5, and Phi-2](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm),
[Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).

For LLaMA2, run the following command to retrieve the weight files and start a
test server:
```bash
cd candle-wasm-examples/llama2-c
wget https://huggingface.co/spaces/lmz/candle-llama2/resolve/main/model.bin
wget https://huggingface.co/spaces/lmz/candle-llama2/resolve/main/tokenizer.json
trunk serve --release --port 8081
```
And then head over to
[http://localhost:8081/](http://localhost:8081/).

<!--- ANCHOR: useful_libraries --->

## Useful External Resources
- [`candle-tutorial`](https://github.com/ToluClassics/candle-tutorial): A
  very detailed tutorial showing how to convert a PyTorch model to Candle.
- [`candle-lora`](https://github.com/EricLBuehler/candle-lora): Efficient and
  ergonomic LoRA implementation for Candle. `candle-lora` has      
  out-of-the-box LoRA support for many models from Candle, which can be found
  [here](https://github.com/EricLBuehler/candle-lora/tree/master/candle-lora-transformers/examples).
- [`candle-video`](https://github.com/FerrisMind/candle-video): Rust library for text-to-video generation (LTX-Video and related models) built on Candle, focused on fast, Python-free inference.
- [`optimisers`](https://github.com/KGrewal1/optimisers): A collection of optimisers
  including SGD with momentum, AdaGrad, AdaDelta, AdaMax, NAdam, RAdam, and RMSprop.
- [`candle-vllm`](https://github.com/EricLBuehler/candle-vllm): Efficient platform for inference and
  serving local LLMs including an OpenAI compatible API server.
- [`candle-ext`](https://github.com/mokeyish/candle-ext): An extension library to Candle that provides PyTorch functions not currently available in Candle.
- [`candle-coursera-ml`](https://github.com/vishpat/candle-coursera-ml): Implementation of ML algorithms from Coursera's [Machine Learning Specialization](https://www.coursera.org/specializations/machine-learning-introduction) course.
- [`kalosm`](https://github.com/floneum/floneum/tree/master/interfaces/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
- [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
- [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.
- [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.
- [`atoma-infer`](https://github.com/atoma-network/atoma-infer): A Rust library for fast inference at scale, leveraging FlashAttention2 for efficient attention computation, PagedAttention for efficient KV-cache memory management, and multi-GPU support. It is OpenAI api compatible.
- [`llms-from-scratch-rs`](https://github.com/nerdai/llms-from-scratch-rs): A comprehensive Rust translation of the code from Sebastian Raschka's Build an LLM from Scratch book.
- [`vllm.rs`](https://github.com/guoqingbao/vllm.rs): A minimalist vLLM implementation in Rust based on Candle.

If you have an addition to this list, please submit a pull request.

<!--- ANCHOR_END: useful_libraries --->

<!--- ANCHOR: features --->

## Features

- Simple syntax, looks and feels like PyTorch.
    - Model training.
    - Embed user-defined ops/kernels, such as [flash-attention v2](https://github.com/huggingface/candle/blob/89ba005962495f2bfbda286e185e9c3c7f5300a3/candle-flash-attn/src/lib.rs#L152).
- Backends.
    - Optimized CPU backend with optional MKL support for x86 and Accelerate for macs.
    - CUDA backend for efficiently running on GPUs, multiple GPU distribution via NCCL.
    - WASM support, run your models in a browser.
- Included models.
    - Language Models.
        - LLaMA v1, v2, and v3 with variants such as SOLAR-10.7B.
        - Falcon.
        - StarCoder, StarCoder2.
        - Phi 1, 1.5, 2, and 3.
        - Mamba, Minimal Mamba
        - Gemma v1 2b and 7b+, v2 2b and 9b.
        - Mistral 7b v0.1.
        - Mixtral 8x7b v0.1.
        - StableLM-3B-4E1T, StableLM-2-1.6B, Stable-Code-3B.
        - Replit-code-v1.5-3B.
        - Bert.
        - Yi-6B and Yi-34B.
        - Qwen1.5, Qwen1.5 MoE, Qwen3 MoE.
        - RWKV v5 and v6.
    - Quantized LLMs.
        - Llama 7b, 13b, 70b, as well as the chat and code variants.
        - Mistral 7b, and 7b instruct.
        - Mixtral 8x7b.
        - Zephyr 7b a and b (Mistral-7b based).
        - OpenChat 3.5 (Mistral-7b based).
        - Qwen3 MoE (16B-A3B, 32B-A3B)
    - Text to text.
        - T5 and its variants: FlanT5, UL2, MADLAD400 (translation), CoEdit (Grammar correction).
        - Marian MT (Machine Translation).
    - Text to image.
        - Stable Diffusion v1.5, v2.1, XL v1.0.
        - Wuerstchen v2.
    - Image to text.
        - BLIP.
        - TrOCR.
    - Audio.
        - Whisper, multi-lingual speech-to-text.
        - EnCodec, audio compression model.
        - MetaVoice-1B, text-to-speech model.
        - Parler-TTS, text-to-speech model.
    - Computer Vision Models.
        - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
          ConvNeXTv2, MobileOne, EfficientVit (MSRA), MobileNetv4, Hiera, FastViT.
        - yolo-v3, yolo-v8.
        - Segment-Anything Model (SAM).
        - SegFormer.
- File formats: load models from safetensors, npz, ggml, or PyTorch files.
- Serverless (on CPU), small and fast deployments.
- Quantization support using the llama.cpp quantized types.

<!--- ANCHOR_END: features --->

## How to use

<!--- ANCHOR: cheatsheet --->
Cheatsheet:

|            | Using PyTorch                            | Using Candle                                                     |
|------------|------------------------------------------|------------------------------------------------------------------|
| Creation   | `torch.Tensor([[1, 2], [3, 4]])`         | `Tensor::new(&[[1f32, 2.], [3., 4.]], &Device::Cpu)?`           |
| Creation   | `torch.zeros((2, 2))`                    | `Tensor::zeros((2, 2), DType::F32, &Device::Cpu)?`               |
| Indexing   | `tensor[:, :4]`                          | `tensor.i((.., ..4))?`                                           |
| Operations | `tensor.view((2, 2))`                    | `tensor.reshape((2, 2))?`                                        |
| Operations | `a.matmul(b)`                            | `a.matmul(&b)?`                                                  |
| Arithmetic | `a + b`                                  | `&a + &b`                                                        |
| Device     | `tensor.to(device="cuda")`               | `tensor.to_device(&Device::new_cuda(0)?)?`                            |
| Dtype      | `tensor.to(dtype=torch.float16)`         | `tensor.to_dtype(DType::F16)?`                                   |
| Saving     | `torch.save({"A": A}, "model.bin")`      | `candle::safetensors::save(&HashMap::from([("A", A)]), "model.safetensors")?` |
| Loading    | `weights = torch.load("model.bin")`      | `candle::safetensors::load("model.safetensors", &device)`        |

<!--- ANCHOR_END: cheatsheet --->


## Structure

- [candle-core](./candle-core): Core ops, devices, and `Tensor` struct definition
- [candle-nn](./candle-nn/): Tools to build real models
- [candle-examples](./candle-examples/): Examples of using the library in realistic settings
- [candle-kernels](./candle-kernels/): CUDA custom kernels
- [candle-datasets](./candle-datasets/): Datasets and data loaders.
- [candle-transformers](./candle-transformers): transformers-related utilities.
- [candle-flash-attn](./candle-flash-attn): Flash attention v2 layer.
- [candle-onnx](./candle-onnx/): ONNX model evaluation.

## FAQ

### Why should I use Candle?

<!--- ANCHOR: goals --->

Candle's core goal is to *make serverless inference possible*. Full machine learning frameworks like PyTorch
are very large, which makes creating instances on a cluster slow. Candle allows deployment of lightweight
binaries.

Secondly, Candle lets you *remove Python* from production workloads. Python overhead can seriously hurt performance,
and the [GIL](https://www.backblaze.com/blog/the-python-gil-past-present-and-future/) is a notorious source of headaches.

Finally, Rust is cool! A lot of the HF ecosystem already has Rust crates, like [safetensors](https://github.com/huggingface/safetensors) and [tokenizers](https://github.com/huggingface/tokenizers).

<!--- ANCHOR_END: goals --->

### Other ML frameworks

- [dfdx](https://github.com/coreylowman/dfdx) is a formidable crate, with shapes being included
  in types. This prevents a lot of headaches by getting the compiler to complain about shape mismatches right off the bat.
  However, we found that some features still require nightly, and writing code can be a bit daunting for non-Rust experts.

  We're leveraging and contributing to other core crates for the runtime so hopefully both crates can benefit from each
  other.

- [burn](https://github.com/burn-rs/burn) is a general crate that can leverage multiple backends so you can choose the best
  engine for your workload.

- [tch-rs](https://github.com/LaurentMazare/tch-rs.git) provides bindings to the torch library in Rust. They are extremely
  versatile, but they bring the entire torch library into the runtime. The main contributor of `tch-rs` is also involved
  in the development of `candle`.

### Common Errors

#### Missing symbols when compiling with the mkl feature.

If you get some missing symbols when compiling binaries/tests using the mkl
or accelerate features, e.g. for mkl you get:
```
  = note: /usr/bin/ld: (....o): in function `blas::sgemm':
          .../blas-0.22.0/src/lib.rs:1944: undefined reference to `sgemm_' collect2: error: ld returned 1 exit status

  = note: some `extern` functions couldn't be found; some native libraries may need to be installed or have their path specified
  = note: use the `-l` flag to specify native libraries to link
  = note: use the `cargo:rustc-link-lib` directive to specify the native libraries to link with Cargo
```
or for accelerate:
```
Undefined symbols for architecture arm64:
            "_dgemm_", referenced from:
                candle_core::accelerate::dgemm::h1b71a038552bcabe in libcandle_core...
            "_sgemm_", referenced from:
                candle_core::accelerate::sgemm::h2cf21c592cba3c47 in libcandle_core...
          ld: symbol(s) not found for architecture arm64
```

This is likely due to a missing linker flag that was needed to enable the mkl library. You
can try adding the following for mkl at the top of your binary:
```rust
extern crate intel_mkl_src;
```
or for accelerate:
```rust
extern crate accelerate_src;
```

#### Cannot run the LLaMA examples: access to source requires login credentials

```
Error: request error: https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/tokenizer.json: status code 401
```

This is likely because you have not been granted access to the LLaMA-v2 model. To fix
this, you have to register on the huggingface-hub, accept the [LLaMA-v2 model
conditions](https://huggingface.co/meta-llama/Llama-2-7b-hf), and set up your
authentication token. See issue
[#350](https://github.com/huggingface/candle/issues/350) for more details.

#### Docker build

When building CUDA kernels inside a Dockerfile, nvidia-smi cannot be used to auto-detect compute capability.

You must explicitly set CUDA_COMPUTE_CAP, for example:

```
FROM nvidia/cuda:12.9.0-devel-ubuntu22.04

# Install git and curl
RUN set -eux; \
  apt-get update; \
  apt-get install -y curl git ca-certificates;

# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

# Clone candle repo
RUN git clone https://github.com/huggingface/candle.git

# Set compute capability for the build
ARG CUDA_COMPUTE_CAP=90
ENV CUDA_COMPUTE_CAP=${CUDA_COMPUTE_CAP}

# Build with explicit compute cap
WORKDIR /app
COPY . .
RUN cargo build --release --features cuda
```

#### Compiling with flash-attention fails

```
/usr/include/c++/11/bits/std_function.h:530:146: error: parameter packs not expanded with ‘...’:
```

This is a bug in gcc-11 triggered by the Cuda compiler. To fix this, install a different, supported gcc version - for example gcc-10, and specify the path to the compiler in the NVCC_CCBIN environment variable.
```
env NVCC_CCBIN=/usr/lib/gcc/x86_64-linux-gnu/10 cargo ...
```

#### Linking error on windows when running rustdoc or mdbook tests

```
Couldn't compile the test.
---- .\candle-book\src\inference\hub.md - Using_the_hub::Using_in_a_real_model_ (line 50) stdout ----
error: linking with `link.exe` failed: exit code: 1181
//very long chain of linking
 = note: LINK : fatal error LNK1181: cannot open input file 'windows.0.48.5.lib'
```

Make sure you link all native libraries that might be located outside a project target, e.g., to run mdbook tests, you should run:

```
mdbook test candle-book -L .\target\debug\deps\ `
-L native=$env:USERPROFILE\.cargo\registry\src\index.crates.io-6f17d22bba15001f\windows_x86_64_msvc-0.42.2\lib `
-L native=$env:USERPROFILE\.cargo\registry\src\index.crates.io-6f17d22bba15001f\windows_x86_64_msvc-0.48.5\lib
```

#### Extremely slow model load time with WSL

This may be caused by the models being loaded from `/mnt/c`, more details on
[stackoverflow](https://stackoverflow.com/questions/68972448/why-is-wsl-extremely-slow-when-compared-with-native-windows-npm-yarn-processing).

#### Tracking down errors

You can set `RUST_BACKTRACE=1` to be provided with backtraces when a candle
error is generated.

#### CudaRC error

If you encounter an error like ``called `Result::unwrap()` on an `Err` value: LoadLibraryExW { source: Os { code: 126, kind: Uncategorized, message: "The specified module could not be found." } }`` on Windows, copy and rename the following three files (make sure the copies are in your `PATH`). The exact paths depend on your CUDA version.

- `c:\Windows\System32\nvcuda.dll` -> `cuda.dll`
- `c:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cublas64_12.dll` -> `cublas.dll`
- `c:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\curand64_10.dll` -> `curand.dll`


================================================
FILE: candle-book/.gitignore
================================================
book


================================================
FILE: candle-book/CONTRIBUTING.md
================================================
# Candle Book

The book uses [mdBook](https://github.com/rust-lang/mdBook) for building.

## Installation

To install mdBook, run `cargo install mdbook`. More instructions can be found [here](https://rust-lang.github.io/mdBook/guide/installation.html).

## Viewing the book

To view the book, run `mdbook serve --open candle-book`. More instructions can be found [here](https://rust-lang.github.io/mdBook/guide/creating.html). 

The book is built automatically in github CI.

================================================
FILE: candle-book/Cargo.toml
================================================
[package]
name = "candle-book"
version.workspace = true
edition.workspace = true
description.workspace = true
repository.workspace = true
keywords.workspace = true
categories.workspace = true
license.workspace = true
readme = "README.md"

[dependencies]
accelerate-src = { workspace = true, optional = true }
candle = { workspace = true }
candle-datasets = { workspace = true }
candle-nn = { workspace = true }
candle-transformers = { workspace = true }
candle-flash-attn = { workspace = true, optional = true }
safetensors = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
num-traits = { workspace = true }
intel-mkl-src = { workspace = true, optional = true }
cudarc = { workspace = true, optional = true }
half = { workspace = true, optional = true }
image = { workspace = true, optional = true }
anyhow = { workspace = true }
tokio = "1.48.0"

[dev-dependencies]
byteorder = { workspace = true }
hf-hub = { workspace = true, features=["tokio"]}
clap = { workspace = true }
memmap2 = { workspace = true }
rand = { workspace = true }
tokenizers = { workspace = true, features = ["onig"] }
tracing = { workspace = true }
tracing-chrome = { workspace = true }
tracing-subscriber = { workspace = true }
# Necessary to disambiguate with tokio in wasm examples which are 1.28.1
parquet = { workspace = true }
image = { workspace = true }

[build-dependencies]
anyhow = { workspace = true }

[features]
default = []


================================================
FILE: candle-book/book.toml
================================================
[book]
authors = ["Nicolas Patry"]
language = "en"
multilingual = false
src = "src"
title = "Candle Documentation"


================================================
FILE: candle-book/src/README.md
================================================
# Introduction

{{#include ../../README.md:goals}}

{{#include ../../README.md:features}}

This book will introduce step by step how to use `candle`.

================================================
FILE: candle-book/src/SUMMARY.md
================================================
# Summary

[Introduction](README.md)

# User Guide

- [Installation](guide/installation.md)
- [Tutorial - MNIST](guide/mnist/intro.md)
  - [Modeling](guide/mnist/modeling.md)
  - [Training](guide/mnist/training.md)
  - [Saving And Loading](guide/mnist/saving_loading.md)
- [PyTorch cheatsheet](guide/cheatsheet.md)

# Reference Guide

- [Running a model](inference/inference.md)
    - [Using the hub](inference/hub.md)
- [Error management](error_manage.md)
- [Tracing](tracing.md)
- [Training](training/training.md)
    - [Simplified](training/simplified.md)
    - [MNIST](training/mnist.md)
    - [Fine-tuning]()
    - [Serialization]()
- [Advanced Cuda usage]()
    - [Writing a custom kernel]()
    - [Porting a custom kernel]()
- [Using MKL]()
- [Creating apps]()
    - [Creating a WASM app]()
    - [Creating a REST api webserver]()
    - [Creating a desktop Tauri app]()


================================================
FILE: candle-book/src/advanced/mkl.md
================================================
# Using MKL


================================================
FILE: candle-book/src/apps/README.md
================================================
# Creating apps


================================================
FILE: candle-book/src/apps/desktop.md
================================================
# Creating a desktop Tauri app


================================================
FILE: candle-book/src/apps/rest.md
================================================
# Creating a REST api webserver


================================================
FILE: candle-book/src/apps/wasm.md
================================================
# Creating a WASM app


================================================
FILE: candle-book/src/chapter_1.md
================================================
# Chapter 1


================================================
FILE: candle-book/src/cuda/README.md
================================================
# Advanced Cuda usage


================================================
FILE: candle-book/src/cuda/porting.md
================================================
# Porting a custom kernel


================================================
FILE: candle-book/src/cuda/writing.md
================================================
# Writing a custom kernel


================================================
FILE: candle-book/src/error_manage.md
================================================
# Error management

You might have seen in the code base a lot of `.unwrap()` or `?`.
If you're unfamiliar with Rust check out the [Rust book](https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html)
for more information.

What's important to know, though, is that if you want to find out *where* a particular operation failed,
you can simply set `RUST_BACKTRACE=1` to get the location where the model actually failed.

Let's look at some failing code:

```rust,ignore
// Both operands have shape (1, 784). A matrix product needs the last dimension of the
// left-hand side (784) to match the first dimension of the right-hand side (1), so the
// `matmul` call below is the one that returns the error.
let x = Tensor::zeros((1, 784), DType::F32, &device)?;
let y = Tensor::zeros((1, 784), DType::F32, &device)?;
let z = x.matmul(&y)?;
```

Will print at runtime:

```bash
Error: ShapeMismatchBinaryOp { lhs: [1, 784], rhs: [1, 784], op: "matmul" }
``` 


After adding `RUST_BACKTRACE=1`:


```bash
Error: WithBacktrace { inner: ShapeMismatchBinaryOp { lhs: [1, 784], rhs: [1, 784], op: "matmul" }, backtrace: Backtrace [{ fn: "candle::error::Error::bt", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/error.rs", line: 200 }, { fn: "candle::tensor::Tensor::matmul", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/tensor.rs", line: 816 }, { fn: "myapp::main", file: "./src/main.rs", line: 29 }, { fn: "core::ops::function::FnOnce::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 250 }, { fn: "std::sys_common::backtrace::__rust_begin_short_backtrace", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/sys_common/backtrace.rs", line: 135 }, { fn: "std::rt::lang_start::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 166 }, { fn: "core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 284 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: 
"/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::rt::lang_start", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 165 }, { fn: "main" }, { fn: "__libc_start_main" }, { fn: "_start" }] }
```

Not super pretty at the moment, but we can see that the error occurred at `{ fn: "myapp::main", file: "./src/main.rs", line: 29 }`.


Another thing to note is that, since Rust is compiled, it is not necessarily as easy to recover proper stacktraces,
especially in release builds. We're using [`anyhow`](https://docs.rs/anyhow/latest/anyhow/) for that.
The library is still young, so please [report](https://github.com/huggingface/candle/issues) any issues with detecting where an error is coming from.

## Cuda error management

When running a model on Cuda, you might get a stacktrace not really representing the error.
The reason is that CUDA is async by nature, and therefore the error might be caught while you were sending totally different kernels.

One way to avoid this is to use `CUDA_LAUNCH_BLOCKING=1` as an environment variable. This will force every kernel to be launched sequentially.
You might still, however, see the error surface in other kernels: the faulty kernel might exit without an error but corrupt some pointer, in which case the error will only show up when the corresponding `CudaSlice` is dropped.


If this occurs, you can use [`compute-sanitizer`](https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html).
This tool is like `valgrind` but for CUDA. It will help locate the errors in the kernels.




================================================
FILE: candle-book/src/guide/cheatsheet.md
================================================
# Pytorch cheatsheet

{{#include ../../../README.md:cheatsheet}}


================================================
FILE: candle-book/src/guide/hello_world.md
================================================
# Hello world!

We will now create the hello world of the ML world: building a model capable of solving the MNIST dataset.

Open `src/main.rs` and fill in this content:

```rust
# extern crate candle_core;
use candle_core::{Device, Result, Tensor};

/// A tiny two-layer perceptron stored as two raw weight matrices.
struct Model {
    first: Tensor,
    second: Tensor,
}

impl Model {
    /// Multiplies `image` by `first`, applies a ReLU, then multiplies by `second`.
    fn forward(&self, image: &Tensor) -> Result<Tensor> {
        let hidden = image.matmul(&self.first)?.relu()?;
        hidden.matmul(&self.second)
    }
}

fn main() -> Result<()> {
    // Replace with `Device::new_cuda(0)?` to run on the GPU.
    let device = Device::Cpu;

    // Randomly initialised weights: 784 -> 100 -> 10.
    let model = Model {
        first: Tensor::randn(0f32, 1.0, (784, 100), &device)?,
        second: Tensor::randn(0f32, 1.0, (100, 10), &device)?,
    };

    // A single random "image" flattened to 784 values.
    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
    let digit = model.forward(&dummy_image)?;

    println!("Digit {digit:?} digit");
    Ok(())
}
```

Everything should now run with:

```bash
cargo run --release
```

## Using a `Linear` layer.

Now that we have this, we might want to make things a bit more complex, for instance by adding a `bias` and creating
the classical `Linear` layer. We can do so as follows:

```rust
# extern crate candle_core;
# use candle_core::{Device, Result, Tensor};
/// A dense layer: a weight matrix plus a bias that is broadcast over the rows.
struct Linear {
    weight: Tensor,
    bias: Tensor,
}

impl Linear {
    /// Computes `x · weight` and adds `bias` with broadcasting.
    fn forward(&self, x: &Tensor) -> Result<Tensor> {
        x.matmul(&self.weight)?.broadcast_add(&self.bias)
    }
}

/// Two `Linear` layers with a ReLU in between.
struct Model {
    first: Linear,
    second: Linear,
}

impl Model {
    /// Applies `first`, a ReLU, then `second` to `image`.
    fn forward(&self, image: &Tensor) -> Result<Tensor> {
        let hidden = self.first.forward(image)?.relu()?;
        self.second.forward(&hidden)
    }
}
```

The code that builds and runs the model then becomes:

```rust
# extern crate candle_core;
# use candle_core::{Device, Result, Tensor};
# struct Linear{
#     weight: Tensor,
#     bias: Tensor,
# }
# impl Linear{
#     fn forward(&self, x: &Tensor) -> Result<Tensor> {
#         let x = x.matmul(&self.weight)?;
#         x.broadcast_add(&self.bias)
#     }
# }
# 
# struct Model {
#     first: Linear,
#     second: Linear,
# }
# 
# impl Model {
#     fn forward(&self, image: &Tensor) -> Result<Tensor> {
#         let x = self.first.forward(image)?;
#         let x = x.relu()?;
#         self.second.forward(&x)
#     }
# }
fn main() -> Result<()> {
    // `Device::new_cuda(0)?` selects the GPU explicitly and `Device::Cpu` the CPU;
    // `cuda_if_available` makes that choice for us.
    let device = Device::cuda_if_available(0)?;

    // Build a dummy model out of randomly initialised layers.
    let w1 = Tensor::randn(0f32, 1.0, (784, 100), &device)?;
    let b1 = Tensor::randn(0f32, 1.0, (100, ), &device)?;
    let w2 = Tensor::randn(0f32, 1.0, (100, 10), &device)?;
    let b2 = Tensor::randn(0f32, 1.0, (10, ), &device)?;
    let model = Model {
        first: Linear { weight: w1, bias: b1 },
        second: Linear { weight: w2, bias: b2 },
    };

    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;

    // Run inference on the dummy input.
    let digit = model.forward(&dummy_image)?;
    println!("Digit {digit:?} digit");
    Ok(())
}
```

Now it works. This is a great way to create your own layers.
But most of the classical layers are already implemented in [candle-nn](https://github.com/huggingface/candle/tree/main/candle-nn).

## Using `candle_nn`.

For instance [Linear](https://github.com/huggingface/candle/blob/main/candle-nn/src/linear.rs) is already there.
This `Linear` is coded with the PyTorch layout in mind, to make it easier to reuse existing models out there, so it uses the transpose of the weights and not the weights directly.

So instead we can simplify our example:

```bash
cargo add --git https://github.com/huggingface/candle.git candle-nn
```

And rewrite our example using it:

```rust
# extern crate candle_core;
# extern crate candle_nn;
use candle_core::{Device, Result, Tensor};
use candle_nn::{Linear, Module};

/// Same two-layer network, now built from `candle_nn::Linear`.
struct Model {
    first: Linear,
    second: Linear,
}

impl Model {
    /// Applies `first`, a ReLU, then `second` to `image`.
    fn forward(&self, image: &Tensor) -> Result<Tensor> {
        let activated = self.first.forward(image)?.relu()?;
        self.second.forward(&activated)
    }
}

fn main() -> Result<()> {
    // Replace with `Device::new_cuda(0)?` to run on the GPU.
    let device = Device::Cpu;

    // The weight shapes are transposed compared to the hand-written layer:
    // (784, 100) -> (100, 784) and (100, 10) -> (10, 100)!
    let w1 = Tensor::randn(0f32, 1.0, (100, 784), &device)?;
    let b1 = Tensor::randn(0f32, 1.0, (100, ), &device)?;
    let w2 = Tensor::randn(0f32, 1.0, (10, 100), &device)?;
    let b2 = Tensor::randn(0f32, 1.0, (10, ), &device)?;
    let model = Model {
        first: Linear::new(w1, Some(b1)),
        second: Linear::new(w2, Some(b2)),
    };

    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
    let digit = model.forward(&dummy_image)?;

    println!("Digit {digit:?} digit");
    Ok(())
}
```

Feel free to modify this example to use `Conv2d` to create a classical convnet instead.


Now that we have the running dummy code we can get to more advanced topics:

- [For PyTorch users](../guide/cheatsheet.md)
- [Running existing models](../inference/inference.md)
- [Training models](../training/training.md)




================================================
FILE: candle-book/src/guide/installation.md
================================================
# Installation

## 1. Create a new rust app or library

```bash
cargo new myapp
cd myapp
```

## 2. Add the correct candle version

### Standard

```bash
cargo add --git https://github.com/huggingface/candle.git candle-core
```

### CUDA

First, make sure that Cuda is correctly installed.
- `nvcc --version` should print information about your Cuda compiler driver.
- `nvidia-smi --query-gpu=compute_cap --format=csv` should print your GPU's compute capability, e.g. something
like:

```bash
compute_cap
8.9
```

You can also compile the Cuda kernels for a specific compute cap using the 
`CUDA_COMPUTE_CAP=<compute cap>` environment variable.

If any of the above commands errors out, please make sure to update your Cuda version.

Add the `candle-core` crate with the cuda feature:

```bash
cargo add --git https://github.com/huggingface/candle.git candle-core --features "cuda"
```

### MKL

You can also use the `mkl` feature, which can speed up inference on CPU.

Add the `candle-core` crate with the mkl feature:

```bash
cargo add --git https://github.com/huggingface/candle.git candle-core --features "mkl"
```

### Metal

Metal is exclusive to macOS.

Add the `candle-core` crate with the metal feature:

```bash
cargo add --git https://github.com/huggingface/candle.git candle-core --features "metal"
```

## 3. Building

Run `cargo build` to make sure everything can be correctly built.

```bash
cargo build
```


================================================
FILE: candle-book/src/guide/mnist/intro.md
================================================
# Candle MNIST Tutorial

## Introduction

This tutorial provides an introduction to Candle by implementing and training a neural network for MNIST digit classification from scratch. 

Throughout this tutorial, you will learn the basics of:

- Tensor operations and model construction
- Creating and implementing neural network layers
- Parameter initialization
- Training loop implementation
- Saving and loading trained models

## Getting Started

Before proceeding, please ensure that you have properly installed Candle by following the instructions in the [Installation](../installation.md) guide.

================================================
FILE: candle-book/src/guide/mnist/modeling.md
================================================
# Candle MNIST Tutorial

## Modeling

Open `src/main.rs` in your project folder and insert the following code:

```rust
use candle_core::{Device, Result, Tensor};

/// Minimal two-layer network: two bare weight tensors, no biases.
struct Model {
    first: Tensor,
    second: Tensor,
}

impl Model {
    /// Forward pass: `image · first`, ReLU, then `· second`.
    fn forward(&self, image: &Tensor) -> Result<Tensor> {
        let projected = image.matmul(&self.first)?;
        projected.relu()?.matmul(&self.second)
    }
}

fn main() -> Result<()> {
    // Switch to `Device::new_cuda(0)?` for GPU acceleration.
    let device = Device::Cpu;

    // Random weights mapping 784 inputs to 100 hidden units to 10 outputs.
    let model = Model {
        first: Tensor::randn(0f32, 1.0, (784, 100), &device)?,
        second: Tensor::randn(0f32, 1.0, (100, 10), &device)?,
    };

    // One random input row standing in for a flattened image.
    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
    let digit = model.forward(&dummy_image)?;

    println!("Digit {digit:?} digit");
    Ok(())
}
```

Execute the program with:

```bash
$ cargo run --release

> Digit Tensor[dims 1, 10; f32] digit
```

Since random inputs are provided, expect an incoherent output.

## Implementing a `Linear` Layer

To create a more sophisticated layer type, add a `bias` to the weight to construct the standard `Linear` layer.

Replace the entire content of `src/main.rs` with:

```rust
use candle_core::{Device, Result, Tensor};

/// Hand-written dense layer holding a weight matrix and a bias vector.
struct Linear {
    weight: Tensor,
    bias: Tensor,
}

impl Linear {
    /// Returns `x · weight + bias`, with the bias broadcast over the result.
    fn forward(&self, x: &Tensor) -> Result<Tensor> {
        let product = x.matmul(&self.weight)?;
        product.broadcast_add(&self.bias)
    }
}

/// Two `Linear` layers separated by a ReLU.
struct Model {
    first: Linear,
    second: Linear,
}

impl Model {
    /// Feeds `image` through `first`, a ReLU and `second`, in that order.
    fn forward(&self, image: &Tensor) -> Result<Tensor> {
        let hidden = self.first.forward(image)?.relu()?;
        self.second.forward(&hidden)
    }
}

fn main() -> Result<()> {
    // `Device::new_cuda(0)?` forces the GPU and `Device::Cpu` forces the CPU;
    // here the device is picked with `cuda_if_available`.
    let device = Device::cuda_if_available(0)?;

    // Initialize the parameters of both layers.
    let first_weight = Tensor::randn(0f32, 1.0, (784, 100), &device)?;
    let first_bias = Tensor::randn(0f32, 1.0, (100, ), &device)?;
    let second_weight = Tensor::randn(0f32, 1.0, (100, 10), &device)?;
    let second_bias = Tensor::randn(0f32, 1.0, (10, ), &device)?;

    let model = Model {
        first: Linear { weight: first_weight, bias: first_bias },
        second: Linear { weight: second_weight, bias: second_bias },
    };

    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;

    // Perform inference on the dummy input.
    let digit = model.forward(&dummy_image)?;
    println!("Digit {digit:?} digit");
    Ok(())
}
```

Execute again with:

```bash
$ cargo run --release

> Digit Tensor[dims 1, 10; f32] digit
```

## Utilizing `candle_nn`

Many classical layers (such as [Linear](https://github.com/huggingface/candle/blob/main/candle-nn/src/linear.rs)) are already implemented in [candle-nn](https://github.com/huggingface/candle/tree/main/candle-nn).

This `Linear` implementation follows PyTorch conventions for improved compatibility with existing models, utilizing the transpose of weights rather than direct weights.

Let's simplify our implementation. First, add `candle-nn` as a dependency:

```bash
$ cargo add --git https://github.com/huggingface/candle.git candle-nn
```

Now, replace the entire content of `src/main.rs` with:

```rust
use candle_core::{Device, Result, Tensor};
use candle_nn::{Linear, Module};

/// The same network, built from the `Linear` layer provided by `candle_nn`.
struct Model {
    first: Linear,
    second: Linear,
}

impl Model {
    /// Feeds `image` through `first`, a ReLU and `second`, in that order.
    fn forward(&self, image: &Tensor) -> Result<Tensor> {
        let activated = self.first.forward(image)?.relu()?;
        self.second.forward(&activated)
    }
}

fn main() -> Result<()> {
    // Switch to `Device::new_cuda(0)?` for GPU acceleration.
    let device = Device::Cpu;

    // Note the dimension change compared to the hand-written layer:
    // (784, 100) -> (100, 784) and (100, 10) -> (10, 100).
    let first_weight = Tensor::randn(0f32, 1.0, (100, 784), &device)?;
    let first_bias = Tensor::randn(0f32, 1.0, (100, ), &device)?;
    let second_weight = Tensor::randn(0f32, 1.0, (10, 100), &device)?;
    let second_bias = Tensor::randn(0f32, 1.0, (10, ), &device)?;

    let model = Model {
        first: Linear::new(first_weight, Some(first_bias)),
        second: Linear::new(second_weight, Some(second_bias)),
    };

    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
    let digit = model.forward(&dummy_image)?;

    println!("Digit {digit:?} digit");
    Ok(())
}
```

Execute the final version:

```bash
$ cargo run --release

> Digit Tensor[dims 1, 10; f32] digit
```

================================================
FILE: candle-book/src/guide/mnist/saving_loading.md
================================================
# Candle MNIST Tutorial

## Saving and Loading Models

After training a model, it is useful to save and subsequently load the model parameters. In Candle, this functionality is managed through the `VarMap` data structure, with parameters stored on disk using the [safetensors](https://huggingface.co/docs/safetensors/index) format.

### Saving Model Parameters

Let's modify our `training_loop` function to include functionality for saving weights:

```rust
/// Trains the model on the MNIST training split, reports loss and test accuracy
/// after every pass, and finally writes the parameters to `model_weights.safetensors`.
fn training_loop(
    m: candle_datasets::vision::Dataset,
) -> anyhow::Result<()> {
    let dev = Device::cuda_if_available(0)?;

    let train_labels = m.train_labels;
    let train_images = m.train_images.to_device(&dev)?;
    let train_labels = train_labels.to_dtype(DType::U32)?.to_device(&dev)?;

    // Initialize a VarMap for trainable parameters
    let varmap = VarMap::new();
    let vs = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
    let model = Model::new(vs.clone())?;

    let learning_rate = 0.05;
    let epochs = 10;

    // Initialize stochastic gradient descent optimizer
    let mut sgd = candle_nn::SGD::new(varmap.all_vars(), learning_rate)?;
    let test_images = m.test_images.to_device(&dev)?;
    let test_labels = m.test_labels.to_dtype(DType::U32)?.to_device(&dev)?;

    // `1..epochs` is a half-open range, so this performs `epochs - 1` (here 9) passes.
    for epoch in 1..epochs {
        // Standard MNIST forward pass
        let logits = model.forward(&train_images)?;
        let log_sm = ops::log_softmax(&logits, D::Minus1)?;

        // Compute Negative Log Likelihood loss
        let loss = loss::nll(&log_sm, &train_labels)?;

        // Perform backward pass and update weights
        sgd.backward_step(&loss)?;

        // Evaluate model on test set
        let test_logits = model.forward(&test_images)?;
        let sum_ok = test_logits
            .argmax(D::Minus1)?
            .eq(&test_labels)?
            .to_dtype(DType::F32)?
            .sum_all()?
            .to_scalar::<f32>()?;
        // Fraction of correct predictions, in the 0..=1 range.
        let test_accuracy = sum_ok / test_labels.dims1()? as f32;
        println!(
            "{epoch:4} train loss: {:8.5} test acc: {:5.2}%",
            loss.to_scalar::<f32>()?,
            // Scale the fraction so that it matches the `%` suffix.
            100. * test_accuracy
        );
    }

    // Save model weights to disk
    varmap.save("model_weights.safetensors")?;
    Ok(())
}
```

```bash
$ cargo run --release

> 1 train loss:  2.40485 test acc: 11.00%
> 2 train loss:  2.34161 test acc: 14.00%
> 3 train loss:  2.28841 test acc: 17.00%
> 4 train loss:  2.24158 test acc: 19.00%
> 5 train loss:  2.19898 test acc: 23.00%
> 6 train loss:  2.15927 test acc: 26.00%
> 7 train loss:  2.12161 test acc: 29.00%
> 8 train loss:  2.08549 test acc: 32.00%
> 9 train loss:  2.05053 test acc: 35.00%
```

### Loading Model Parameters

Now that we have saved our model parameters, we can modify the code to load them. The primary change required is to make the `varmap` variable mutable:

```rust
/// Same training loop as before, except that the parameters are first restored from
/// `model_weights.safetensors` and written back to that file once training is done.
fn training_loop(
    m: candle_datasets::vision::Dataset,
) -> anyhow::Result<()> {
    let dev = Device::cuda_if_available(0)?;

    let train_labels = m.train_labels;
    let train_images = m.train_images.to_device(&dev)?;
    let train_labels = train_labels.to_dtype(DType::U32)?.to_device(&dev)?;

    // Create a mutable VarMap for trainable parameters
    let mut varmap = VarMap::new();
    let vs = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
    let model = Model::new(vs.clone())?;

    // Load pre-trained weights from file (done after `Model::new` has created the variables)
    varmap.load("model_weights.safetensors")?;

    let learning_rate = 0.05;
    let epochs = 10;

    // Initialize stochastic gradient descent optimizer
    let mut sgd = candle_nn::SGD::new(varmap.all_vars(), learning_rate)?;
    let test_images = m.test_images.to_device(&dev)?;
    let test_labels = m.test_labels.to_dtype(DType::U32)?.to_device(&dev)?;

    // `1..epochs` is a half-open range, so this performs `epochs - 1` (here 9) passes.
    for epoch in 1..epochs {
        // Standard MNIST forward pass
        let logits = model.forward(&train_images)?;
        let log_sm = ops::log_softmax(&logits, D::Minus1)?;

        // Compute Negative Log Likelihood loss
        let loss = loss::nll(&log_sm, &train_labels)?;

        // Perform backward pass and update weights
        sgd.backward_step(&loss)?;

        // Evaluate model on test set
        let test_logits = model.forward(&test_images)?;
        let sum_ok = test_logits
            .argmax(D::Minus1)?
            .eq(&test_labels)?
            .to_dtype(DType::F32)?
            .sum_all()?
            .to_scalar::<f32>()?;
        // Fraction of correct predictions, in the 0..=1 range.
        let test_accuracy = sum_ok / test_labels.dims1()? as f32;
        println!(
            "{epoch:4} train loss: {:8.5} test acc: {:5.2}%",
            loss.to_scalar::<f32>()?,
            // Scale the fraction so that it matches the `%` suffix.
            100. * test_accuracy
        );
    }

    // Save updated weights back to disk
    varmap.save("model_weights.safetensors")?;
    Ok(())
}
```

```bash
$ cargo run --release

> 1 train loss:  2.01645 test acc: 38.00%
> 2 train loss:  1.98300 test acc: 41.00%
> 3 train loss:  1.95008 test acc: 44.00%
> 4 train loss:  1.91754 test acc: 47.00%
> 5 train loss:  1.88534 test acc: 50.00%
> 6 train loss:  1.85349 test acc: 53.00%
> 7 train loss:  1.82198 test acc: 56.00%
> 8 train loss:  1.79077 test acc: 59.00%
> 9 train loss:  1.75989 test acc: 61.00%
```

Note that loading the weights will fail if the specified file does not exist or is incompatible with the current model architecture. Implementing file existence checks and appropriate error handling is left to the user.

================================================
FILE: candle-book/src/guide/mnist/training.md
================================================
# Candle MNIST Tutorial

## Training Implementation

First, let's create a utility function `make_linear` that accepts a `VarBuilder` and returns an initialized linear layer. The `VarBuilder` is backed by a `VarMap`, which is the data structure that stores our trainable parameters.

```rust
use candle_core::{Device, Result, Tensor};
use candle_nn::{Linear, Module, VarBuilder, VarMap};

/// Creates a `Linear` layer mapping `in_dim` inputs to `out_dim` outputs, with its
/// `weight` and `bias` variables obtained through `vs`.
fn make_linear(vs: VarBuilder, in_dim: usize, out_dim: usize) -> Result<Linear> {
    // The bias is drawn uniformly from [-1/sqrt(in_dim), 1/sqrt(in_dim)].
    let bound = 1. / (in_dim as f64).sqrt();
    let bias_init = candle_nn::Init::Uniform {
        lo: -bound,
        up: bound,
    };
    let weight_init = candle_nn::init::DEFAULT_KAIMING_NORMAL;

    // The weight is stored as (out_dim, in_dim).
    let ws = vs.get_with_hints((out_dim, in_dim), "weight", weight_init)?;
    let bs = vs.get_with_hints(out_dim, "bias", bias_init)?;
    Ok(Linear::new(ws, Some(bs)))
}
```

Next, let's implement a `new` method for our model class to accept a `VarBuilder` and initialize the model. We use `VarBuilder::pp` to "push prefix" so that the parameter names are organized hierarchically: the first layer weights as `first.weight` and `first.bias`, and the second layer weights as `second.weight` and `second.bias`.

```rust
impl Model {
    /// Builds both layers, registering their parameters under the `first` and
    /// `second` prefixes of `vs`.
    fn new(vs: VarBuilder) -> Result<Self> {
        const IMAGE_DIM: usize = 784;
        const HIDDEN_DIM: usize = 100;
        const LABELS: usize = 10;

        Ok(Self {
            first: make_linear(vs.pp("first"), IMAGE_DIM, HIDDEN_DIM)?,
            second: make_linear(vs.pp("second"), HIDDEN_DIM, LABELS)?,
        })
    }

    /// Applies `first`, a ReLU, then `second` to `image`.
    fn forward(&self, image: &Tensor) -> Result<Tensor> {
        let hidden = self.first.forward(image)?.relu()?;
        self.second.forward(&hidden)
    }
}
```

Now, let's add the `candle-datasets` package to our project to access the MNIST dataset:

```bash
$ cargo add --git https://github.com/huggingface/candle.git candle-datasets
```

With the dataset available, we can implement our training loop:

```rust
use candle_core::{DType, Device, Result, Tensor, D};
use candle_nn::{loss, ops, Linear, Module, Optimizer, VarBuilder, VarMap};

/// Trains the model on the MNIST training split with plain SGD and prints the
/// training loss and the test accuracy after every pass.
fn training_loop(
    m: candle_datasets::vision::Dataset,
) -> anyhow::Result<()> {
    let dev = Device::cuda_if_available(0)?;

    let train_labels = m.train_labels;
    let train_images = m.train_images.to_device(&dev)?;
    let train_labels = train_labels.to_dtype(DType::U32)?.to_device(&dev)?;

    // Initialize a VarMap to store trainable parameters
    let varmap = VarMap::new();
    let vs = VarBuilder::from_varmap(&varmap, DType::F32, &dev);
    let model = Model::new(vs.clone())?;

    let learning_rate = 0.05;
    let epochs = 10;

    // Initialize a stochastic gradient descent optimizer to update parameters
    let mut sgd = candle_nn::SGD::new(varmap.all_vars(), learning_rate)?;
    let test_images = m.test_images.to_device(&dev)?;
    let test_labels = m.test_labels.to_dtype(DType::U32)?.to_device(&dev)?;

    // `1..epochs` is a half-open range, so this performs `epochs - 1` (here 9) passes.
    for epoch in 1..epochs {
        // Perform forward pass on MNIST data
        let logits = model.forward(&train_images)?;
        let log_sm = ops::log_softmax(&logits, D::Minus1)?;

        // Compute Negative Log Likelihood loss
        let loss = loss::nll(&log_sm, &train_labels)?;

        // Perform backward pass and update weights
        sgd.backward_step(&loss)?;

        // Evaluate model on test set
        let test_logits = model.forward(&test_images)?;
        let sum_ok = test_logits
            .argmax(D::Minus1)?
            .eq(&test_labels)?
            .to_dtype(DType::F32)?
            .sum_all()?
            .to_scalar::<f32>()?;
        // Fraction of correct predictions, in the 0..=1 range.
        let test_accuracy = sum_ok / test_labels.dims1()? as f32;
        println!(
            "{epoch:4} train loss: {:8.5} test acc: {:5.2}%",
            loss.to_scalar::<f32>()?,
            // Scale the fraction so that it matches the `%` suffix.
            100. * test_accuracy
        );
    }
    Ok(())
}
```

Finally, let's implement our main function:

```rust
/// Loads the MNIST dataset and hands it to `training_loop`.
pub fn main() -> anyhow::Result<()> {
    let m = candle_datasets::vision::mnist::load()?;
    training_loop(m)
}
```

Let's execute the training process:

```bash
$ cargo run --release

> 1 train loss:  2.35449 test acc: 12.00%
> 2 train loss:  2.30760 test acc: 15.00%
> ...
```

================================================
FILE: candle-book/src/inference/cuda/README.md
================================================
# Advanced Cuda usage


================================================
FILE: candle-book/src/inference/cuda/porting.md
================================================
# Porting a custom kernel


================================================
FILE: candle-book/src/inference/cuda/writing.md
================================================
# Writing a custom kernel


================================================
FILE: candle-book/src/inference/hub.md
================================================
# Using the hub

Install the [`hf-hub`](https://github.com/huggingface/hf-hub) crate:

```bash
cargo add hf-hub
```

Then let's start by downloading the [model file](https://huggingface.co/bert-base-uncased/tree/main).


```rust
# extern crate candle_core;
# extern crate hf_hub;
use hf_hub::api::sync::Api;
use candle_core::Device;

let api = Api::new().unwrap();
let repo = api.model("bert-base-uncased".to_string());

// Path of the `model.safetensors` file of that repo.
let weights_filename = repo.get("model.safetensors").unwrap();

// Unwrap the result so that `weights` is the loaded tensor map itself rather than a `Result`.
let weights = candle_core::safetensors::load(weights_filename, &Device::Cpu).unwrap();
```

We now have access to all the [tensors](https://huggingface.co/bert-base-uncased?show_tensors=true) within the file.

You can check all the names of the tensors [here](https://huggingface.co/bert-base-uncased?show_tensors=true)


## Using async 

`hf-hub` comes with an async API.

```bash
cargo add hf-hub --features tokio
```

```rust,ignore
# This is tested directly in examples crate because it needs external dependencies unfortunately:
# See [this](https://github.com/rust-lang/mdBook/issues/706)
{{#include ../lib.rs:book_hub_1}}
```


## Using in a real model.

Now that we have our weights, we can use them in our bert architecture:

```rust
# extern crate candle_core;
# extern crate candle_nn;
# extern crate hf_hub;
# use hf_hub::api::sync::Api;
# 
# let api = Api::new().unwrap();
# let repo = api.model("bert-base-uncased".to_string());
# 
# let weights = repo.get("model.safetensors").unwrap();
use candle_core::{DType, Device, Tensor};
use candle_nn::{Linear, Module};

let device = Device::Cpu;
let weights = candle_core::safetensors::load(weights, &device).unwrap();

// Look up the `query` weight and bias of encoder layer 0 by their tensor names.
let query_weight = weights
    .get("bert.encoder.layer.0.attention.self.query.weight")
    .unwrap()
    .clone();
let query_bias = weights
    .get("bert.encoder.layer.0.attention.self.query.bias")
    .unwrap()
    .clone();

let linear = Linear::new(query_weight, Some(query_bias));

// Dummy all-zero input with 768 features per row.
let input_ids = Tensor::zeros((3, 768), DType::F32, &device).unwrap();
let output = linear.forward(&input_ids).unwrap();
```

For a full reference, you can check out the full [bert](https://github.com/huggingface/candle/tree/main/candle-examples/examples/bert) example.

## Memory mapping

For more efficient loading, instead of reading the file, you could use [`memmap2`](https://docs.rs/memmap2/latest/memmap2/).

**Note**: Be careful with memory mapping: it seems to cause issues on [Windows, WSL](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/5893)
and will definitely be slower on network-mounted disks, because it will issue more read calls.

```rust,ignore
{{#include ../lib.rs:book_hub_2}}
```

**Note**: This operation is **unsafe**. [See the safety notice](https://docs.rs/memmap2/latest/memmap2/struct.Mmap.html#safety).
In practice model files should never be modified, and the mmaps should be mostly READONLY anyway, so the caveat most likely does not apply, but always keep it in mind.


## Tensor Parallel Sharding

When using multiple GPUs in Tensor Parallel mode in order to get good latency, you can load only the part of the tensor that each GPU needs.

For that you need to use [`safetensors`](https://crates.io/crates/safetensors) directly.

```bash
cargo add safetensors
```


```rust,ignore
{{#include ../lib.rs:book_hub_3}}
```


================================================
FILE: candle-book/src/inference/inference.md
================================================
# Running a model


In order to run an existing model, you will need to download and use existing weights.
Most models are already available on https://huggingface.co/ in [`safetensors`](https://github.com/huggingface/safetensors) format.

Let's get started by running an old model: `bert-base-uncased`.


================================================
FILE: candle-book/src/lib.rs
================================================
#[cfg(test)]
pub mod simplified;

#[cfg(test)]
mod tests {
    use anyhow::Result;
    use candle::{DType, Device, Tensor};
    use parquet::file::reader::SerializedFileReader;

    // NOTE: Waiting on https://github.com/rust-lang/mdBook/pull/1856
    /// Async hub download example.
    ///
    /// The lines between the `ANCHOR` / `ANCHOR_END` markers are pulled into
    /// `inference/hub.md` through `{{#include ../lib.rs:book_hub_1}}`, so they are
    /// presumably left unindented (hence `rustfmt::skip`) to render as a top-level
    /// snippet in the book; comments added inside that region would show up there too.
    #[rustfmt::skip]
    #[tokio::test]
    async fn book_hub_1() {
// ANCHOR: book_hub_1
use candle::Device;
use hf_hub::api::tokio::Api;

let api = Api::new().unwrap();
let repo = api.model("bert-base-uncased".to_string());

let weights_filename = repo.get("model.safetensors").await.unwrap();

let weights = candle::safetensors::load(weights_filename, &Device::Cpu).unwrap();
// ANCHOR_END: book_hub_1
        // Checks the number of tensors loaded from the file (kept outside the anchored region).
        assert_eq!(weights.len(), 206);
    }

    /// Memory-mapped loading example (`book_hub_2`) followed by the tensor-parallel
    /// sharding example (`book_hub_3`).
    ///
    /// The `book_hub_3` function header below is commented out, so its body is a second
    /// nested block of this function rather than a test of its own. Each anchored region
    /// is included in `inference/hub.md` via `{{#include ../lib.rs:<anchor>}}`, so any
    /// comment placed between `ANCHOR` and `ANCHOR_END` ends up in the book.
    #[rustfmt::skip]
    #[test]
    fn book_hub_2() {
        // First nested block: scope of the `book_hub_2` snippet.
        {
// ANCHOR: book_hub_2
use candle::Device;
use hf_hub::api::sync::Api;
use memmap2::Mmap;
use std::fs;

let api = Api::new().unwrap();
let repo = api.model("bert-base-uncased".to_string());
let weights_filename = repo.get("model.safetensors").unwrap();

let file = fs::File::open(weights_filename).unwrap();
let mmap = unsafe { Mmap::map(&file).unwrap() };
let weights = candle::safetensors::load_buffer(&mmap[..], &Device::Cpu).unwrap();
// ANCHOR_END: book_hub_2
        // Checks the number of tensors loaded from the mapped buffer.
        assert_eq!(weights.len(), 206);
    }

    // The header of `book_hub_3` is disabled; the block that follows still runs as
    // part of `book_hub_2`.
    // #[rustfmt::skip]
    // #[test]
    // fn book_hub_3() {
    {
// ANCHOR: book_hub_3
use candle::{DType, Device, Tensor};
use hf_hub::api::sync::Api;
use memmap2::Mmap;
use safetensors::slice::IndexOp;
use safetensors::SafeTensors;
use std::fs;

let api = Api::new().unwrap();
let repo = api.model("bert-base-uncased".to_string());
let weights_filename = repo.get("model.safetensors").unwrap();

let file = fs::File::open(weights_filename).unwrap();
let mmap = unsafe { Mmap::map(&file).unwrap() };

// Use safetensors directly
let tensors = SafeTensors::deserialize(&mmap[..]).unwrap();
let view = tensors
    .tensor("bert.encoder.layer.0.attention.self.query.weight")
    .unwrap();

// We're going to load shard with rank 1, within a world_size of 4
// We're going to split along dimension 0 doing VIEW[start..stop, :]
let rank = 1;
let world_size = 4;
let dim = 0;
let dtype = view.dtype();
let mut tp_shape = view.shape().to_vec();
let size = tp_shape[0];

if size % world_size != 0 {
    panic!("The dimension is not divisible by `world_size`");
}
let block_size = size / world_size;
let start = rank * block_size;
let stop = (rank + 1) * block_size;

// Everything is expressed in tensor dimension
// bytes offsets is handled automatically for safetensors.

let iterator = view.slice(start..stop).unwrap();

tp_shape[dim] = block_size;

// Convert safetensors Dtype to candle DType
let dtype: DType = dtype.try_into().unwrap();

// TODO: Implement from_buffer_iterator so we can skip the extra CPU alloc.
let raw: Vec<u8> = iterator.into_iter().flatten().cloned().collect();
let tp_tensor = Tensor::from_raw_buffer(&raw, dtype, &tp_shape, &Device::Cpu).unwrap();
// ANCHOR_END: book_hub_3
        // The full tensor is 768 x 768; one of 4 shards along dimension 0 is 192 x 768.
        assert_eq!(view.shape(), &[768, 768]);
        assert_eq!(tp_tensor.dims(), &[192, 768]);
    }
}

    // Book snippets `book_training_1`, `book_training_2` and `book_training_3`.
    //
    // This function carries no `#[test]` attribute (hence `#[allow(unused)]`).
    // It downloads the MNIST parquet files from the hub, prints their columns,
    // and finally decodes them into a `candle_datasets::vision::Dataset`.
    #[allow(unused)]
    #[rustfmt::skip]
    fn book_training_1() -> Result<()>{
// ANCHOR: book_training_1
use hf_hub::{api::sync::Api, Repo, RepoType};

let dataset_id = "mnist".to_string();

let api = Api::new()?;
let repo = Repo::with_revision(
    dataset_id,
    RepoType::Dataset,
    "refs/convert/parquet".to_string(),
);
let repo = api.repo(repo);
let test_parquet_filename = repo.get("mnist/test/0000.parquet")?;
let train_parquet_filename = repo.get("mnist/train/0000.parquet")?;
let test_parquet = SerializedFileReader::new(std::fs::File::open(test_parquet_filename)?)?;
let train_parquet = SerializedFileReader::new(std::fs::File::open(train_parquet_filename)?)?;
// ANCHOR_END: book_training_1
// Ignore unused
let _train = train_parquet;
// ANCHOR: book_training_2
for row in test_parquet {
    for (idx, (name, field)) in row?.get_column_iter().enumerate() {
        println!("Column id {idx}, name {name}, value {field}");
    }
}
// ANCHOR_END: book_training_2
// Both readers were moved above (`test_parquet` by the loop, `train_parquet`
// into `_train`), so they are re-created here before the next snippet.
let test_parquet_filename = repo.get("mnist/test/0000.parquet")?;
let train_parquet_filename = repo.get("mnist/train/0000.parquet")?;
let test_parquet = SerializedFileReader::new(std::fs::File::open(test_parquet_filename)?)?;
let train_parquet = SerializedFileReader::new(std::fs::File::open(train_parquet_filename)?)?;
// ANCHOR: book_training_3

// Test split: 784 bytes per image (presumably 28x28 pixels) and one label byte per sample.
let test_samples = 10_000;
let mut test_buffer_images: Vec<u8> = Vec::with_capacity(test_samples * 784);
let mut test_buffer_labels: Vec<u8> = Vec::with_capacity(test_samples);
for row in test_parquet{
    for (_name, field) in row?.get_column_iter() {
        // A `Group` field holds the encoded image bytes; a `Long` field holds the label.
        if let parquet::record::Field::Group(subrow) = field {
            for (_name, field) in subrow.get_column_iter() {
                if let parquet::record::Field::Bytes(value) = field {
                    // Decode the stored image and append its 8-bit grayscale pixels.
                    let image = image::load_from_memory(value.data()).unwrap();
                    test_buffer_images.extend(image.to_luma8().as_raw());
                }
            }
        }else if let parquet::record::Field::Long(label) = field {
            test_buffer_labels.push(*label as u8);
        }
    }
}
// Pixels are converted to F32 and divided by 255.
let test_images = (Tensor::from_vec(test_buffer_images, (test_samples, 784), &Device::Cpu)?.to_dtype(DType::F32)? / 255.)?;
let test_labels = Tensor::from_vec(test_buffer_labels, (test_samples, ), &Device::Cpu)?;

// Train split: same decoding as above, with 60_000 samples.
let train_samples = 60_000;
let mut train_buffer_images: Vec<u8> = Vec::with_capacity(train_samples * 784);
let mut train_buffer_labels: Vec<u8> = Vec::with_capacity(train_samples);
for row in train_parquet{
    for (_name, field) in row?.get_column_iter() {
        if let parquet::record::Field::Group(subrow) = field {
            for (_name, field) in subrow.get_column_iter() {
                if let parquet::record::Field::Bytes(value) = field {
                    let image = image::load_from_memory(value.data()).unwrap();
                    train_buffer_images.extend(image.to_luma8().as_raw());
                }
            }
        }else if let parquet::record::Field::Long(label) = field {
            train_buffer_labels.push(*label as u8);
        }
    }
}
let train_images = (Tensor::from_vec(train_buffer_images, (train_samples, 784), &Device::Cpu)?.to_dtype(DType::F32)? / 255.)?;
let train_labels = Tensor::from_vec(train_buffer_labels, (train_samples, ), &Device::Cpu)?;

let mnist = candle_datasets::vision::Dataset {
    train_images,
    train_labels,
    test_images,
    test_labels,
    labels: 10,
};

// ANCHOR_END: book_training_3
// Sanity-check the shapes of the assembled dataset.
assert_eq!(mnist.test_images.dims(), &[10_000, 784]);
assert_eq!(mnist.test_labels.dims(), &[10_000]);
assert_eq!(mnist.train_images.dims(), &[60_000, 784]);
assert_eq!(mnist.train_labels.dims(), &[60_000]);
Ok(())
    }
}


================================================
FILE: candle-book/src/simplified.rs
================================================
//! # A simplified example in Rust of training a neural network and then using it, based on the Candle framework by Hugging Face.
//! Author: Evgeny Igumnov 2023 igumnovnsk@gmail.com
//! This program implements a neural network to predict the winner of the second round of elections based on the results of the first round.
//!
//! ## Key points:
//!
//! A multilayer perceptron with two hidden layers is used. The first hidden layer has 4 neurons, the second has 2 neurons.
//! The input is a vector of 2 numbers - the percentage of votes for the first and second candidates in the first stage.
//! The output is the number 0 or 1, where 1 means that the first candidate will win in the second stage, 0 means that he will lose.
//! For training, samples with real data on the results of the first and second stages of different elections are used.
//! The model is trained by backpropagation using gradient descent and the cross-entropy loss function.
//! Model parameters (weights of neurons) are initialized randomly, then optimized during training.
//! After training, the model is tested on a deferred sample to evaluate the accuracy.
//! If the accuracy on the test set is below 100%, the model is considered underfit and the learning process is repeated.
//! Thus, this neural network learns to find hidden relationships between the results of the first and second rounds of voting in order to make predictions for new data.

#[rustfmt::skip]
mod tests {

use candle::{DType, Result, Tensor, D, Device};
use candle_nn::{loss, ops, Linear, Module, VarBuilder, VarMap, Optimizer};

// ANCHOR: book_training_simplified1
/// Number of input values per sample: the input width of the first linear
/// layer and the column count used when shaping the vote tensors.
const VOTE_DIM: usize = 2;
/// The last linear layer produces `RESULTS + 1` logits.
const RESULTS: usize = 1;
/// Maximum number of training iterations performed by one call to `train`.
const EPOCHS: usize = 10;
/// Output width of the first linear layer.
const LAYER1_OUT_SIZE: usize = 4;
/// Output width of the second linear layer.
const LAYER2_OUT_SIZE: usize = 2;
/// Learning rate passed to the SGD optimizer.
const LEARNING_RATE: f64 = 0.05;

/// Training and evaluation tensors handed to `train`.
///
/// As built by the `simplified` test, the `*_votes` tensors are F32 with
/// `VOTE_DIM` columns and the `*_results` tensors hold one `u32` label per row.
#[derive(Clone)]
pub struct Dataset {
    /// Inputs used to fit the model.
    pub train_votes: Tensor,
    /// Expected labels for `train_votes`.
    pub train_results: Tensor,
    /// Inputs used to measure accuracy after each epoch.
    pub test_votes: Tensor,
    /// Expected labels for `test_votes`.
    pub test_results: Tensor,
}

/// Feed-forward network made of three linear layers applied in sequence
/// (`ln1`, then `ln2`, then `ln3`).
struct MultiLevelPerceptron {
    // VOTE_DIM -> LAYER1_OUT_SIZE
    ln1: Linear,
    // LAYER1_OUT_SIZE -> LAYER2_OUT_SIZE
    ln2: Linear,
    // LAYER2_OUT_SIZE -> RESULTS + 1
    ln3: Linear,
}

impl MultiLevelPerceptron {
    /// Creates the three linear layers, registering their parameters under the
    /// `ln1`, `ln2` and `ln3` prefixes of `vs`.
    fn new(vs: VarBuilder) -> Result<Self> {
        // Struct fields are evaluated in the order written, so the layers are
        // created as ln1, ln2, ln3.
        Ok(Self {
            ln1: candle_nn::linear(VOTE_DIM, LAYER1_OUT_SIZE, vs.pp("ln1"))?,
            ln2: candle_nn::linear(LAYER1_OUT_SIZE, LAYER2_OUT_SIZE, vs.pp("ln2"))?,
            ln3: candle_nn::linear(LAYER2_OUT_SIZE, RESULTS + 1, vs.pp("ln3"))?,
        })
    }

    /// Runs `xs` through the network: linear -> ReLU -> linear -> ReLU -> linear.
    /// The output of the last layer is returned as-is (no activation).
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let hidden1 = self.ln1.forward(xs)?.relu()?;
        let hidden2 = self.ln2.forward(&hidden1)?.relu()?;
        self.ln3.forward(&hidden2)
    }
}

// ANCHOR_END: book_training_simplified1



// ANCHOR: book_training_simplified3
#[tokio::test]
async fn simplified() -> anyhow::Result<()> {
    let dev = Device::cuda_if_available(0)?;

    // Training inputs: VOTE_DIM values per row.
    let train_votes_vec: Vec<u32> = vec![
        15, 10,
        10, 15,
        5, 12,
        30, 20,
        16, 12,
        13, 25,
        6, 14,
        31, 21,
    ];
    let train_rows = train_votes_vec.len() / VOTE_DIM;
    let train_votes = Tensor::from_vec(train_votes_vec, (train_rows, VOTE_DIM), &dev)?
        .to_dtype(DType::F32)?;

    // One expected label per training row.
    let train_results_vec: Vec<u32> = vec![1, 0, 0, 1, 1, 0, 0, 1];
    let train_results = Tensor::from_vec(train_results_vec, train_rows, &dev)?;

    // Held-out inputs used to measure accuracy during training.
    let test_votes_vec: Vec<u32> = vec![
        13, 9,
        8, 14,
        3, 10,
    ];
    let test_rows = test_votes_vec.len() / VOTE_DIM;
    let test_votes = Tensor::from_vec(test_votes_vec, (test_rows, VOTE_DIM), &dev)?
        .to_dtype(DType::F32)?;

    let test_results_vec: Vec<u32> = vec![1, 0, 0];
    let test_result_count = test_results_vec.len();
    let test_results = Tensor::from_vec(test_results_vec, test_result_count, &dev)?;

    let m = Dataset {
        train_votes,
        train_results,
        test_votes,
        test_results,
    };

    // Keep training from scratch until `train` succeeds. Every error returned
    // by `train` (not only insufficient accuracy) is printed and retried.
    let trained_model: MultiLevelPerceptron = loop {
        println!("Trying to train neural network.");
        match train(m.clone(), &dev) {
            Ok(model) => break model,
            Err(e) => println!("Error: {}", e),
        }
    };

    // Run the trained model on a single unseen sample.
    let real_world_votes: Vec<u32> = vec![13, 22];
    let tensor_test_votes = Tensor::from_vec(real_world_votes.clone(), (1, VOTE_DIM), &dev)?
        .to_dtype(DType::F32)?;
    let final_result = trained_model.forward(&tensor_test_votes)?;

    // The predicted class is the index of the largest logit of the only row.
    let predicted = final_result.argmax(D::Minus1)?.to_dtype(DType::F32)?;
    let result = predicted.get(0)?.to_scalar::<f32>()?;
    println!("real_life_votes: {:?}", real_world_votes);
    println!("neural_network_prediction_result: {:?}", result);

    Ok(())
}
// ANCHOR_END: book_training_simplified3

// ANCHOR: book_training_simplified2
fn train(m: Dataset, dev: &Device) -> anyhow::Result<MultiLevelPerceptron> {
    let train_results = m.train_results.to_device(dev)?;
    let train_votes = m.train_votes.to_device(dev)?;
    let varmap = VarMap::new();
    let vs = VarBuilder::from_varmap(&varmap, DType::F32, dev);
    let model = MultiLevelPerceptron::new(vs.clone())?;
    let mut sgd = candle_nn::SGD::new(varmap.all_vars(), LEARNING_RATE)?;
    let test_votes = m.test_votes.to_device(dev)?;
    let test_results = m.test_results.to_device(dev)?;
    let mut final_accuracy: f32 = 0.0;
    for epoch in 1..EPOCHS + 1 {
        let logits = model.forward(&train_votes)?;
        let log_sm = ops::log_softmax(&logits, D::Minus1)?;
        let loss = loss::nll(&log_sm, &train_results)?;
        sgd.backward_step(&loss)?;

        let test_logits = model.forward(&test_votes)?;
        let sum_ok = test_logits
            .argmax(D::Minus1)?
            .eq(&test_results)?
            .to_dtype(DType::F32)?
            .sum_all()?
            .to_scalar::<f32>()?;
        let test_accuracy = sum_ok / test_results.dims1()? as f32;
        final_accuracy = 100. * test_accuracy;
        println!("Epoch: {epoch:3} Train loss: {:8.5} Test accuracy: {:5.2}%",
                 loss.to_scalar::<f32>()?,
                 final_accuracy
        );
        if final_accuracy == 100.0 {
            break;
        }
    }
    if final_accuracy < 100.0 {
        Err(anyhow::Error::msg("The model is not trained well enough."))
    } else {
        Ok(model)
    }
}
// ANCHOR_END: book_training_simplified2


}


================================================
FILE: candle-book/src/tracing.md
================================================
# Tracing

Tracing is a powerful tool for identifying performance issues and bottlenecks in code.

> Profiling on GPUs is trickier due to asynchronous execution, see the [GPU section](#gpu).

## Overview

Candle uses the [tracing](https://docs.rs/tracing/latest/tracing/) crate for instrumentation.

To try it out, run an example in `candle-examples` with the `--tracing` flag. 
This generates a trace file, typically named `trace-<timestamp>.json`. 
You can view the trace in Chrome by navigating to `chrome://tracing/`, clicking **Load**, and selecting the generated trace file.

## Adding Tracing

Candle includes built-in tracing for many internal operations, using [spans](https://docs.rs/tracing/latest/tracing/struct.Span.html) to mark key points of execution.

To add custom tracing in your code, you can define a span like this:

```rust
let span = tracing::span!(tracing::Level::TRACE, name);
```

Then, to record the span during execution, create a guard:

```rust
let _enter = span.enter();
```

This guard will record the span's duration, from when it is created to when it is dropped, into a global data structure managed by the tracing crate.

## Recording and Saving a Trace

To capture and save trace data, you need to configure the tracing system with an output format. Candle uses the [tracing_subscriber](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/) and [tracing_chrome](https://docs.rs/tracing-chrome/latest/tracing_chrome/) crates.

The snippet below sets up a Chrome compatible recorder that logs all tracing activity between creation and drop of the guard:

```rust
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;

let _guard = {
    let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
    tracing_subscriber::registry().with(chrome_layer).init();
    guard
};
```

## GPU

When using CUDA, Metal, or other asynchronous GPU backends, tracing may produce misleading timing data because operations are queued rather than executed immediately.

### CUDA

For CUDA-specific profiling, you have two options:

1. Set the environment variable `CUDA_LAUNCH_BLOCKING=1` which forces synchronous execution. This makes trace timings more accurate, at the cost of reduced performance.
2. Use [NVIDIA's Nsight Systems](https://developer.nvidia.com/nsight-systems) (`nsys profile` and `nsys-ui`) which are designed specifically for profiling asynchronous CUDA executions.

We recommend using NVIDIA's Nsight Systems when possible, as it offers accurate performance data without altering typical execution patterns. In contrast, setting the `CUDA_LAUNCH_BLOCKING` environment variable forces synchronous execution, which can significantly alter execution behavior.

#### Performance Profiling with NVIDIA Nsight Systems

1. Generate an `.nsys-rep` file containing performance data ([docs](https://docs.nvidia.com/nsight-systems/UserGuide/index.html#example-single-command-lines))
   - Run `nsys profile --trace cuda,nvtx,osrt --gpu-metrics-device=all --output profile_run ./target/debug/... --prompt "whatever "`
1. Open the generated `.nsys-rep` report file in Nsight Systems GUI
    - File > Open

================================================
FILE: candle-book/src/training/finetuning.md
================================================
# Fine-tuning


================================================
FILE: candle-book/src/training/mnist.md
================================================
# MNIST

Now that we have downloaded the MNIST parquet files, let's put them in a simple struct.

```rust,ignore
{{#include ../lib.rs:book_training_3}}
```

Parsing the files and putting them into single tensors requires the entire dataset to fit in memory.
It is quite rudimentary, but simple enough for a small dataset like MNIST.


================================================
FILE: candle-book/src/training/serialization.md
================================================
# Serialization


================================================
FILE: candle-book/src/training/simplified.md
================================================
# Simplified

## How it works

This program implements a neural network to predict the winner of the second round of elections based on the results of the first round.

Key points:

1. A multilayer perceptron with two hidden layers is used. The first hidden layer has 4 neurons, the second has 2 neurons.
2. The input is a vector of 2 numbers - the percentage of votes for the first and second candidates in the first stage.
3. The output is the number 0 or 1, where 1 means that the first candidate will win in the second stage, 0 means that he will lose.
4. For training, samples with real data on the results of the first and second stages of different elections are used.
5. The model is trained by backpropagation using gradient descent and the cross-entropy loss function.
6. Model parameters (weights of neurons) are initialized randomly, then optimized during training.
7. After training, the model is tested on a deferred sample to evaluate the accuracy.
8. If the accuracy on the test set is below 100%, the model is considered underfit and the learning process is repeated.

Thus, this neural network learns to find hidden relationships between the results of the first and second rounds of voting in order to make predictions for new data.


```rust,ignore
{{#include ../simplified.rs:book_training_simplified1}}
```

```rust,ignore
{{#include ../simplified.rs:book_training_simplified2}}
```

```rust,ignore
{{#include ../simplified.rs:book_training_simplified3}}
```


## Example output

```bash
Trying to train neural network.
Epoch:   1 Train loss:  4.42555 Test accuracy:  0.00%
Epoch:   2 Train loss:  0.84677 Test accuracy: 33.33%
Epoch:   3 Train loss:  2.54335 Test accuracy: 33.33%
Epoch:   4 Train loss:  0.37806 Test accuracy: 33.33%
Epoch:   5 Train loss:  0.36647 Test accuracy: 100.00%
real_life_votes: [13, 22]
neural_network_prediction_result: 0.0
```


================================================
FILE: candle-book/src/training/training.md
================================================
# Training


Training starts with data. We're going to use the huggingface hub and 
start with the Hello world dataset of machine learning, MNIST.

Let's start with downloading `MNIST` from [huggingface](https://huggingface.co/datasets/mnist).

This requires [`hf-hub`](https://github.com/huggingface/hf-hub).
```bash
cargo add hf-hub
```

This is going to be very hands-on for now.

```rust,ignore
{{#include ../../../candle-examples/src/lib.rs:book_training_1}}
```

This uses the standardized `parquet` files from the `refs/convert/parquet` branch on every dataset.
Our handles are now [`parquet::file::serialized_reader::SerializedFileReader`].

We can inspect the content of the files with:

```rust,ignore
{{#include ../../../candle-examples/src/lib.rs:book_training_2}}
```

You should see something like:

```bash
Column id 1, name label, value 6
Column id 0, name image, value {bytes: [137, ....]
Column id 1, name label, value 8
Column id 0, name image, value {bytes: [137, ....]
```

So each row contains 2 columns (image, label) with image being saved as bytes.
Let's put them into a useful struct.


================================================
FILE: candle-core/Cargo.toml
================================================
[package]
name = "candle-core"
version.workspace = true
edition.workspace = true
description.workspace = true
repository.workspace = true
keywords.workspace = true
categories.workspace = true
license.workspace = true
readme = "README.md"

[dependencies]
accelerate-src = { workspace = true, optional = true }
byteorder = { workspace = true }
candle-kernels = { workspace = true, optional = true }
candle-metal-kernels = { workspace = true, optional = true }
objc2-metal = { workspace = true, optional = true }
objc2-foundation = { workspace = true, optional = true }
cudarc = { workspace = true, optional = true }
gemm = { workspace = true }
half = { workspace = true }
float8 = { workspace = true }
intel-mkl-src = { workspace = true, optional = true }
libc = { workspace = true, optional = true }
libm = { workspace = true }
memmap2 = { workspace = true }
num-traits = { workspace = true }
num_cpus = { workspace = true }
rand = { workspace = true }
rand_distr = { workspace = true }
rayon = { workspace = true }
safetensors = { workspace = true }
thiserror = { workspace = true }
yoke = { workspace = true }
zip = { workspace = true }
tokenizers = { workspace = true, features = ["onig"] }

[target.'cfg(all(not(target_arch = "wasm32"), not(target_os = "ios")))'.dependencies]
candle-ug = { workspace = true, optional = true }

[dev-dependencies]
anyhow = { workspace = true }
clap = { workspace = true }
criterion = { workspace = true }

[features]
default = []
cuda = ["cudarc", "dep:candle-kernels", "candle-ug?/cuda"]
cudnn = ["cuda", "cudarc/cudnn"]
nccl = ["cuda", "cudarc/nccl"]
mkl = ["dep:libc", "dep:intel-mkl-src"]
accelerate = ["dep:libc", "dep:accelerate-src"]
metal = [
    "dep:objc2-metal",
    "dep:objc2-foundation",
    "dep:candle-metal-kernels",
    "candle-ug?/metal",
]
ug = ["dep:candle-ug"]

[[bench]]
name = "bench_main"
harness = false

[[example]]
name = "metal_basics"
required-features = ["metal"]

[[example]]
name = "cuda_basics"
required-features = ["cuda"]


================================================
FILE: candle-core/LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: candle-core/README.md
================================================
# candle
Minimalist ML framework for Rust


================================================
FILE: candle-core/benches/bench_main.rs
================================================
mod benchmarks;

use criterion::criterion_main;

// Benchmark entry point: hands every `benches` group exported by the
// submodules of `benchmarks` to Criterion's `criterion_main!` macro.
criterion_main!(
    benchmarks::affine::benches,
    benchmarks::binary::benches,
    benchmarks::broadcast::benches,
    benchmarks::copy::benches,
    benchmarks::conv_transpose2d::benches,
    benchmarks::matmul::benches,
    benchmarks::qmatmul::benches,
    benchmarks::random::benches,
    benchmarks::reduce::benches,
    benchmarks::unary::benches,
    benchmarks::where_cond::benches,
);


================================================
FILE: candle-core/benches/benchmarks/affine.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Calls `Tensor::affine` on `a` with the constants `12.34` and `56.78`,
/// discarding the result.
///
/// Panics if the operation returns an error.
fn run(a: &Tensor) {
    let _ = a.affine(12.34, 56.78).unwrap();
}

/// Registers a Criterion group that times repeated [`run`] calls on a
/// zero-filled `(1, 1024, 1024)` tensor of the given `dtype`.
///
/// The group name comes from `device.bench_name(name)` and the throughput is
/// the tensor's size in bytes. Each sample runs `iters` calls and then syncs
/// the device once before reading the elapsed time.
fn run_affine_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    let (batch, rows, cols) = (1, 1024, 1024);
    let input = Tensor::zeros((batch, rows, cols), dtype, device).unwrap();
    let bytes = batch * rows * cols * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(bytes as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| run(black_box(&input)));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_affine_benchmark(c, &device, DType::F32, "affine_f32");
        run_affine_benchmark(c, &device, DType::F16, "affine_f16");
        run_affine_benchmark(c, &device, DType::BF16, "affine_bf16");
        #[cfg(not(feature = "metal"))]
        run_affine_benchmark(c, &device, DType::F8E4M3, "affine_fp8");
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/binary.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Multiplies `lhs` by `rhs` via `Tensor::mul` and returns the product.
///
/// Panics if the multiplication returns an error.
fn run(lhs: &Tensor, rhs: &Tensor) -> Tensor {
    let product = lhs.mul(rhs);
    product.unwrap()
}

/// Benchmarks [`run`] (a tensor multiplication) on two `(1, 1024, 1024)`
/// operands of the given `dtype`.
///
/// Despite the `unary` in its name, this function drives a two-operand op.
/// Both operands are an `arange` ramp converted to `dtype`; the throughput is
/// the combined byte size of the two operands.
fn run_unary_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    let (batch, rows, cols) = (1, 1024, 1024);
    let elem_count = batch * rows * cols;

    // Both operands are built the same way: ramp -> cast -> reshape to 3D.
    let make_operand = || {
        let ramp = Tensor::arange(0.0f32, elem_count as f32, device).unwrap();
        let cast = ramp.to_dtype(dtype).unwrap();
        cast.reshape((batch, rows, cols)).unwrap()
    };
    let lhs = make_operand();
    let rhs = make_operand();

    let bytes = 2 * elem_count * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(bytes as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| {
                run(black_box(&lhs), black_box(&rhs));
            });
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        for dtype in [DType::F32, DType::BF16, DType::F16] {
            let name = format!("binary_mul_{dtype:?}");
            run_unary_benchmark(c, &device, dtype, &name);
        }
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/broadcast.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Computes `w.broadcast_add(bias)` and discards the result.
///
/// Panics if the addition returns an error.
fn run(w: &Tensor, bias: &Tensor) {
    drop(w.broadcast_add(bias).unwrap());
}

/// Benchmarks `broadcast_add` of a `(1, bias_size, 1, 1)` bias onto a
/// `(batch_size, ch, m, m)` tensor, simulating a candle-nn style
/// conv2d + bias forward pass.
///
/// The group name comes from `device.bench_name(name)`. The reported
/// throughput is the byte size of the broadcast result for `dtype`. Each
/// sample runs `iters` calls and syncs the device once before reading the
/// elapsed time.
fn run_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    // We simulate a candle-nn style conv2d + bias forward pass.
    let batch_size: usize = 1;
    let ch: usize = 1;
    let m: usize = 126;
    let bias_size: usize = 128;

    let x = Tensor::ones((batch_size, ch, m, m), dtype, device).unwrap();
    let bias = Tensor::ones((1, bias_size, 1, 1), dtype, device).unwrap();

    // Broadcasting (batch_size, ch, m, m) with (1, bias_size, 1, 1) yields
    // (batch_size, max(ch, bias_size), m, m). The previous formula counted `m`
    // only once, under-reporting the bytes per iteration by a factor of `m`.
    let out_channels = std::cmp::max(ch, bias_size);
    let bytes = batch_size * out_channels * m * m * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(bytes as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(black_box(&x), black_box(&bias));
            }
            // Sync once so the measured time covers the work issued above.
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_benchmark(c, &device, DType::F32, "broadcast_add_f32");
        run_benchmark(c, &device, DType::F16, "broadcast_add_f16");
        run_benchmark(c, &device, DType::BF16, "broadcast_add_bf16");
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/conv_transpose2d.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Calls `x.conv_transpose2d` with kernel `k` and the given padding,
/// output padding, stride and dilation, discarding the result.
///
/// Panics if the operation returns an error.
fn run(
    x: &Tensor,
    k: &Tensor,
    padding: usize,
    output_padding: usize,
    stride: usize,
    dilation: usize,
) {
    let output = x.conv_transpose2d(k, padding, output_padding, stride, dilation);
    drop(output.unwrap());
}

/// Benchmarks [`run`] with a `(1, 4, 50, 50)` input and a `(4, 1, 5, 5)`
/// kernel, both `arange` ramps converted to `dtype`.
///
/// The call uses padding 1, output padding 0, stride 1 and dilation 2. The
/// throughput is the input tensor's byte size for `dtype`.
fn run_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    // Builds `arange(0, end)`, reshapes it to `dims`, then converts to `dtype`.
    let ramp = |end: f32, dims: (usize, usize, usize, usize)| {
        let flat = Tensor::arange(0.0f32, end, device).unwrap();
        let shaped = flat.reshape(dims).unwrap();
        shaped.to_dtype(dtype).unwrap()
    };
    let input = ramp(10000.0, (1, 4, 50, 50));
    let weights = ramp(100.0, (4, 1, 5, 5));

    let input_elems = input.dims().iter().product::<usize>();
    let bytes = input_elems * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(bytes as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| run(black_box(&input), black_box(&weights), 1, 0, 1, 2));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_benchmark(c, &device, DType::F32, "conv_transpose2d_f32");
        run_benchmark(c, &device, DType::F16, "conv_transpose2d_f16");
        run_benchmark(c, &device, DType::BF16, "conv_transpose2d_bf16");
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/copy.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{Device, Tensor, WithDType};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Benchmarks building a tensor on `device` from a nested `Vec` of zeros
/// shaped `128 x 1 x 1024` with element type `D`.
///
/// For each sample, `iters` clones of the nested vector are prepared before
/// the timer starts; the timed section creates one tensor per clone and then
/// syncs the device once. The throughput is the mask size in bytes.
fn run_copy_mask_benchmark<D: WithDType>(c: &mut Criterion, device: &Device, name: &str) {
    let (batch_size, in_seq_len, kv_seq_len) = (128, 1, 1024);

    let row = vec![D::zero(); kv_seq_len];
    let attn_mask = vec![vec![row; in_seq_len]; batch_size];
    let mask_elems = batch_size * in_seq_len * kv_seq_len;
    let size_in_bytes = mask_elems * D::DTYPE.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(size_in_bytes as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            // Cloning happens before the clock starts so it is not measured.
            let pending = vec![attn_mask.clone(); iters as usize];
            let began = Instant::now();
            for mask in pending {
                black_box(Tensor::new(black_box(mask), device).unwrap());
            }
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_copy_mask_benchmark::<f32>(c, &device, "copy_mask");
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/matmul.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Matmul benchmark shapes covering common GEMM scenarios.
///
/// Each entry is `(label, left operand shape, right operand shape)`; the label
/// is appended to `matmul_` to form the benchmark group name.
const MATMUL_SHAPES: &[(&str, &[usize], &[usize])] = &[
    // Original GEMV test
    ("gemv", &[1, 1, 2048], &[1, 2048, 2048]),
    // 4D Attention scenarios (multi-head attention)
    ("attn_4d_small", &[484, 6, 144, 32], &[484, 6, 32, 144]),
    ("attn_4d_large", &[121, 24, 144, 32], &[121, 24, 32, 144]),
    // Square matrix tests
    ("square_512", &[512, 512], &[512, 512]),
    ("square_1024", &[1024, 1024], &[1024, 1024]),
    // 3D Batch matmul (attention patterns)
    ("batch_1000", &[1000, 144, 32], &[1000, 32, 144]),
    // 2D Linear layer scenarios (transformer FFN)
    ("linear_large", &[17424, 768], &[768, 3072]),
];

/// Computes `a.broadcast_matmul(b)` and discards the result.
///
/// Panics if the matmul returns an error.
fn run(a: &Tensor, b: &Tensor) {
    drop(a.broadcast_matmul(b).unwrap());
}

/// Returns `2 * batch * m * k * n` for a matmul of `shape_a` by `shape_b`.
///
/// `m` and `k` are the last two dims of `shape_a`, `n` is the last dim of
/// `shape_b`, and `batch` is the product of the leading dims of `shape_a`
/// (everything but the last two). A `batch` of zero is replaced by one.
///
/// Panics if `shape_a` has fewer than two dims or `shape_b` is empty.
fn calculate_flops(shape_a: &[usize], shape_b: &[usize]) -> usize {
    let rank_a = shape_a.len();
    let (m, k) = (shape_a[rank_a - 2], shape_a[rank_a - 1]);
    let n = shape_b[shape_b.len() - 1];

    // An empty prefix multiplies to 1; a zero-sized leading dim is clamped to 1.
    let leading = &shape_a[..rank_a.saturating_sub(2)];
    let batch = match leading.iter().product::<usize>() {
        0 => 1,
        dims => dims,
    };
    2 * batch * m * k * n
}

/// Benchmarks [`run`] on zero-filled F32 tensors of shapes `shape_a` and
/// `shape_b`, in a group named `matmul_{name}` (prefixed per device).
///
/// The value from `calculate_flops` is reported through `Throughput::Bytes`.
fn run_bench(c: &mut Criterion, device: &Device, name: &str, shape_a: &[usize], shape_b: &[usize]) {
    let flops = calculate_flops(shape_a, shape_b);

    let dtype = DType::F32;
    let zeros = |shape: &[usize]| Tensor::zeros(shape, dtype, device).unwrap();
    let lhs = zeros(shape_a);
    let rhs = zeros(shape_b);

    let mut group = c.benchmark_group(device.bench_name(format!("matmul_{name}")));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| run(black_box(&lhs), black_box(&rhs)));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        for (name, shape_a, shape_b) in MATMUL_SHAPES {
            run_bench(c, &device, name, shape_a, shape_b);
        }
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/mod.rs
================================================
pub(crate) mod affine;
pub(crate) mod binary;
pub(crate) mod broadcast;
pub(crate) mod conv_transpose2d;
pub(crate) mod copy;
pub(crate) mod matmul;
pub(crate) mod qmatmul;
pub(crate) mod random;
pub(crate) mod reduce;
pub(crate) mod unary;
pub(crate) mod where_cond;

use candle_core::{Device, Result};

/// Helpers the benchmarks need from a device.
pub(crate) trait BenchDevice {
    /// Synchronizes with the device; the benchmarks call this after issuing
    /// their timed work and before reading the elapsed time.
    fn sync(&self) -> Result<()>;

    /// Builds a benchmark group name from `name`, prefixed per device kind.
    fn bench_name<S: Into<String>>(&self, name: S) -> String;
}

impl BenchDevice for Device {
    /// Synchronizes with the device.
    ///
    /// Returns `Ok(())` immediately for the CPU. For CUDA and Metal it calls
    /// the backend's `synchronize` / `wait_until_completed`, and panics if the
    /// matching cargo feature is not enabled.
    fn sync(&self) -> Result<()> {
        match self {
            Device::Cpu => Ok(()),
            Device::Cuda(device) => {
                // Exactly one of the two cfg-gated statements below is compiled in.
                #[cfg(feature = "cuda")]
                {
                    use candle_core::backend::BackendDevice;
                    return Ok(device.synchronize()?);
                }
                #[cfg(not(feature = "cuda"))]
                panic!("Cuda device without cuda feature enabled: {device:?}")
            }
            Device::Metal(device) => {
                // Exactly one of the two cfg-gated statements below is compiled in.
                #[cfg(feature = "metal")]
                return device.wait_until_completed();
                #[cfg(not(feature = "metal"))]
                panic!("Metal device without metal feature enabled: {device:?}")
            }
        }
    }

    /// Prefixes `name` with the device kind: `cuda_`, `metal_`, or for the CPU
    /// one of `accelerate_`, `mkl_` or `cpu_` depending on enabled features.
    fn bench_name<S: Into<String>>(&self, name: S) -> String {
        match self {
            Device::Cpu => {
                // `accelerate` takes precedence over `mkl` when both are enabled.
                let cpu_type = if cfg!(feature = "accelerate") {
                    "accelerate"
                } else if cfg!(feature = "mkl") {
                    "mkl"
                } else {
                    "cpu"
                };
                format!("{}_{}", cpu_type, name.into())
            }
            Device::Cuda(_) => format!("cuda_{}", name.into()),
            Device::Metal(_) => format!("metal_{}", name.into()),
        }
    }
}

/// Holds the list of devices the benchmarks iterate over.
struct BenchDeviceHandler {
    // Devices to benchmark on; filled in by `BenchDeviceHandler::new`.
    devices: Vec<Device>,
}

impl BenchDeviceHandler {
    /// Builds a handler holding a single device chosen from the enabled
    /// features: Metal device 0 if `metal` is on, otherwise CUDA device 0 if
    /// `cuda` is on, otherwise the CPU.
    ///
    /// Returns an error if the selected accelerator device cannot be created.
    pub fn new() -> Result<Self> {
        let device = if cfg!(feature = "metal") {
            Device::new_metal(0)?
        } else if cfg!(feature = "cuda") {
            Device::new_cuda(0)?
        } else {
            Device::Cpu
        };
        Ok(Self {
            devices: vec![device],
        })
    }
}


================================================
FILE: candle-core/benches/benchmarks/qmatmul.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{
    quantized::{self, GgmlDType, QMatMul},
    Device, Module, Tensor,
};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Runs `matmul.forward(x)` and discards the result.
///
/// Panics if the forward pass returns an error.
fn run(matmul: &QMatMul, x: &Tensor) {
    drop(matmul.forward(x).unwrap());
}

fn run_bench(c: &mut Criterion, device: &Device, dtype: GgmlDType) {
    let b = 1;
    let m = 1;
    let n = 1024;
    let k = 1024;

    let lhs = (0..(m * k))
        .map(|v| v as f32 / (m * k) as f32)
        .collect::<Vec<_>>();
    let rhs = (0..(k * n))
        .map(|v| v as f32 / (n * k) as f32)
        .collect::<Vec<_>>();

    let lhs = Tensor::from_slice(&lhs, (m, k), device).unwrap();
    let rhs = Tensor::from_slice(&rhs, (k, n), device).unwrap();

    let qtensor = quantized::QTensor::quantize(&rhs.t().unwrap(), dtype).unwrap();
    let matmul = quantized::QMatMul::from_qtensor(qtensor).unwrap();

    let flops = b * m * n * k;

    let mut group = c.benchmark_group(device.bench_name(format!("qmatmul_{dtype:?}")));
    group.sample_size(200);
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(black_box(&matmul), black_box(&lhs));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        for dtype in [
            GgmlDType::F32,
            GgmlDType::F16,
            GgmlDType::Q4_0,
            GgmlDType::Q4_1,
            GgmlDType::Q5_0,
            GgmlDType::Q5_1,
            GgmlDType::Q8_0,
            GgmlDType::Q2K,
            GgmlDType::Q3K,
            GgmlDType::Q4K,
            GgmlDType::Q5K,
            GgmlDType::Q6K,
        ] {
            run_bench(c, &device, dtype);
        }
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/random.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Calls `a.rand_like(-1.0, 123.0)` and discards the result.
///
/// Panics if the call returns an error.
fn rand_uniform(a: &Tensor) {
    drop(a.rand_like(-1.0, 123.0).unwrap());
}

/// Calls `a.randn_like(100.0, 15.0)` and discards the result.
///
/// Panics if the call returns an error.
fn rand_normal(a: &Tensor) {
    drop(a.randn_like(100.0, 15.0).unwrap());
}

/// Registers two benchmark groups, `random_uniform` and `random_normal`, each
/// timing random generation shaped like a `(1, 2048, 2048)` F32 tensor.
///
/// Both groups report the template tensor's byte size as throughput. A fresh
/// zero-filled template is allocated for each group.
fn run_random_bench(c: &mut Criterion, device: &Device) {
    let dtype = DType::F32;
    let (batch, rows, cols) = (1, 2048, 2048);
    let bytes = batch * rows * cols * dtype.size_in_bytes();

    // Uniform sampling.
    let template = Tensor::zeros((batch, rows, cols), dtype, device).unwrap();
    let mut uniform_group = c.benchmark_group(device.bench_name("random_uniform"));
    uniform_group.throughput(Throughput::Bytes(bytes as u64));
    uniform_group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| rand_uniform(black_box(&template)));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    uniform_group.finish();

    // Normal sampling.
    let template = Tensor::zeros((batch, rows, cols), dtype, device).unwrap();
    let mut normal_group = c.benchmark_group(device.bench_name("random_normal"));
    normal_group.throughput(Throughput::Bytes(bytes as u64));
    normal_group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| rand_normal(black_box(&template)));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    normal_group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_random_bench(c, &device);
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/reduce.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use half::{bf16, f16};
use std::hint::black_box;
use std::time::Instant;

/// Calls `a.sum_keepdim(2)` and discards the result.
///
/// Panics if the reduction returns an error.
fn run_sum(a: &Tensor) {
    drop(a.sum_keepdim(2).unwrap());
}
/// Calls `a.argmin_keepdim(2)` and discards the result.
///
/// Panics if the reduction returns an error.
fn run_arg_min(a: &Tensor) {
    drop(a.argmin_keepdim(2).unwrap());
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    let (lo, up) = (-1000.0f32, 1000.0f32);
    for device in handler.devices {
        run_reduce(c, &device, (lo, up), false);
        run_reduce(c, &device, (f16::from_f32(lo), f16::from_f32(up)), false);
        run_reduce(c, &device, (bf16::from_f32(lo), bf16::from_f32(up)), false);

        run_arg_reduce(c, &device, (lo, up), false);
        run_arg_reduce(c, &device, (f16::from_f32(lo), f16::from_f32(up)), false);
        run_arg_reduce(c, &device, (bf16::from_f32(lo), bf16::from_f32(up)), false);

        run_reduce(c, &device, (lo, up), true);
        run_reduce(c, &device, (f16::from_f32(lo), f16::from_f32(up)), true);
        run_reduce(c, &device, (bf16::from_f32(lo), bf16::from_f32(up)), true);

        run_arg_reduce(c, &device, (lo, up), true);
        run_arg_reduce(c, &device, (f16::from_f32(lo), f16::from_f32(up)), true);
        run_arg_reduce(c, &device, (bf16::from_f32(lo), bf16::from_f32(up)), true);
    }
}

/// Benchmarks [`run_sum`] on a random tensor created with shape
/// `(1, 1024, 1024)` and values drawn between `lo` and `up`.
///
/// When `strided` is set, dims 0 and 2 of the tensor are transposed before
/// benchmarking. The group name is `reduce_<dtype>` with a `_strided` suffix
/// in the strided case, or `unknown` for dtypes other than F32/F16/BF16.
fn run_reduce<T: candle_core::FloatDType>(
    c: &mut Criterion,
    device: &Device,
    (lo, up): (T, T),
    strided: bool,
) {
    let (batch, rows, cols) = (1, 1024, 1024);

    let dense = Tensor::rand(lo, up, (batch, rows, cols), device).unwrap();
    let input = if strided {
        dense.transpose(0, 2).unwrap()
    } else {
        dense
    };

    let bytes = batch * rows * cols * T::DTYPE.size_in_bytes();

    let name = match (T::DTYPE, strided) {
        (DType::F32, true) => "reduce_f32_strided",
        (DType::F32, false) => "reduce_f32",
        (DType::F16, true) => "reduce_f16_strided",
        (DType::F16, false) => "reduce_f16",
        (DType::BF16, true) => "reduce_bf16_strided",
        (DType::BF16, false) => "reduce_bf16",
        _ => "unknown",
    };

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(bytes as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| run_sum(black_box(&input)));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

/// Benchmarks [`run_arg_min`] on a random tensor created with shape
/// `(1, 1024, 1024)` and values drawn between `lo` and `up`.
///
/// When `strided` is set, dims 0 and 2 of the tensor are transposed before
/// benchmarking. The group name is `arg_reduce_<dtype>` with a `_strided`
/// suffix in the strided case, or `unknown` for dtypes other than
/// F32/F16/BF16.
fn run_arg_reduce<T: candle_core::FloatDType>(
    c: &mut Criterion,
    device: &Device,
    (lo, up): (T, T),
    strided: bool,
) {
    let (batch, rows, cols) = (1, 1024, 1024);

    let dense = Tensor::rand(lo, up, (batch, rows, cols), device).unwrap();
    let input = if strided {
        dense.transpose(0, 2).unwrap()
    } else {
        dense
    };

    let bytes = batch * rows * cols * T::DTYPE.size_in_bytes();

    let name = match (T::DTYPE, strided) {
        (DType::F32, true) => "arg_reduce_f32_strided",
        (DType::F32, false) => "arg_reduce_f32",
        (DType::F16, true) => "arg_reduce_f16_strided",
        (DType::F16, false) => "arg_reduce_f16",
        (DType::BF16, true) => "arg_reduce_bf16_strided",
        (DType::BF16, false) => "arg_reduce_bf16",
        _ => "unknown",
    };

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(bytes as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| run_arg_min(black_box(&input)));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/unary.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Calls `a.sqrt()` and discards the result.
///
/// Panics if the operation returns an error.
fn run_sqrt(a: &Tensor) {
    drop(a.sqrt().unwrap());
}

/// Benchmarks [`run_sqrt`] on a `(1, 1024, 1024)` tensor holding an `arange`
/// ramp converted to `dtype`.
///
/// The throughput is the tensor's byte size for `dtype`.
fn run_unary_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    let (batch, rows, cols) = (1, 1024, 1024);
    let elem_count = batch * rows * cols;

    let ramp = Tensor::arange(0.0f32, elem_count as f32, device).unwrap();
    let cast = ramp.to_dtype(dtype).unwrap();
    let input = cast.reshape((batch, rows, cols)).unwrap();

    let bytes = elem_count * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(bytes as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| run_sqrt(black_box(&input)));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

/// Converts `a` to `dtype` via `Tensor::to_dtype` and discards the result.
///
/// Panics if the conversion returns an error.
fn run_cast(a: &Tensor, dtype: DType) {
    drop(a.to_dtype(dtype).unwrap());
}

/// Benchmarks [`run_cast`] from `dtype` to `to_dtype` on a `(1, 1024, 1024)`
/// tensor holding an `arange` ramp.
///
/// The throughput is the source tensor's byte size (computed from `dtype`).
fn run_cast_benchmark(
    c: &mut Criterion,
    device: &Device,
    dtype: DType,
    to_dtype: DType,
    name: &str,
) {
    let (batch, rows, cols) = (1, 1024, 1024);
    let elem_count = batch * rows * cols;

    let ramp = Tensor::arange(0.0f32, elem_count as f32, device).unwrap();
    let cast = ramp.to_dtype(dtype).unwrap();
    let source = cast.reshape((batch, rows, cols)).unwrap();

    let bytes = elem_count * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(bytes as u64));
    group.bench_function("iter", move |bencher| {
        bencher.iter_custom(|iters| {
            let began = Instant::now();
            (0..iters).for_each(|_| run_cast(black_box(&source), black_box(to_dtype)));
            device.sync().unwrap();
            began.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        for dtype in [DType::F32, DType::BF16, DType::F16] {
            let to_dtype = if matches!(dtype, DType::F32) {
                DType::F16
            } else {
                DType::F32
            };
            let name = format!("cast_{}_{}", dtype.as_str(), to_dtype.as_str());
            run_cast_benchmark(c, &device, dtype, to_dtype, &name);
        }
        for dtype in [DType::F32, DType::BF16, DType::F16] {
            let name = format!("sqrt_{dtype:?}");
            run_unary_benchmark(c, &device, dtype, &name);
        }
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/benches/benchmarks/where_cond.rs
================================================
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{criterion_group, Criterion, Throughput};
use std::hint::black_box;
use std::time::Instant;

/// Calls `a.where_cond(b, c)` and discards the result.
///
/// Panics if the operation returns an error.
fn run(a: &Tensor, b: &Tensor, c: &Tensor) {
    drop(a.where_cond(b, c).unwrap());
}

/// Builds an `N`-byte array alternating `0, 1, 0, 1, ...`: element `i` is
/// `i % 2`. Usable in const contexts.
const fn create_cond_arr<const N: usize>() -> [u8; N] {
    let mut pattern = [0u8; N];
    // `for` loops are not available in `const fn`, hence the manual countdown.
    let mut remaining = N;
    while remaining > 0 {
        remaining -= 1;
        // The low bit is the parity of the index.
        pattern[remaining] = (remaining & 1) as u8;
    }
    pattern
}

// Dimensions of the benchmark tensors: shape (B, M, K).
const B: usize = 1;
const M: usize = 1024;
const K: usize = 1024;
// Total number of elements in one tensor.
const SIZE: usize = B * M * K;

// Condition bytes alternating 0/1, computed at compile time.
static DATA: [u8; SIZE] = create_cond_arr::<SIZE>();

fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), device).unwrap();
    let on_true = Tensor::ones((B, M, K), dtype, device).unwrap();
    let on_false = Tensor::zeros((B, M, K), dtype, device).unwrap();

    let elements = B * M * K;
    // E.g. 2 f32 tensors + 1 u8 tensor
    let flops = (2 * elements * dtype.size_in_bytes()) + elements;

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(
                    black_box(&tensor),
                    black_box(&on_true),
                    black_box(&on_false),
                );
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let device = BenchDeviceHandler::new().unwrap();
    for d in device.devices {
        run_where_cond_benchmark(c, &d, DType::F32, "where_cond_f32");
        run_where_cond_benchmark(c, &d, DType::BF16, "where_cond_bf16");
        run_where_cond_benchmark(c, &d, DType::F16, "where_cond_f16");
    }
}

criterion_group!(benches, criterion_benchmark);


================================================
FILE: candle-core/examples/basics.rs
================================================
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

#[cfg(feature = "accelerate")]
extern crate accelerate_src;

use anyhow::Result;
use candle_core::{Device, Tensor};

/// Demonstrates `slice_scatter` on the CPU: writes a 2x1 column into a copy of
/// a 2x3 tensor at index 2 along dim 1, and checks the source is unchanged.
fn main() -> Result<()> {
    let cpu = Device::Cpu;
    let a = Tensor::new(&[[0.0f32, 1.0, 2.0], [3.0, 4.0, 5.0]], &cpu)?;
    let column = Tensor::new(&[[88.0f32], [99.0]], &cpu)?;
    let patched = a.slice_scatter(&column, 1, 2)?;

    // The source tensor is left as it was.
    let source_rows = a.to_vec2::<f32>()?;
    assert_eq!(source_rows, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);

    // The last column of the result holds the scattered values.
    let patched_rows = patched.to_vec2::<f32>()?;
    assert_eq!(patched_rows, [[0.0, 1.0, 88.0], [3.0, 4.0, 99.0]]);
    Ok(())
}


================================================
FILE: candle-core/examples/cuda_basics.rs
================================================
#[cfg(feature = "accelerate")]
extern crate accelerate_src;

#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

use anyhow::Result;
use candle_core::{Device, Tensor};
// Reference case: xs: [1024, 64, 1924], c Tensor[dims 128, 64, 8; f32, cuda:0]
// Conv1dConfig { padding: 0, stride: 4, dilation: 1, groups: 1 }

/// Times `conv1d` on CUDA device 0: one untimed call first, then 20 timed
/// calls, each followed by a device synchronize before printing the elapsed
/// time.
fn main() -> Result<()> {
    let device = Device::new_cuda(0)?;
    let input = Tensor::randn(0f32, 1.0, (1024, 64, 1924), &device)?;
    let kernel = Tensor::randn(0f32, 1.0, (128, 64, 8), &device)?;

    // Untimed first call; its output is released immediately.
    drop(input.conv1d(&kernel, 0, 4, 1, 1)?);

    for _ in 0..20 {
        let began = std::time::Instant::now();
        let _output = input.conv1d(&kernel, 0, 4, 1, 1)?;
        device.synchronize()?;
        println!("conv1d: {:?}", began.elapsed());
    }
    Ok(())
}


================================================
FILE: candle-core/examples/cuda_sum_benchmark.rs
================================================
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

#[cfg(feature = "accelerate")]
extern crate accelerate_src;

use std::str::FromStr;

use anyhow::Result;
use candle_core::{Device, Tensor};

/// Builds an `(n, 6n)` tensor on `device` as the product of an `(n, 1)` column
/// of `|cos(t)|` and a `(1, 6n)` row made of six copies of `|sin(t)|`, where
/// `t = i / n` for `i` in `0..n`.
fn cos_sin(n: usize, device: &Device) -> Result<Tensor> {
    let angle = |i: usize| i as f32 / n as f32;
    let cos_abs: Vec<f32> = (0..n).map(|i| angle(i).cos().abs()).collect();
    let sin_abs: Vec<f32> = (0..n).map(|i| angle(i).sin().abs()).collect();

    let column = Tensor::from_vec(cos_abs, (n, 1), device)?;
    let row = Tensor::from_vec(sin_abs, (1, n), device)?;
    // Repeat the row six times along dim 1.
    let repeated = [&row; 6];
    let wide_row = Tensor::cat(&repeated, 1)?;

    let product = column.matmul(&wide_row)?;
    Ok(product)
}

/// Compares `sum_keepdim` on CPU and CUDA device 0, then times 100 full
/// reductions on the CUDA tensor.
///
/// The problem size `n` is read from the first command-line argument and
/// defaults to 2000. The summed scalars are accumulated into `v`, and the
/// timing line is printed only when `v` is positive.
fn main() -> Result<()> {
    let device = Device::new_cuda(0)?;
    let args: Vec<String> = std::env::args().collect();
    let n = match args.get(1) {
        Some(arg) => usize::from_str(arg)?,
        None => 2000usize,
    };

    let xys_cpu = cos_sin(n, &Device::Cpu)?;
    let xys = cos_sin(n, &device)?;
    println!("{xys_cpu:?} {xys:?}");

    let sum_keepdim_cpu = xys_cpu.sum_keepdim(1)?;
    println!("{sum_keepdim_cpu}");
    let sum_keepdim = xys.sum_keepdim(1)?;
    println!("{sum_keepdim}");

    let n_iters = 100;
    let start = std::time::Instant::now();
    let mut v = 0f32;
    for _ in 0..n_iters {
        // Reduce dim 1, then dim 0, then read back the single value.
        let total: f32 = xys
            .sum_keepdim(1)?
            .sum_keepdim(0)?
            .reshape(&[])?
            .to_scalar()?;
        v += total;
    }
    let elapsed = start.elapsed();

    if v > 0. {
        println!(
            "ran {n_iters} iterations, time per iter: {:?} ({v})",
            elapsed.div_f64(n_iters as f64)
        );
    }
    Ok(())
}


================================================
FILE: candle-core/examples/metal_basics.rs
================================================
#[cfg(feature = "accelerate")]
extern crate accelerate_src;

#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

use anyhow::Result;
use candle_core::{Device, Tensor};

/// Runs one tensor addition on Metal device 0 while a capture to `/tmp/candle.gputrace` is
/// active.
fn main() -> Result<()> {
    // This requires the code to be run with MTL_CAPTURE_ENABLED=1
    let device = Device::new_metal(0)?;
    let metal_device = if let Device::Metal(inner) = &device {
        inner
    } else {
        anyhow::bail!("unexpected device")
    };
    metal_device.capture("/tmp/candle.gputrace")?;
    // Synchronizing here makes sure a fresh command buffer is created once the capture scope
    // has been set up.
    device.synchronize()?;
    let x = Tensor::randn(0f32, 1.0, (128, 128), &device)?;
    let x1 = x.add(&x)?;
    println!("{x1:?}");
    // Synchronizing again makes sure the command buffer is committed before the capture scope
    // ends.
    device.synchronize()?;
    Ok(())
}


================================================
FILE: candle-core/src/accelerate.rs
================================================
#![allow(dead_code)]
use libc::{c_char, c_double, c_float, c_int, c_long, c_ulong};

/// Raw `extern "C"` declarations for the Accelerate routines used by the safe wrappers in this
/// file. Everything here is unsafe to call directly; the wrappers below perform the length
/// checks that are done at all.
mod ffi {
    use super::*;
    extern "C" {
        // It would be nice to be able to switch to the NEWLAPACK version of the function but this
        // seems to trigger some link error. Available function names can be seen here:
        // /Library/Developer/CommandLineTools/SDKs/MacOSX13.3.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Accelerate.tbd

        // Single-precision matrix multiply, linked under the Fortran-style symbol `sgemm_`.
        // Every scalar argument is passed by pointer.
        #[link_name = "sgemm_"]
        pub fn sgemm_ffi(
            transa: *const c_char,
            transb: *const c_char,
            m: *const c_int,
            n: *const c_int,
            k: *const c_int,
            alpha: *const c_float,
            a: *const c_float,
            lda: *const c_int,
            b: *const c_float,
            ldb: *const c_int,
            beta: *const c_float,
            c: *mut c_float,
            ldc: *const c_int,
        );
        // Double-precision counterpart of `sgemm_ffi`, linked as `dgemm_`.
        #[link_name = "dgemm_"]
        pub fn dgemm_ffi(
            transa: *const c_char,
            transb: *const c_char,
            m: *const c_int,
            n: *const c_int,
            k: *const c_int,
            alpha: *const c_double,
            a: *const c_double,
            lda: *const c_int,
            b: *const c_double,
            ldb: *const c_int,
            beta: *const c_double,
            c: *mut c_double,
            ldc: *const c_int,
        );

        // Element-wise unary routines. Each takes a destination pointer, a source pointer and a
        // pointer to the element count (a C `int`). The `f`-suffixed names operate on `c_float`,
        // the unsuffixed ones on `c_double`.
        pub fn vvexpf(dst: *mut c_float, src: *const c_float, len: *const c_int);
        pub fn vvexp(dst: *mut c_double, src: *const c_double, len: *const c_int);
        pub fn vvsqrtf(dst: *mut c_float, src: *const c_float, len: *const c_int);
        pub fn vvsqrt(dst: *mut c_double, src: *const c_double, len: *const c_int);
        pub fn vvsinf(dst: *mut c_float, src: *const c_float, len: *const c_int);
        pub fn vvsin(dst: *mut c_double, src: *const c_double, len: *const c_int);
        pub fn vvcosf(dst: *mut c_float, src: *const c_float, len: *const c_int);
        pub fn vvcos(dst: *mut c_double, src: *const c_double, len: *const c_int);
        pub fn vvlogf(dst: *mut c_float, src: *const c_float, len: *const c_int);
        pub fn vvlog(dst: *mut c_double, src: *const c_double, len: *const c_int);
        pub fn vvtanhf(dst: *mut c_float, src: *const c_float, len: *const c_int);
        pub fn vvtanh(dst: *mut c_double, src: *const c_double, len: *const c_int);

        // Element-wise binary vDSP routines. The parameters are left unnamed here; the
        // `binary_op!` macro in this file calls them as
        // (input pointer, stride 1, input pointer, stride 1, output pointer, stride 1, length).
        // The `D`-suffixed names operate on `c_double`, the unsuffixed ones on `c_float`.
        pub fn vDSP_vaddD(
            _: *const c_double,
            _: c_long,
            _: *const c_double,
            _: c_long,
            _: *mut c_double,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vadd(
            _: *const c_float,
            _: c_long,
            _: *const c_float,
            _: c_long,
            _: *mut c_float,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vsubD(
            _: *const c_double,
            _: c_long,
            _: *const c_double,
            _: c_long,
            _: *mut c_double,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vsub(
            _: *const c_float,
            _: c_long,
            _: *const c_float,
            _: c_long,
            _: *mut c_float,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vmulD(
            _: *const c_double,
            _: c_long,
            _: *const c_double,
            _: c_long,
            _: *mut c_double,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vmul(
            _: *const c_float,
            _: c_long,
            _: *const c_float,
            _: c_long,
            _: *mut c_float,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vdivD(
            _: *const c_double,
            _: c_long,
            _: *const c_double,
            _: c_long,
            _: *mut c_double,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vdiv(
            _: *const c_float,
            _: c_long,
            _: *const c_float,
            _: c_long,
            _: *mut c_float,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vminD(
            _: *const c_double,
            _: c_long,
            _: *const c_double,
            _: c_long,
            _: *mut c_double,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vmin(
            _: *const c_float,
            _: c_long,
            _: *const c_float,
            _: c_long,
            _: *mut c_float,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vmaxD(
            _: *const c_double,
            _: c_long,
            _: *const c_double,
            _: c_long,
            _: *mut c_double,
            _: c_long,
            _: c_ulong,
        );
        pub fn vDSP_vmax(
            _: *const c_float,
            _: c_long,
            _: *const c_float,
            _: c_long,
            _: *mut c_float,
            _: c_long,
            _: c_ulong,
        );
    }
}

/// Thin wrapper over the `sgemm_` routine: forwards the `f32` operands `a`, `b` and the output
/// `c` together with the transpose flags, dimensions, leading dimensions and scaling factors.
///
/// # Safety
/// No bounds are checked here. The raw pointers of `a`, `b` and `c` are handed to the foreign
/// routine as-is, so the caller must make sure each slice is large enough for the `m`, `n`,
/// `k`, `lda`, `ldb` and `ldc` values passed in.
#[allow(clippy::too_many_arguments)]
#[inline]
pub unsafe fn sgemm(
    transa: u8,
    transb: u8,
    m: i32,
    n: i32,
    k: i32,
    alpha: f32,
    a: &[f32],
    lda: i32,
    b: &[f32],
    ldb: i32,
    beta: f32,
    c: &mut [f32],
    ldc: i32,
) {
    // The foreign routine takes every scalar by pointer, so give the converted flags a home.
    let transa_flag = transa as c_char;
    let transb_flag = transb as c_char;
    let (a_ptr, b_ptr, c_ptr) = (a.as_ptr(), b.as_ptr(), c.as_mut_ptr());
    ffi::sgemm_ffi(
        &transa_flag,
        &transb_flag,
        &m,
        &n,
        &k,
        &alpha,
        a_ptr,
        &lda,
        b_ptr,
        &ldb,
        &beta,
        c_ptr,
        &ldc,
    )
}

/// Thin wrapper over the `dgemm_` routine: forwards the `f64` operands `a`, `b` and the output
/// `c` together with the transpose flags, dimensions, leading dimensions and scaling factors.
///
/// # Safety
/// No bounds are checked here. The raw pointers of `a`, `b` and `c` are handed to the foreign
/// routine as-is, so the caller must make sure each slice is large enough for the `m`, `n`,
/// `k`, `lda`, `ldb` and `ldc` values passed in.
#[allow(clippy::too_many_arguments)]
#[inline]
pub unsafe fn dgemm(
    transa: u8,
    transb: u8,
    m: i32,
    n: i32,
    k: i32,
    alpha: f64,
    a: &[f64],
    lda: i32,
    b: &[f64],
    ldb: i32,
    beta: f64,
    c: &mut [f64],
    ldc: i32,
) {
    // The foreign routine takes every scalar by pointer, so give the converted flags a home.
    let transa_flag = transa as c_char;
    let transb_flag = transb as c_char;
    let (a_ptr, b_ptr, c_ptr) = (a.as_ptr(), b.as_ptr(), c.as_mut_ptr());
    ffi::dgemm_ffi(
        &transa_flag,
        &transb_flag,
        &m,
        &n,
        &k,
        &alpha,
        a_ptr,
        &lda,
        b_ptr,
        &ldb,
        &beta,
        c_ptr,
        &ldc,
    )
}

/// Element-wise exponential over `f32` slices: applies Accelerate's `vvexpf` to `a` and stores
/// the result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vs_exp(a: &[f32], y: &mut [f32]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvexpf(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise exponential over `f64` slices: applies Accelerate's `vvexp` to `a` and stores
/// the result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vd_exp(a: &[f64], y: &mut [f64]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvexp(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise square root over `f32` slices: applies Accelerate's `vvsqrtf` to `a` and stores
/// the result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vs_sqrt(a: &[f32], y: &mut [f32]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvsqrtf(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise square root over `f64` slices: applies Accelerate's `vvsqrt` to `a` and stores
/// the result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vd_sqrt(a: &[f64], y: &mut [f64]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvsqrt(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise sine over `f32` slices: applies Accelerate's `vvsinf` to `a` and stores the
/// result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vs_sin(a: &[f32], y: &mut [f32]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvsinf(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise sine over `f64` slices: applies Accelerate's `vvsin` to `a` and stores the
/// result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vd_sin(a: &[f64], y: &mut [f64]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvsin(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}
/// Element-wise cosine over `f32` slices: applies Accelerate's `vvcosf` to `a` and stores the
/// result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vs_cos(a: &[f32], y: &mut [f32]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvcosf(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise cosine over `f64` slices: applies Accelerate's `vvcos` to `a` and stores the
/// result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vd_cos(a: &[f64], y: &mut [f64]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvcos(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}
/// Element-wise hyperbolic tangent over `f32` slices: applies Accelerate's `vvtanhf` to `a` and
/// stores the result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vs_tanh(a: &[f32], y: &mut [f32]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvtanhf(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise hyperbolic tangent over `f64` slices: applies Accelerate's `vvtanh` to `a` and
/// stores the result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vd_tanh(a: &[f64], y: &mut [f64]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvtanh(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise logarithm over `f32` slices: applies Accelerate's `vvlogf` to `a` and stores
/// the result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vs_ln(a: &[f32], y: &mut [f32]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvlogf(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise logarithm over `f64` slices: applies Accelerate's `vvlog` to `a` and stores the
/// result in `y`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vd_ln(a: &[f64], y: &mut [f64]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    // The foreign routine takes its element count as a C `int`. Feed it pieces of at most
    // `i32::MAX` elements so that the count never wraps for very large slices.
    const MAX_CHUNK: usize = i32::MAX as usize;
    for (src, dst) in a.chunks(MAX_CHUNK).zip(y.chunks_mut(MAX_CHUNK)) {
        let count = src.len() as i32;
        unsafe { ffi::vvlog(dst.as_mut_ptr(), src.as_ptr(), &count) }
    }
}

/// Element-wise square over `f32` slices: stores `a[i] * a[i]` in `y[i]`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vs_sqr(a: &[f32], y: &mut [f32]) {
    let (a_len, y_len) = (a.len(), y.len());
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    for (dst, &src) in y.iter_mut().zip(a) {
        *dst = src * src;
    }
}

/// Element-wise square over `f64` slices: stores `a[i] * a[i]` in `y[i]`.
///
/// # Panics
/// Panics when `a` and `y` have different lengths.
#[inline]
pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
    let (a_len, y_len) = (a.len(), y.len());
    if a_len != y_len {
        panic!("a and y have different lengths {a_len} <> {y_len}")
    }
    for (dst, &src) in y.iter_mut().zip(a) {
        *dst = src * src;
    }
}

/// Applies Accelerate's `vvtanhf` to `y` in place (the same buffer is passed as both source
/// and destination).
#[inline]
pub fn vs_tanh_inplace(y: &mut [f32]) {
    // The element count is a C `int`, so process at most `i32::MAX` elements per call rather
    // than letting `len as i32` wrap for very large slices.
    for chunk in y.chunks_mut(i32::MAX as usize) {
        let count = chunk.len() as i32;
        let ptr = chunk.as_mut_ptr();
        unsafe { ffi::vvtanhf(ptr, ptr as *const f32, &count) }
    }
}

/// Applies Accelerate's `vvtanh` to `y` in place (the same buffer is passed as both source and
/// destination).
#[inline]
pub fn vd_tanh_inplace(y: &mut [f64]) {
    // The element count is a C `int`, so process at most `i32::MAX` elements per call rather
    // than letting `len as i32` wrap for very large slices.
    for chunk in y.chunks_mut(i32::MAX as usize) {
        let count = chunk.len() as i32;
        let ptr = chunk.as_mut_ptr();
        unsafe { ffi::vvtanh(ptr, ptr as *const f64, &count) }
    }
}

/// Applies Accelerate's `vvexpf` to `y` in place (the same buffer is passed as both source and
/// destination).
#[inline]
pub fn vs_exp_inplace(y: &mut [f32]) {
    // The element count is a C `int`, so process at most `i32::MAX` elements per call rather
    // than letting `len as i32` wrap for very large slices.
    for chunk in y.chunks_mut(i32::MAX as usize) {
        let count = chunk.len() as i32;
        let ptr = chunk.as_mut_ptr();
        unsafe { ffi::vvexpf(ptr, ptr as *const f32, &count) }
    }
}

/// Applies Accelerate's `vvexp` to `y` in place (the same buffer is passed as both source and
/// destination).
#[inline]
pub fn vd_exp_inplace(y: &mut [f64]) {
    // The element count is a C `int`, so process at most `i32::MAX` elements per call rather
    // than letting `len as i32` wrap for very large slices.
    for chunk in y.chunks_mut(i32::MAX as usize) {
        let count = chunk.len() as i32;
        let ptr = chunk.as_mut_ptr();
        unsafe { ffi::vvexp(ptr, ptr as *const f64, &count) }
    }
}

/// Tanh-based GELU over `f32` slices:
/// `ys[i] = 0.5 * v * (1 + tanh(sqrt(2 / pi) * v * (1 + 0.044715 * v * v)))` with `v = vs[i]`.
///
/// # Panics
/// Panics when `vs` and `ys` have different lengths. Without this check the `zip` loops would
/// silently stop at the shorter slice while the in-place tanh still rewrote every element of
/// `ys`.
#[inline]
pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
    let vs_len = vs.len();
    let ys_len = ys.len();
    if vs_len != ys_len {
        panic!("vs and ys have different lengths {vs_len} <> {ys_len}")
    }
    // Loop-invariant scale factor.
    let sqrt_two_over_pi = (2.0f32 / std::f32::consts::PI).sqrt();
    // Pass 1: the tanh argument.
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = sqrt_two_over_pi * v * (1.0 + 0.044715 * v * v)
    }
    // Pass 2: vectorised tanh over the whole buffer.
    vs_tanh_inplace(ys);
    // Pass 3: combine with the input.
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = 0.5 * v * (1.0 + *y)
    }
}

/// Tanh-based GELU over `f64` slices:
/// `ys[i] = 0.5 * v * (1 + tanh(sqrt(2 / pi) * v * (1 + 0.044715 * v * v)))` with `v = vs[i]`.
///
/// # Panics
/// Panics when `vs` and `ys` have different lengths. Without this check the `zip` loops would
/// silently stop at the shorter slice while the in-place tanh still rewrote every element of
/// `ys`.
#[inline]
pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
    let vs_len = vs.len();
    let ys_len = ys.len();
    if vs_len != ys_len {
        panic!("vs and ys have different lengths {vs_len} <> {ys_len}")
    }
    // Loop-invariant scale factor.
    let sqrt_two_over_pi = (2.0f64 / std::f64::consts::PI).sqrt();
    // Pass 1: the tanh argument.
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = sqrt_two_over_pi * v * (1.0 + 0.044715 * v * v)
    }
    // Pass 2: vectorised tanh over the whole buffer.
    vd_tanh_inplace(ys);
    // Pass 3: combine with the input.
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = 0.5 * v * (1.0 + *y)
    }
}

/// SiLU over `f32` slices: `ys[i] = v / (1 + exp(-v))` with `v = vs[i]`.
///
/// # Panics
/// Panics when `vs` and `ys` have different lengths. Without this check the `zip` loops would
/// silently stop at the shorter slice while the in-place exp still rewrote every element of
/// `ys`.
#[inline]
pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
    let vs_len = vs.len();
    let ys_len = ys.len();
    if vs_len != ys_len {
        panic!("vs and ys have different lengths {vs_len} <> {ys_len}")
    }
    // Pass 1: negate the input.
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = -v
    }
    // Pass 2: vectorised exp over the whole buffer.
    vs_exp_inplace(ys);
    // Pass 3: divide the input by `1 + exp(-v)`.
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = v / (1.0 + *y)
    }
}

/// SiLU over `f64` slices: `ys[i] = v / (1 + exp(-v))` with `v = vs[i]`.
///
/// # Panics
/// Panics when `vs` and `ys` have different lengths. Without this check the `zip` loops would
/// silently stop at the shorter slice while the in-place exp still rewrote every element of
/// `ys`.
#[inline]
pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {
    let vs_len = vs.len();
    let ys_len = ys.len();
    if vs_len != ys_len {
        panic!("vs and ys have different lengths {vs_len} <> {ys_len}")
    }
    // Pass 1: negate the input.
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = -v
    }
    // Pass 2: vectorised exp over the whole buffer.
    vd_exp_inplace(ys);
    // Pass 3: divide the input by `1 + exp(-v)`.
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = v / (1.0 + *y)
    }
}

macro_rules! binary_op {
    ($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
        #[inline]
        pub fn $fn_name(a: &[$ty], b: &[$ty], y: &mut [$ty]) {
            let a_len = a.len();
            let b_len = b.len();
            let y_len = y.len();
            if a_len != y_len || b_len != y_len {
                panic!(
                    "{} a,b,y len mismatch {a_len} {b_len} {y_len}",
                    stringify!($fn_name)
                );
            }
            unsafe {
                // Weird quirk of accelerate, the rhs comes before the lhs.
                ffi::$accelerate_name(
                    b.as_ptr(),
                    1,
                    a.as_ptr(),
                    1,
                    y.as_mut_ptr(),
                    1,
                    a_len as u64,
                )
            }
        }
    };
}
binary_op!(vs_add, f32, vDSP_vadd);
binary_op!(vd_add, f64, vDSP_vaddD);
binary_op!(vs_sub, f32, vDSP_vsub);
binary_op!(vd_sub, f64, vDSP_vsubD);
binary_op!(vs_mul, f32, vDSP_vmul);
binary_op!(vd_mul, f64, vDSP_vmulD);
binary_op!(vs_div, f32, vDSP_vdiv);
binary_op!(vd_div, f64, vDSP_vdivD);
binary_op!(vs_max, f32, vDSP_vmax);
binary_op!(vd_max, f64, vDSP_vmaxD);
binary_op!(vs_min, f32, vDSP_vmin);
binary_op!(vd_min, f64, vDSP_vminD);


================================================
FILE: candle-core/src/backend.rs
================================================
//! Traits to Define Backend Behavior
//!
use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
use crate::{CpuStorage, DType, Layout, Result, Shape};

/// Storage-level operations that a backend has to provide.
///
/// Most methods take the storage together with one `Layout` per storage operand and return a
/// freshly created storage wrapped in `Result`; the few that take `&mut self` or `&mut Self`
/// write into an existing storage and return `Result<()>`. Parameters are mostly unnamed in
/// this declaration, so their exact meaning has to be checked against the implementations.
pub trait BackendStorage: Sized {
    /// The device type this storage lives on.
    type Device: BackendDevice;

    /// Fallible clone of the storage.
    fn try_clone(&self, _: &Layout) -> Result<Self>;

    /// Element type held by this storage.
    fn dtype(&self) -> DType;

    /// Device that owns this storage.
    fn device(&self) -> &Self::Device;

    // Maybe this should return a Cow instead so that no copy is done on the cpu case.
    /// Converts the storage to a `CpuStorage`.
    fn to_cpu_storage(&self) -> Result<CpuStorage>;

    // NOTE(review): the two `f64` arguments are presumably the multiplier and the additive
    // term, in that order; confirm against the implementations.
    fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self>;

    fn powf(&self, _: &Layout, _: f64) -> Result<Self>;

    fn elu(&self, _: &Layout, _: f64) -> Result<Self>;

    // NOTE(review): the `&[usize]` is presumably the list of dimensions to reduce over; confirm.
    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self>;

    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self>;

    /// Converts the storage to the given `DType`.
    fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self>;

    /// Unary operation selected at compile time through the `UnaryOpT` type parameter.
    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self>;

    /// Binary operation selected at compile time through the `BinaryOpT` type parameter.
    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self>;

    fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self>;

    fn conv1d(
        &self,
        _l: &Layout,
        _kernel: &Self,
        _kernel_l: &Layout,
        _params: &crate::conv::ParamsConv1D,
    ) -> Result<Self>;

    fn conv_transpose1d(
        &self,
        _l: &Layout,
        _kernel: &Self,
        _kernel_l: &Layout,
        _params: &crate::conv::ParamsConvTranspose1D,
    ) -> Result<Self>;

    fn conv2d(
        &self,
        _l: &Layout,
        _kernel: &Self,
        _kernel_l: &Layout,
        _params: &crate::conv::ParamsConv2D,
    ) -> Result<Self>;

    fn conv_transpose2d(
        &self,
        _l: &Layout,
        _kernel: &Self,
        _kernel_l: &Layout,
        _params: &crate::conv::ParamsConvTranspose2D,
    ) -> Result<Self>;

    // NOTE(review): for the pooling methods the two tuples are presumably the kernel size and
    // the stride; confirm against the implementations.
    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self>;
    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self>;
    fn upsample_bilinear2d(
        &self,
        _: &Layout,
        _: usize,
        _: usize,
        _: bool,
        _: Option<f64>,
        _: Option<f64>,
    ) -> Result<Self>;

    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self>;

    /// Scatter that writes into `self` (taken as `&mut self`) rather than returning a new storage.
    fn scatter_set(
        &mut self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: usize,
    ) -> Result<()>;

    /// Scatter-add that writes into `self` (taken as `&mut self`) rather than returning a new
    /// storage.
    fn scatter_add_set(
        &mut self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: usize,
    ) -> Result<()>;

    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self>;
    fn index_add(
        &self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: usize,
    ) -> Result<Self>;

    // NOTE(review): the 4-tuple presumably carries the batch size and the three matmul
    // dimensions; confirm the order against the implementations.
    fn matmul(
        &self,
        _: &Self,
        _: (usize, usize, usize, usize),
        _: &Layout,
        _: &Layout,
    ) -> Result<Self>;

    /// Copies from `self` into the `&mut Self` destination.
    // NOTE(review): the `usize` is presumably the destination offset; confirm.
    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()>;

    #[allow(clippy::too_many_arguments)]
    // Similar to cudaMemcpy2D, though values are in elements and not in bytes.
    fn copy2d(
        &self,
        _: &mut Self,
        _d1: usize,
        _d2: usize,
        _src_stride1: usize,
        _dst_stride1: usize,
        _src_offset: usize,
        _dst_offset: usize,
    ) -> Result<()>;

    /// Sets elements of `self` to the given scalar, in place.
    fn const_set(&mut self, _: crate::scalar::Scalar, _: &Layout) -> Result<()>;
}

/// Device-level operations that a backend has to provide: construction, identification,
/// storage allocation and creation, random number generation and synchronization.
pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
    /// The storage type created by this device.
    type Storage: BackendStorage;

    // TODO: Make the usize generic and part of a generic DeviceLocation.
    /// Creates a device from a `usize` identifier.
    fn new(_: usize) -> Result<Self>;

    /// Location descriptor for this device.
    fn location(&self) -> crate::DeviceLocation;

    /// Whether `self` and the other device are the same device.
    fn same_device(&self, _: &Self) -> bool;

    /// Creates a storage for the given shape and dtype.
    // NOTE(review): zero-filled, going by the name; confirm against the implementations.
    fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;

    /// # Safety
    /// This function is unsafe as it doesn't initialize the underlying data store.
    /// The caller should ensure that the data is properly initialized as early as possible
    /// after this call.
    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;

    /// Creates a storage from a slice of any `WithDType` element type.
    fn storage_from_slice<T: crate::WithDType>(&self, _: &[T]) -> Result<Self::Storage>;

    /// Creates a storage from a borrowed `CpuStorage`.
    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage>;

    /// Creates a storage from a `CpuStorage` taken by value.
    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage>;

    // NOTE(review): the two `f64` arguments are presumably the lower and upper bounds; confirm.
    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;

    // NOTE(review): the two `f64` arguments are presumably the mean and the standard
    // deviation; confirm.
    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;

    /// Sets the seed of the device's random number generator.
    fn set_seed(&self, _: u64) -> Result<()>;
    /// Returns the seed currently in use.
    fn get_current_seed(&self) -> Result<u64>;

    /// Synchronize should block until all the operations on the device are completed.
    fn synchronize(&self) -> Result<()>;
}


================================================
FILE: candle-core/src/backprop.rs
================================================
//! Methods for backpropagation of gradients.
use crate::op::{BinaryOp, Op, ReduceOp, UnaryOp};
use crate::{Error, Result, Tensor, TensorId};
use std::collections::HashMap;

// `node` is the result of reducing `arg`; this broadcasts `node` back to the shape of `arg`.
// Both the keepdim and the non-keepdim reductions are handled: when the ranks differ, `node`
// is first reshaped to `reduced_dims` and then broadcast.
fn broadcast_back(arg: &Tensor, node: &Tensor, reduced_dims: &[usize]) -> Result<Tensor> {
    let target_shape = arg.shape();
    if arg.rank() != node.rank() {
        // keepdim = false: restore the reduced dims before broadcasting.
        let with_reduced_dims = node.reshape(reduced_dims)?;
        return with_reduced_dims.broadcast_as(target_shape);
    }
    // keepdim = true: the ranks already agree.
    node.broadcast_as(target_shape)
}

thread_local! {
    // True when the `CANDLE_GRAD_DO_NOT_DETACH` environment variable is set to anything other
    // than the empty string or "0". The variable is read once per thread, on first access.
    static CANDLE_GRAD_DO_NOT_DETACH: bool = std::env::var("CANDLE_GRAD_DO_NOT_DETACH")
        .map(|value| !(value.is_empty() || value == "0"))
        .unwrap_or(false);
}

impl Tensor {
    /// Return all the nodes that lead to this value in a topologically sorted vec, the first
    /// elements having dependencies on the latter ones, e.g. the first element if any is the
    /// argument.
    /// This assumes that the op graph is a DAG.
    ///
    /// Only nodes that track gradients are returned: a node is kept when it is a variable or
    /// when at least one of the inputs visited for it tracks gradients.
    pub fn sorted_nodes(&self) -> Vec<&Tensor> {
        // The vec of sorted nodes is passed as an owned value rather than a mutable reference
        // to get around some lifetime limitations.
        //
        // `walk` is a depth-first traversal. It returns whether `node` tracks gradients together
        // with the accumulated vec, and memoises that boolean per tensor id in `already_seen` so
        // that each node is processed at most once.
        fn walk<'a>(
            node: &'a Tensor,
            nodes: Vec<&'a Tensor>,
            already_seen: &mut HashMap<TensorId, bool>,
        ) -> (bool, Vec<&'a Tensor>) {
            // Already visited: reuse the recorded answer and leave the vec untouched.
            if let Some(&tg) = already_seen.get(&node.id()) {
                return (tg, nodes);
            }
            let mut track_grad = false;
            let mut nodes = if node.is_variable() {
                // Do not call recursively on the "leaf" nodes.
                track_grad = true;
                nodes
            } else if node.dtype().is_int() {
                // Integer tensors are not traversed and never track gradients.
                nodes
            } else if let Some(op) = node.op() {
                match op {
                    // Ops with three tensor inputs: visit all three.
                    Op::IndexAdd(t1, t2, t3, _)
                    | Op::Scatter(t1, t2, t3, _)
                    | Op::ScatterAdd(t1, t2, t3, _)
                    | Op::CustomOp3(t1, t2, t3, _)
                    | Op::WhereCond(t1, t2, t3) => {
                        let (tg, nodes) = walk(t1, nodes, already_seen);
                        track_grad |= tg;
                        let (tg, nodes) = walk(t2, nodes, already_seen);
                        track_grad |= tg;
                        let (tg, nodes) = walk(t3, nodes, already_seen);
                        track_grad |= tg;
                        nodes
                    }
                    // Ops with two tensor inputs: visit both.
                    Op::Conv1D {
                        arg: lhs,
                        kernel: rhs,
                        ..
                    }
                    | Op::ConvTranspose1D {
                        arg: lhs,
                        kernel: rhs,
                        ..
                    }
                    | Op::Conv2D {
                        arg: lhs,
                        kernel: rhs,
                        ..
                    }
                    | Op::ConvTranspose2D {
                        arg: lhs,
                        kernel: rhs,
                        ..
                    }
                    | Op::CustomOp2(lhs, rhs, _)
                    | Op::Binary(lhs, rhs, _)
                    | Op::Gather(lhs, rhs, _)
                    | Op::IndexSelect(lhs, rhs, _)
                    | Op::Matmul(lhs, rhs)
                    | Op::SliceScatter0(lhs, rhs, _) => {
                        let (tg, nodes) = walk(lhs, nodes, already_seen);
                        track_grad |= tg;
                        let (tg, nodes) = walk(rhs, nodes, already_seen);
                        track_grad |= tg;
                        nodes
                    }
                    // Concatenation: visit every argument in order.
                    Op::Cat(args, _) => args.iter().fold(nodes, |nodes, arg| {
                        let (tg, nodes) = walk(arg, nodes, already_seen);
                        track_grad |= tg;
                        nodes
                    }),
                    Op::Affine { arg, mul, .. } => {
                        // With a zero multiplier the argument is not visited at all.
                        if *mul == 0. {
                            nodes
                        } else {
                            let (tg, nodes) = walk(arg, nodes, already_seen);
                            track_grad |= tg;
                            nodes
                        }
                    }
                    // These unary ops stop the traversal. The arms must stay ahead of the
                    // catch-all `Op::Unary(node, _)` pattern below.
                    Op::Unary(_node, UnaryOp::Ceil)
                    | Op::Unary(_node, UnaryOp::Floor)
                    | Op::Unary(_node, UnaryOp::Round)
                    | Op::Unary(_node, UnaryOp::Sign) => nodes,
                    // Ops with a single tensor input: visit it.
                    Op::Reshape(node)
                    | Op::UpsampleNearest1D { arg: node, .. }
                    | Op::UpsampleNearest2D { arg: node, .. }
                    | Op::UpsampleBilinear2D { arg: node, .. }
                    | Op::AvgPool2D { arg: node, .. }
                    | Op::MaxPool2D { arg: node, .. }
                    | Op::Copy(node)
                    | Op::Broadcast(node)
                    | Op::Cmp(node, _)
                    | Op::Reduce(node, ReduceOp::Min | ReduceOp::Sum | ReduceOp::Max, _)
                    | Op::ToDevice(node)
                    | Op::Transpose(node, _, _)
                    | Op::Permute(node, _)
                    | Op::Narrow(node, _, _, _)
                    | Op::Unary(node, _)
                    | Op::Elu(node, _)
                    | Op::Powf(node, _)
                    | Op::CustomOp1(node, _) => {
                        let (tg, nodes) = walk(node, nodes, already_seen);
                        track_grad |= tg;
                        nodes
                    }
                    Op::ToDType(node) => {
                        // Only follow the conversion when the source tensor is a float.
                        if node.dtype().is_float() {
                            let (tg, nodes) = walk(node, nodes, already_seen);
                            track_grad |= tg;
                            nodes
                        } else {
                            nodes
                        }
                    }
                    // Arg-reductions stop the traversal.
                    Op::Reduce(_, ReduceOp::ArgMin | ReduceOp::ArgMax, _) => nodes,
                }
            } else {
                // No op recorded for this node: nothing to traverse.
                nodes
            };
            already_seen.insert(node.id(), track_grad);
            // Post-order push: a node is appended after the inputs visited for it.
            if track_grad {
                nodes.push(node);
            }
            (track_grad, nodes)
        }
        let (_tg, mut nodes) = walk(self, vec![], &mut HashMap::new());
        // Reverse the post-order so that `self`, if it was pushed, comes first.
        nodes.reverse();
        nodes
    }

    pub fn backward(&self) -> Result<GradStore> {
        let sorted_nodes = self.sorted_nodes();
        let mut grads = GradStore::new();
        grads.insert(self, self.ones_like()?.contiguous()?);
        for node in sorted_nodes.iter() {
            if node.is_variable() {
                continue;
            }
            let grad = grads
                .remove(node)
                .expect("candle internal error - grad not populated");
            // https://github.com/huggingface/candle/issues/1241
            // Ideally, we would make these operations in place where possible to ensure that we
            // do not have to allocate too often. Here we just call `.detach` to avoid computing
            // the backprop graph of the backprop itself. This would be an issue for second order
            // derivatives but these are out of scope at the moment.
            let do_not_detach = CANDLE_GRAD_DO_NOT_DETACH.with(|b| *b);
            let grad = if do_not_detach { grad } else { grad.detach() };
            if let Some(op) = node.op() {
                match op {
                    Op::Binary(lhs, rhs, BinaryOp::Add) => {
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&grad)?;
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.add(&grad)?;
                    }
                    Op::Binary(lhs, rhs, BinaryOp::Sub) => {
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&grad)?;
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.sub(&grad)?;
                    }
                    Op::Binary(lhs, rhs, BinaryOp::Mul) => {
                        let lhs_grad = grad.mul(rhs)?;
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
                        let rhs_grad = grad.mul(lhs)?;
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
                    }
                    Op::Binary(lhs, rhs, BinaryOp::Div) => {
                        let lhs_grad = grad.div(rhs)?;
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
                        let rhs_grad = grad.mul(lhs)?.div(&rhs.sqr()?)?;
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.sub(&rhs_grad)?;
                    }
                    Op::Binary(lhs, rhs, BinaryOp::Minimum)
                    | Op::Binary(lhs, rhs, BinaryOp::Maximum) => {
                        let mask_lhs = node.eq(lhs)?.to_dtype(grad.dtype())?;
                        let mask_rhs = node.eq(rhs)?.to_dtype(grad.dtype())?;

                        // If both masks are 1 on the same point, we want to scale the
                        // gradient by 0.5 rather than 1.
                        let lhs_grad = mask_lhs.mul(&grad)?.div(&(&mask_rhs + 1.)?)?;
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;

                        let rhs_grad = mask_rhs.mul(&grad)?.div(&(&mask_lhs + 1.)?)?;
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
                    }
                    Op::WhereCond(pred, t, f) => {
                        let zeros = grad.zeros_like()?;
                        let t_sum_grad = grads.or_insert(t)?;
                        let t_grad = pred.where_cond(&grad, &zeros)?;
                        *t_sum_grad = t_sum_grad.add(&t_grad)?;
                        let f_sum_grad = grads.or_insert(f)?;
                        let f_grad = pred.where_cond(&zeros, &grad)?;
                        *f_sum_grad = f_sum_grad.add(&f_grad)?;
                    }
                    Op::Conv1D {
                        arg,
                        kernel,
                        padding,
                        stride,
                        dilation,
                    } => {
                        // The output height for conv_transpose1d is:
                        // (l_in - 1) * stride - 2 * padding + dilation * (k_size - 1) + out_padding + 1
                        let grad_l_in = grad.dim(2)?;
                        let k_size = kernel.dim(2)?;
                        let out_size =
                            (grad_l_in - 1) * stride + dilation * (k_size - 1) + 1 - 2 * padding;
                        let out_padding = arg.dim(2)? - out_size;
                        let grad_arg = grad.conv_transpose1d(
                            kernel,
                            *padding,
                            out_padding,
                            *stride,
                            *dilation,
                            /* groups */ 1,
                        )?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad_arg)?;

                        let grad_kernel = arg
                            .transpose(0, 1)?
                            .conv1d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
                            .transpose(0, 1)?;
                        let sum_grad = grads.or_insert(kernel)?;
                        let (_, _, k0) = kernel.dims3()?;
                        let (_, _, g_k0) = grad_kernel.dims3()?;
                        let grad_kernel = if g_k0 != k0 {
                            grad_kernel.narrow(2, 0, k0)?
                        } else {
                            grad_kernel
                        };
                        *sum_grad = sum_grad.add(&grad_kernel)?;
                    }
                    // Backward pass for a 2D convolution: accumulates a gradient for the input
                    // (`arg`) and for the `kernel` into `grads`.
                    Op::Conv2D {
                        arg,
                        kernel,
                        padding,
                        stride,
                        dilation,
                    } => {
                        // Input gradient: a transposed convolution of `grad` with the same kernel.
                        // The output height for conv_transpose2d is:
                        // (i_h - 1) * stride - 2 * padding + dilation * (k_h - 1) + out_padding + 1
                        let grad_h = grad.dim(2)?;
                        let k_h = kernel.dim(2)?;
                        let out_size =
                            (grad_h - 1) * stride + dilation * (k_h - 1) + 1 - 2 * padding;
                        // `out_padding` is whatever is missing for the transposed convolution to
                        // reach the size of `arg` along dim 2.
                        // NOTE(review): only dim 2 is inspected here; presumably the same value
                        // is also right for dim 3 — confirm for non-square inputs/kernels.
                        let out_padding = arg.dim(2)? - out_size;
                        let grad_arg = grad.conv_transpose2d(
                            kernel,
                            *padding,
                            out_padding,
                            *stride,
                            *dilation,
                        )?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad_arg)?;

                        // Kernel gradient: convolve the input with `grad`, both with their first
                        // two dims swapped, then swap the result's first two dims back.
                        // NOTE(review): `*dilation` is passed before `*stride` here, the reverse
                        // of the order used when calling conv2d elsewhere in this match —
                        // presumably intentional for the kernel gradient; confirm against the
                        // `conv2d` signature.
                        let grad_kernel = arg
                            .transpose(0, 1)?
                            .conv2d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
                            .transpose(0, 1)?;
                        let sum_grad = grads.or_insert(kernel)?;
                        let (_, _, k0, k1) = kernel.dims4()?;
                        let (_, _, g_k0, g_k1) = grad_kernel.dims4()?;
                        // When the computed gradient's spatial dims differ from the kernel's,
                        // keep only the leading k0 x k1 window so the shapes match.
                        let grad_kernel = if g_k0 != k0 || g_k1 != k1 {
                            grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
                        } else {
                            grad_kernel
                        };
                        *sum_grad = sum_grad.add(&grad_kernel)?;
                    }
                    // No gradient is implemented for conv_transpose1d: reaching this op during
                    // backpropagation returns a `BackwardNotSupported` error.
                    Op::ConvTranspose1D { .. } => Err(Error::BackwardNotSupported {
                        op: "conv-transpose1d",
                    })?,
                    // Backward pass for a 2D transposed convolution: accumulates a gradient for
                    // the input (`arg`) and for the `kernel` into `grads`.
                    Op::ConvTranspose2D {
                        arg,
                        kernel,
                        padding,
                        stride,
                        dilation,
                        // Bound but not used by the gradient computation below.
                        output_padding: _output_padding,
                    } => {
                        // Input gradient: a regular conv2d of `grad` with the same kernel.
                        let grad_arg = grad.conv2d(kernel, *padding, *stride, *dilation, 1)?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad_arg)?;

                        // Kernel gradient: convolve `grad` with the input, both with their first
                        // two dims swapped, then swap the result's first two dims back.
                        // NOTE(review): `*dilation` and `*stride` are passed in the reverse order
                        // of the `grad_arg` call above — presumably intentional; confirm against
                        // the `conv2d` signature.
                        let grad_kernel = grad
                            .transpose(0, 1)?
                            .conv2d(&arg.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
                            .transpose(0, 1)?;
                        let sum_grad = grads.or_insert(kernel)?;
                        let (_, _, k0, k1) = kernel.dims4()?;
                        let (_, _, g_k0, g_k1) = grad_kernel.dims4()?;
                        // When the computed gradient's spatial dims differ from the kernel's,
                        // keep only the leading k0 x k1 window so the shapes match.
                        let grad_kernel = if g_k0 != k0 || g_k1 != k1 {
                            grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
                        } else {
                            grad_kernel
                        };
                        *sum_grad = sum_grad.add(&grad_kernel)?;
                    }
                    // Gradient of a 2D average pool. Only handled when the pooling windows do
                    // not overlap, i.e. when the kernel size equals the stride.
                    Op::AvgPool2D {
                        arg,
                        kernel_size,
                        stride,
                    } => {
                        if kernel_size != stride {
                            crate::bail!("backward not supported for avgpool2d if ksize {kernel_size:?} != stride {stride:?}")
                        }
                        let (_n, _c, h, w) = arg.dims4()?;
                        // Every input element of a window gets an equal share of that window's
                        // output gradient: bring `grad` back to the input resolution, then
                        // divide by the number of elements per window.
                        let window_elems = (kernel_size.0 * kernel_size.1) as f64;
                        let upsampled = grad.upsample_nearest2d(h, w)?;
                        let grad_arg = (upsampled * (1f64 / window_elems))?;
                        let acc = grads.or_insert(arg)?;
                        *acc = acc.add(&grad_arg)?;
                    }
                    // Gradient of a 2D max pool. Only handled when the kernel size equals the
                    // stride; any other configuration bails out with an error.
                    Op::MaxPool2D {
                        arg,
                        kernel_size,
                        stride,
                    } => {
                        if kernel_size != stride {
                            crate::bail!("backward not supported for maxpool2d if ksize {kernel_size:?} != stride {stride:?}")
                        }
                        let (_n, _c, h, w) = arg.dims4()?;
                        // For computing the max-pool gradient, we compute a mask where a 1 means
                        // that the element is the maximum, then we apply this mask to the
                        // upsampled gradient (taking into account that multiple max may exist so
                        // we scale the gradient for this case).
                        // `node` is upsampled to the input resolution so it can be compared
                        // element-wise with `arg` (presumably `node` holds this op's forward
                        // output — confirm against the enclosing loop).
                        let node_upsampled = node.upsample_nearest2d(h, w)?;
                        let mask = arg.eq(&node_upsampled)?.to_dtype(arg.dtype())?;
                        // Average of the mask over each pooling window.
                        let avg = mask.avg_pool2d_with_stride(*kernel_size, *stride)?;
                        // Scale `grad` by `avg`, upsample it, and keep only the masked positions.
                        let grad_arg = ((grad * avg)?.upsample_nearest2d(h, w)? * mask)?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad_arg)?;
                    }
                    // Gradient of a 1D nearest-neighbour upsampling. Only integer upscaling
                    // factors are supported; anything else bails out with an error.
                    Op::UpsampleNearest1D { arg, target_size } => {
                        let (_n, c, size) = arg.dims3()?;
                        if target_size % size != 0 {
                            crate::bail!("backward not supported for non integer upscaling factors")
                        }
                        let scale = target_size / size;

                        // Sum the output gradient over each group of `scale` positions using an
                        // all-ones kernel of width `scale`, applied with `scale` as the stride
                        // and `c` groups.
                        let kernel = Tensor::ones((c, 1, scale), arg.dtype(), arg.device())?;
                        let conv_sum = grad.conv1d(&kernel, 0, scale, 1, c)?;
                        let sum_grad = grads.or_insert(arg)?;
                        // Accumulate rather than overwrite: `arg` may also feed other ops whose
                        // gradient contributions are already stored in `grads`.
                        *sum_grad = sum_grad.add(&conv_sum)?;
                    }
                    // Gradient of a 2D nearest-neighbour upsampling. Only integer upscaling
                    // factors that are identical along both spatial dims are supported.
                    Op::UpsampleNearest2D {
                        arg,
                        target_h,
                        target_w,
                    } => {
                        let (_n, c, h, w) = arg.dims4()?;
                        if target_h % h != 0 || target_w % w != 0 {
                            crate::bail!("backward not supported for non integer upscaling factors")
                        }
                        let scale_h = target_h / h;
                        let scale_w = target_w / w;

                        // A single stride value is passed to conv2d below, hence the requirement
                        // that both factors match.
                        if scale_h != scale_w {
                            crate::bail!("backward not supported for non uniform upscaling factors")
                        };
                        // Sum the output gradient over each `scale_h x scale_w` patch using an
                        // all-ones kernel of that size, applied with `scale_h` as the stride and
                        // `c` groups.
                        let kernel =
                            Tensor::ones((c, 1, scale_h, scale_w), arg.dtype(), arg.device())?;
                        let conv_sum = grad.conv2d(&kernel, 0, scale_h, 1, c)?;
                        let sum_grad = grads.or_insert(arg)?;
                        // Accumulate rather than overwrite: `arg` may also feed other ops whose
                        // gradient contributions are already stored in `grads`.
                        *sum_grad = sum_grad.add(&conv_sum)?;
                    }
                    // No gradient is implemented for bilinear upsampling: reaching this op
                    // during backpropagation bails out with an error.
                    Op::UpsampleBilinear2D { .. } => {
                        crate::bail!("backward not supported for upsample_bilinear2d")
                    }
                    // Gradient of a slice-scatter along dim 0: `rhs` gets the slice of `grad`
                    // it was written to, `lhs` gets `grad` with that slice zeroed out.
                    Op::SliceScatter0(lhs, rhs, start_rhs) => {
                        let rhs_acc = grads.or_insert(rhs)?;
                        let rhs_len = rhs.dim(0)?;
                        let grad_for_rhs = grad.narrow(0, *start_rhs, rhs_len)?;
                        *rhs_acc = rhs_acc.add(&grad_for_rhs)?;

                        let lhs_acc = grads.or_insert(lhs)?;
                        let zeros = rhs.zeros_like()?;
                        let grad_for_lhs = grad.slice_scatter0(&zeros, *start_rhs)?;
                        *lhs_acc = lhs_acc.add(&grad_for_lhs)?;
                    }
                    // Gradient of a gather: the incoming gradient is sent back to the gathered
                    // positions of `arg` via `scatter_add` on the accumulated gradient.
                    Op::Gather(arg, indexes, dim) => {
                        let acc = grads.or_insert(arg)?;
                        let updated = acc.scatter_add(indexes, &grad, *dim)?;
                        *acc = updated;
                    }
                    // Gradient of an overwriting scatter: the entries of `init` selected by
                    // `indexes` are replaced by values from `src`, so `init` only receives
                    // gradient at the positions that were NOT overwritten.
                    Op::Scatter(init, indexes, src, dim) => {
                        let init_sum_grad = grads.or_insert(init)?;
                        // Mask with ones everywhere except at the scattered positions. The zero
                        // source is shaped like `src` so that it lines up with `indexes`.
                        let mask = init.ones_like()?;
                        let mask = mask.scatter(indexes, &src.zeros_like()?, *dim)?;
                        *init_sum_grad = init_sum_grad.add(&grad.mul(&mask)?)?;

                        // `src` receives the gradient found at the positions it was written to.
                        let src_grad = grad.gather(indexes, *dim)?;
                        let src_sum_grad = grads.or_insert(src)?;
                        *src_sum_grad = src_sum_grad.add(&src_grad)?;
                    }
                    // Gradient of an accumulating scatter: `src` values are added on top of
                    // `init`, so `init` receives the full incoming gradient.
                    Op::ScatterAdd(init, indexes, src, dim) => {
                        let init_sum_grad = grads.or_insert(init)?;
                        *init_sum_grad = init_sum_grad.add(&grad)?;

                        // `src` receives the gradient found at the positions it was added to.
                        let src_grad = grad.gather(indexes, *dim)?;
                        let src_sum_grad = grads.or_insert(src)?;
                        *src_sum_grad = src_sum_grad.add(&src_grad)?;
                    }
                    // Gradient of an index-add: `init` gets the incoming gradient unchanged and
                    // `src` gets the entries of `grad` selected by `indexes` along `dim`.
                    Op::IndexAdd(init, indexes, src, dim) => {
                        let init_acc = grads.or_insert(init)?;
                        let init_total = init_acc.add(&grad)?;
                        *init_acc = init_total;

                        let selected = grad.index_select(indexes, *dim)?;
                        let src_acc = grads.or_insert(src)?;
                        let src_total = src_acc.add(&selected)?;
                        *src_acc = src_total;
                    }
                    Op::IndexSelect(arg, indexes, dim) => {

Download .txt

gitextract_tey_ubja/

├── .cargo/
│   └── config.toml
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── ci_cuda.yaml
│       ├── maturin.yml
│       ├── python.yml
│       ├── rust-ci.yml
│       └── trufflehog.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── Makefile
├── README.md
├── candle-book/
│   ├── .gitignore
│   ├── CONTRIBUTING.md
│   ├── Cargo.toml
│   ├── book.toml
│   └── src/
│       ├── README.md
│       ├── SUMMARY.md
│       ├── advanced/
│       │   └── mkl.md
│       ├── apps/
│       │   ├── README.md
│       │   ├── desktop.md
│       │   ├── rest.md
│       │   └── wasm.md
│       ├── chapter_1.md
│       ├── cuda/
│       │   ├── README.md
│       │   ├── porting.md
│       │   └── writing.md
│       ├── error_manage.md
│       ├── guide/
│       │   ├── cheatsheet.md
│       │   ├── hello_world.md
│       │   ├── installation.md
│       │   └── mnist/
│       │       ├── intro.md
│       │       ├── modeling.md
│       │       ├── saving_loading.md
│       │       └── training.md
│       ├── inference/
│       │   ├── cuda/
│       │   │   ├── README.md
│       │   │   ├── porting.md
│       │   │   └── writing.md
│       │   ├── hub.md
│       │   └── inference.md
│       ├── lib.rs
│       ├── simplified.rs
│       ├── tracing.md
│       └── training/
│           ├── finetuning.md
│           ├── mnist.md
│           ├── serialization.md
│           ├── simplified.md
│           └── training.md
├── candle-core/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── README.md
│   ├── benches/
│   │   ├── bench_main.rs
│   │   └── benchmarks/
│   │       ├── affine.rs
│   │       ├── binary.rs
│   │       ├── broadcast.rs
│   │       ├── conv_transpose2d.rs
│   │       ├── copy.rs
│   │       ├── matmul.rs
│   │       ├── mod.rs
│   │       ├── qmatmul.rs
│   │       ├── random.rs
│   │       ├── reduce.rs
│   │       ├── unary.rs
│   │       └── where_cond.rs
│   ├── examples/
│   │   ├── basics.rs
│   │   ├── cuda_basics.rs
│   │   ├── cuda_sum_benchmark.rs
│   │   └── metal_basics.rs
│   ├── src/
│   │   ├── accelerate.rs
│   │   ├── backend.rs
│   │   ├── backprop.rs
│   │   ├── conv.rs
│   │   ├── convert.rs
│   │   ├── cpu/
│   │   │   ├── avx.rs
│   │   │   ├── erf.rs
│   │   │   ├── kernels.rs
│   │   │   ├── mod.rs
│   │   │   ├── neon.rs
│   │   │   └── simd128.rs
│   │   ├── cpu_backend/
│   │   │   ├── conv2d.rs
│   │   │   ├── mod.rs
│   │   │   └── utils.rs
│   │   ├── cuda_backend/
│   │   │   ├── cudnn.rs
│   │   │   ├── device.rs
│   │   │   ├── error.rs
│   │   │   ├── mod.rs
│   │   │   └── utils.rs
│   │   ├── custom_op.rs
│   │   ├── device.rs
│   │   ├── display.rs
│   │   ├── dtype.rs
│   │   ├── dummy_cuda_backend.rs
│   │   ├── dummy_dtype.rs
│   │   ├── dummy_metal_backend.rs
│   │   ├── error.rs
│   │   ├── indexer.rs
│   │   ├── layout.rs
│   │   ├── lib.rs
│   │   ├── metal_backend/
│   │   │   ├── device.rs
│   │   │   └── mod.rs
│   │   ├── mkl.rs
│   │   ├── npy.rs
│   │   ├── op.rs
│   │   ├── pickle.rs
│   │   ├── quantized/
│   │   │   ├── avx.rs
│   │   │   ├── cuda.rs
│   │   │   ├── dummy_cuda.rs
│   │   │   ├── dummy_metal.rs
│   │   │   ├── ggml_file.rs
│   │   │   ├── gguf_file.rs
│   │   │   ├── imatrix_file.rs
│   │   │   ├── k_quants.rs
│   │   │   ├── metal.rs
│   │   │   ├── mod.rs
│   │   │   ├── neon.rs
│   │   │   ├── simd128.rs
│   │   │   ├── tokenizer.rs
│   │   │   └── utils.rs
│   │   ├── safetensors.rs
│   │   ├── scalar.rs
│   │   ├── shape.rs
│   │   ├── sort.rs
│   │   ├── storage.rs
│   │   ├── streaming.rs
│   │   ├── strided_index.rs
│   │   ├── tensor.rs
│   │   ├── tensor_cat.rs
│   │   ├── test_utils.rs
│   │   ├── utils.rs
│   │   └── variable.rs
│   └── tests/
│       ├── bilinear_tests.rs
│       ├── conv_tests.rs
│       ├── custom_op_tests.rs
│       ├── display_tests.rs
│       ├── fortran_tensor_3d.pth
│       ├── grad_tests.rs
│       ├── indexing_tests.rs
│       ├── layout_tests.rs
│       ├── matmul_tests.rs
│       ├── npy.py
│       ├── pool_tests.rs
│       ├── pth.py
│       ├── pth_tests.rs
│       ├── quantized_tests.rs
│       ├── serialization_tests.rs
│       ├── tensor_tests.rs
│       ├── test.npy
│       ├── test.pt
│       └── test_with_key.pt
├── candle-datasets/
│   ├── Cargo.toml
│   ├── README.md
│   └── src/
│       ├── batcher.rs
│       ├── hub.rs
│       ├── lib.rs
│       ├── nlp/
│       │   ├── mod.rs
│       │   └── tinystories.rs
│       └── vision/
│           ├── cifar.rs
│           ├── fashion_mnist.rs
│           ├── mnist.rs
│           └── mod.rs
├── candle-examples/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   ├── buildtime_downloader.rs
│   ├── examples/
│   │   ├── based/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── beit/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── bert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── bert_single_file_binary/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── bigcode/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── blip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── chatglm/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── chinese_clip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── clip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── codegeex4-9b/
│   │   │   ├── README.org
│   │   │   └── main.rs
│   │   ├── colpali/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── convmixer/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── convnext/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── csm/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── voices.safetensors
│   │   ├── custom-ops/
│   │   │   ├── README.md
│   │   │   ├── cuda_kernels.rs
│   │   │   ├── kernels/
│   │   │   │   ├── layernorm_kernels.cu
│   │   │   │   └── reduction_utils.cuh
│   │   │   └── main.rs
│   │   ├── debertav2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── deepseekv2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── depth_anything_v2/
│   │   │   ├── README.md
│   │   │   ├── color_map.rs
│   │   │   └── main.rs
│   │   ├── dinov2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── dinov2reg4/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── distilbert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── efficientnet/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── efficientvit/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── encodec/
│   │   │   ├── README.md
│   │   │   ├── audio_io.rs
│   │   │   ├── jfk-codes.safetensors
│   │   │   └── main.rs
│   │   ├── eva2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── falcon/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── fastvit/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── flux/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── t5_tokenizer.py
│   │   ├── gemma/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── gguf-tokenizer.rs
│   │   ├── glm4/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── granite/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── granitemoehybrid/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── gte-qwen/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── helium/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── hiera/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── jina-bert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── llama/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── llama2-c/
│   │   │   ├── main.rs
│   │   │   └── training.rs
│   │   ├── llama_multiprocess/
│   │   │   ├── main.rs
│   │   │   └── model.rs
│   │   ├── llava/
│   │   │   ├── constants.rs
│   │   │   ├── conversation.rs
│   │   │   ├── image_processor.rs
│   │   │   ├── main.rs
│   │   │   └── readme.md
│   │   ├── mamba/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mamba-minimal/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── model.rs
│   │   ├── mamba2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── marian-mt/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── python/
│   │   │       ├── convert_slow_tokenizer.py
│   │   │       └── requirements.txt
│   │   ├── metavoice/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mimi/
│   │   │   ├── README.md
│   │   │   ├── audio_io.rs
│   │   │   └── main.rs
│   │   ├── mistral/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mixtral/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mnist-training/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mobileclip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mobilenetv4/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── mobileone/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── modernbert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── moondream/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── musicgen/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── musicgen_model.rs
│   │   ├── nomic-bert/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── nvembed_v2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── olmo/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── onnx/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── onnx-llm/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── onnx_basics.rs
│   │   ├── orpheus/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── paddleocr-vl/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── paligemma/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── parler-tts/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── phi/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── pixtral/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-gemma/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-glm4/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-lfm2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-phi/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-qwen2-instruct/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-qwen3/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-qwen3-moe/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── quantized-t5/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── qwen/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── recurrent-gemma/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── reinforcement-learning/
│   │   │   ├── README.md
│   │   │   ├── atari_wrappers.py
│   │   │   ├── ddpg.rs
│   │   │   ├── dqn.rs
│   │   │   ├── gym_env.rs
│   │   │   ├── main.rs
│   │   │   ├── policy_gradient.rs
│   │   │   └── vec_gym_env.rs
│   │   ├── replit-code/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── repvgg/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── resnet/
│   │   │   ├── README.md
│   │   │   ├── export_models.py
│   │   │   └── main.rs
│   │   ├── rwkv/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── segformer/
│   │   │   ├── README.md
│   │   │   ├── assets/
│   │   │   │   └── labels.json
│   │   │   └── main.rs
│   │   ├── segment-anything/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── siglip/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── silero-vad/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── smollm3/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── snac/
│   │   │   ├── audio_io.rs
│   │   │   └── main.rs
│   │   ├── splade/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── stable-diffusion/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── stable-diffusion-3/
│   │   │   ├── README.md
│   │   │   ├── clip.rs
│   │   │   ├── main.rs
│   │   │   ├── sampling.rs
│   │   │   └── vae.rs
│   │   ├── stable-lm/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── starcoder2/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── stella-en-v5/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── t5/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── trocr/
│   │   │   ├── image_processor.rs
│   │   │   ├── main.rs
│   │   │   └── readme.md
│   │   ├── vgg/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── vit/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── voxtral/
│   │   │   ├── README.md
│   │   │   ├── download.rs
│   │   │   ├── main.rs
│   │   │   ├── melfilters128.bytes
│   │   │   └── model.rs
│   │   ├── whisper/
│   │   │   ├── README.md
│   │   │   ├── extract_weights.py
│   │   │   ├── main.rs
│   │   │   ├── melfilters.bytes
│   │   │   ├── melfilters128.bytes
│   │   │   └── multilingual.rs
│   │   ├── whisper-microphone/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── multilingual.rs
│   │   ├── wuerstchen/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── xlm-roberta/
│   │   │   ├── Readme.md
│   │   │   └── main.rs
│   │   ├── yi/
│   │   │   ├── README.md
│   │   │   └── main.rs
│   │   ├── yolo-v3/
│   │   │   ├── README.md
│   │   │   ├── darknet.rs
│   │   │   ├── extract-weights.py
│   │   │   ├── main.rs
│   │   │   └── yolo-v3.cfg
│   │   ├── yolo-v8/
│   │   │   ├── README.md
│   │   │   ├── main.rs
│   │   │   └── model.rs
│   │   └── z_image/
│   │       ├── README.md
│   │       └── main.rs
│   └── src/
│       ├── audio.rs
│       ├── bs1770.rs
│       ├── chat_template.rs
│       ├── coco_classes.rs
│       ├── imagenet.rs
│       ├── lib.rs
│       ├── token_output_stream.rs
│       └── wav.rs
├── candle-flash-attn/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   ├── kernels/
│   │   ├── alibi.h
│   │   ├── block_info.h
│   │   ├── dropout.h
│   │   ├── error.h
│   │   ├── flash.h
│   │   ├── flash_api.cu
│   │   ├── flash_fwd_hdim128_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim128_bf16_sm80.cu
│   │   ├── flash_fwd_hdim128_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim128_fp16_sm80.cu
│   │   ├── flash_fwd_hdim160_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim160_bf16_sm80.cu
│   │   ├── flash_fwd_hdim160_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim160_fp16_sm80.cu
│   │   ├── flash_fwd_hdim192_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim192_bf16_sm80.cu
│   │   ├── flash_fwd_hdim192_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim192_fp16_sm80.cu
│   │   ├── flash_fwd_hdim224_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim224_bf16_sm80.cu
│   │   ├── flash_fwd_hdim224_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim224_fp16_sm80.cu
│   │   ├── flash_fwd_hdim256_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim256_bf16_sm80.cu
│   │   ├── flash_fwd_hdim256_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim256_fp16_sm80.cu
│   │   ├── flash_fwd_hdim32_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim32_bf16_sm80.cu
│   │   ├── flash_fwd_hdim32_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim32_fp16_sm80.cu
│   │   ├── flash_fwd_hdim64_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim64_bf16_sm80.cu
│   │   ├── flash_fwd_hdim64_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim64_fp16_sm80.cu
│   │   ├── flash_fwd_hdim96_bf16_causal_sm80.cu
│   │   ├── flash_fwd_hdim96_bf16_sm80.cu
│   │   ├── flash_fwd_hdim96_fp16_causal_sm80.cu
│   │   ├── flash_fwd_hdim96_fp16_sm80.cu
│   │   ├── flash_fwd_kernel.h
│   │   ├── flash_fwd_launch_template.h
│   │   ├── hardware_info.h
│   │   ├── kernel_helpers.h
│   │   ├── kernel_traits.h
│   │   ├── kernel_traits_sm90.h
│   │   ├── kernels.h
│   │   ├── mask.h
│   │   ├── philox.cuh
│   │   ├── rotary.h
│   │   ├── softmax.h
│   │   ├── static_switch.h
│   │   └── utils.h
│   ├── src/
│   │   ├── ffi.rs
│   │   └── lib.rs
│   └── tests/
│       └── flash_attn_tests.rs
├── candle-flash-attn-v3/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   ├── hkernel/
│   │   ├── combine.h
│   │   ├── copy_paged_sm90_tma.hpp
│   │   ├── copy_paged_sm90_tma_cutlass35.hpp
│   │   ├── copy_paged_sm90_tma_cutlass36.hpp
│   │   ├── epilogue_fwd_sm90_tma.hpp
│   │   ├── flash.h
│   │   ├── flash_api.cpp
│   │   ├── flash_api.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim128_bf16_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim128_e4m3_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim128_fp16_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim256_bf16_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim256_e4m3_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim256_fp16_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim64_bf16_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim64_e4m3_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa16_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa2_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa32_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa4_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_gqa8_sm90.cu
│   │   ├── flash_fwd_hdim64_fp16_sm90.cu
│   │   ├── flash_fwd_kernel.h
│   │   ├── flash_fwd_launch_template.h
│   │   ├── kernel_traits.h
│   │   ├── mainloop_fwd_sm90_tma_gmma_ws.hpp
│   │   ├── named_barrier.hpp
│   │   ├── seq_len.h
│   │   ├── softmax.h
│   │   ├── static_switch.h
│   │   ├── tile_scheduler.hpp
│   │   └── utils.h
│   ├── src/
│   │   ├── ffi.rs
│   │   └── lib.rs
│   └── tests/
│       └── flash_attn_tests.rs
├── candle-kernels/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   └── src/
│       ├── affine.cu
│       ├── binary.cu
│       ├── binary_op_macros.cuh
│       ├── cast.cu
│       ├── compatibility.cuh
│       ├── conv.cu
│       ├── cuda_utils.cuh
│       ├── ffi.rs
│       ├── fill.cu
│       ├── indexing.cu
│       ├── lib.rs
│       ├── moe/
│       │   ├── gguf.cuh
│       │   ├── moe_gguf.cu
│       │   ├── moe_utils.cuh
│       │   ├── moe_wmma.cu
│       │   └── moe_wmma_gguf.cu
│       ├── ptx.rs
│       ├── quantized.cu
│       ├── reduce.cu
│       ├── sort.cu
│       ├── ternary.cu
│       └── unary.cu
├── candle-metal-kernels/
│   ├── Cargo.toml
│   ├── README.md
│   ├── examples/
│   │   └── metal_benchmarks.rs
│   └── src/
│       ├── err.rs
│       ├── kernel.rs
│       ├── kernels/
│       │   ├── affine.rs
│       │   ├── binary.rs
│       │   ├── cast.rs
│       │   ├── convolution.rs
│       │   ├── fill.rs
│       │   ├── indexing.rs
│       │   ├── macros.rs
│       │   ├── mlx_gemm.rs
│       │   ├── mod.rs
│       │   ├── quantized.rs
│       │   ├── random.rs
│       │   ├── reduce.rs
│       │   ├── sdpa.rs
│       │   ├── sort.rs
│       │   ├── ternary.rs
│       │   └── unary.rs
│       ├── lib.rs
│       ├── metal/
│       │   ├── buffer.rs
│       │   ├── command_buffer.rs
│       │   ├── commands.rs
│       │   ├── compute_pipeline.rs
│       │   ├── device.rs
│       │   ├── encoder.rs
│       │   ├── library.rs
│       │   └── mod.rs
│       ├── metal_src/
│       │   ├── affine.metal
│       │   ├── binary.metal
│       │   ├── cast.metal
│       │   ├── conv.metal
│       │   ├── fill.metal
│       │   ├── indexing.metal
│       │   ├── mlx_gemm.metal
│       │   ├── mlx_sort.metal
│       │   ├── quantized.metal
│       │   ├── random.metal
│       │   ├── reduce.metal
│       │   ├── scaled_dot_product_attention.metal
│       │   ├── sort.metal
│       │   ├── ternary.metal
│       │   ├── unary.metal
│       │   └── utils.metal
│       ├── source.rs
│       ├── tests.rs
│       └── utils.rs
├── candle-nn/
│   ├── Cargo.toml
│   ├── README.md
│   ├── benches/
│   │   ├── bench_main.rs
│   │   └── benchmarks/
│   │       ├── conv.rs
│   │       ├── mod.rs
│   │       ├── norm.rs
│   │       └── softmax.rs
│   ├── examples/
│   │   ├── basic_optimizer.rs
│   │   └── cpu_benchmarks.rs
│   ├── src/
│   │   ├── activation.rs
│   │   ├── batch_norm.rs
│   │   ├── conv.rs
│   │   ├── cpu_flash_attention.rs
│   │   ├── embedding.rs
│   │   ├── encoding.rs
│   │   ├── func.rs
│   │   ├── group_norm.rs
│   │   ├── init.rs
│   │   ├── kv_cache.rs
│   │   ├── layer_norm.rs
│   │   ├── lib.rs
│   │   ├── linear.rs
│   │   ├── loss.rs
│   │   ├── moe.rs
│   │   ├── ops.rs
│   │   ├── optim.rs
│   │   ├── rnn.rs
│   │   ├── rotary_emb.rs
│   │   ├── sampling.rs
│   │   ├── sequential.rs
│   │   ├── var_builder.rs
│   │   └── var_map.rs
│   └── tests/
│       ├── batch_norm.rs
│       ├── cpu_flash_attn.rs
│       ├── group_norm.rs
│       ├── kv_cache.rs
│       ├── layer_norm.rs
│       ├── loss.rs
│       ├── one_hot.rs
│       ├── ops.rs
│       ├── optim.rs
│       ├── rnn.rs
│       └── sdpa.rs
├── candle-onnx/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   ├── src/
│   │   ├── eval.rs
│   │   ├── lib.rs
│   │   └── onnx.proto3
│   └── tests/
│       └── ops.rs
├── candle-pyo3/
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── README.md
│   ├── _additional_typing/
│   │   ├── README.md
│   │   └── __init__.py
│   ├── build.rs
│   ├── e5.py
│   ├── py_src/
│   │   └── candle/
│   │       ├── __init__.py
│   │       ├── __init__.pyi
│   │       ├── functional/
│   │       │   ├── __init__.py
│   │       │   └── __init__.pyi
│   │       ├── models/
│   │       │   ├── bert.py
│   │       │   └── llama.py
│   │       ├── nn/
│   │       │   ├── __init__.py
│   │       │   ├── __init__.pyi
│   │       │   ├── container.py
│   │       │   ├── linear.py
│   │       │   ├── module.py
│   │       │   ├── normalization.py
│   │       │   └── sparse.py
│   │       ├── onnx/
│   │       │   ├── __init__.py
│   │       │   └── __init__.pyi
│   │       ├── testing/
│   │       │   └── __init__.py
│   │       ├── typing/
│   │       │   └── __init__.py
│   │       └── utils/
│   │           ├── __init__.py
│   │           └── __init__.pyi
│   ├── pyproject.toml
│   ├── quant-llama.py
│   ├── src/
│   │   ├── lib.rs
│   │   ├── onnx.rs
│   │   ├── shape.rs
│   │   └── utils.rs
│   ├── stub.py
│   ├── test.py
│   ├── test_pytorch.py
│   └── tests/
│       ├── __init__.py
│       ├── bindings/
│       │   ├── test_linear.py
│       │   ├── test_module.py
│       │   └── test_testing.py
│       └── native/
│           ├── test_shape.py
│           ├── test_tensor.py
│           └── test_utils.py
├── candle-transformers/
│   ├── Cargo.toml
│   ├── README.md
│   ├── src/
│   │   ├── fused_moe.rs
│   │   ├── generation/
│   │   │   └── mod.rs
│   │   ├── lib.rs
│   │   ├── models/
│   │   │   ├── based.rs
│   │   │   ├── beit.rs
│   │   │   ├── bert.rs
│   │   │   ├── bigcode.rs
│   │   │   ├── blip.rs
│   │   │   ├── blip_text.rs
│   │   │   ├── chatglm.rs
│   │   │   ├── chinese_clip/
│   │   │   │   ├── mod.rs
│   │   │   │   ├── text_model.rs
│   │   │   │   └── vision_model.rs
│   │   │   ├── clip/
│   │   │   │   ├── mod.rs
│   │   │   │   ├── text_model.rs
│   │   │   │   └── vision_model.rs
│   │   │   ├── codegeex4_9b.rs
│   │   │   ├── colpali.rs
│   │   │   ├── convmixer.rs
│   │   │   ├── convnext.rs
│   │   │   ├── csm.rs
│   │   │   ├── dac.rs
│   │   │   ├── debertav2.rs
│   │   │   ├── deepseek2.rs
│   │   │   ├── depth_anything_v2.rs
│   │   │   ├── dinov2.rs
│   │   │   ├── dinov2reg4.rs
│   │   │   ├── distilbert.rs
│   │   │   ├── efficientnet.rs
│   │   │   ├── efficientvit.rs
│   │   │   ├── encodec.rs
│   │   │   ├── eva2.rs
│   │   │   ├── falcon.rs
│   │   │   ├── fastvit.rs
│   │   │   ├── flux/
│   │   │   │   ├── autoencoder.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── model.rs
│   │   │   │   ├── quantized_model.rs
│   │   │   │   └── sampling.rs
│   │   │   ├── gemma.rs
│   │   │   ├── gemma2.rs
│   │   │   ├── gemma3.rs
│   │   │   ├── glm4.rs
│   │   │   ├── glm4_new.rs
│   │   │   ├── granite.rs
│   │   │   ├── granitemoehybrid.rs
│   │   │   ├── helium.rs
│   │   │   ├── hiera.rs
│   │   │   ├── jina_bert.rs
│   │   │   ├── llama.rs
│   │   │   ├── llama2_c.rs
│   │   │   ├── llama2_c_weights.rs
│   │   │   ├── llava/
│   │   │   │   ├── config.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── utils.rs
│   │   │   ├── mamba.rs
│   │   │   ├── mamba2.rs
│   │   │   ├── marian.rs
│   │   │   ├── metavoice.rs
│   │   │   ├── mimi/
│   │   │   │   ├── conv.rs
│   │   │   │   ├── encodec.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── quantization.rs
│   │   │   │   ├── seanet.rs
│   │   │   │   └── transformer.rs
│   │   │   ├── mistral.rs
│   │   │   ├── mixformer.rs
│   │   │   ├── mixtral.rs
│   │   │   ├── mmdit/
│   │   │   │   ├── blocks.rs
│   │   │   │   ├── embedding.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── model.rs
│   │   │   │   └── projections.rs
│   │   │   ├── mobileclip.rs
│   │   │   ├── mobilenetv4.rs
│   │   │   ├── mobileone.rs
│   │   │   ├── mod.rs
│   │   │   ├── modernbert.rs
│   │   │   ├── moondream.rs
│   │   │   ├── mpt.rs
│   │   │   ├── nomic_bert.rs
│   │   │   ├── nvembed_v2/
│   │   │   │   ├── embedding.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── model.rs
│   │   │   ├── olmo.rs
│   │   │   ├── olmo2.rs
│   │   │   ├── openclip/
│   │   │   │   ├── mod.rs
│   │   │   │   └── text_model.rs
│   │   │   ├── paddleocr_vl/
│   │   │   │   ├── config.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── text.rs
│   │   │   │   └── vision.rs
│   │   │   ├── paligemma.rs
│   │   │   ├── parler_tts.rs
│   │   │   ├── persimmon.rs
│   │   │   ├── phi.rs
│   │   │   ├── phi3.rs
│   │   │   ├── pixtral/
│   │   │   │   ├── llava.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── vision_model.rs
│   │   │   ├── quantized_blip.rs
│   │   │   ├── quantized_blip_text.rs
│   │   │   ├── quantized_gemma3.rs
│   │   │   ├── quantized_glm4.rs
│   │   │   ├── quantized_lfm2.rs
│   │   │   ├── quantized_llama.rs
│   │   │   ├── quantized_llama2_c.rs
│   │   │   ├── quantized_metavoice.rs
│   │   │   ├── quantized_mistral.rs
│   │   │   ├── quantized_mixformer.rs
│   │   │   ├── quantized_moondream.rs
│   │   │   ├── quantized_mpt.rs
│   │   │   ├── quantized_phi.rs
│   │   │   ├── quantized_phi3.rs
│   │   │   ├── quantized_qwen2.rs
│   │   │   ├── quantized_qwen3.rs
│   │   │   ├── quantized_qwen3_moe.rs
│   │   │   ├── quantized_recurrent_gemma.rs
│   │   │   ├── quantized_rwkv_v5.rs
│   │   │   ├── quantized_rwkv_v6.rs
│   │   │   ├── quantized_stable_lm.rs
│   │   │   ├── quantized_t5.rs
│   │   │   ├── qwen2.rs
│   │   │   ├── qwen2_moe.rs
│   │   │   ├── qwen3.rs
│   │   │   ├── qwen3_moe.rs
│   │   │   ├── qwen3_vl/
│   │   │   │   ├── config.rs
│   │   │   │   ├── conv3d_temporal_2.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── text.rs
│   │   │   │   └── vision.rs
│   │   │   ├── recurrent_gemma.rs
│   │   │   ├── repvgg.rs
│   │   │   ├── resnet.rs
│   │   │   ├── rwkv_v5.rs
│   │   │   ├── rwkv_v6.rs
│   │   │   ├── rwkv_v7.rs
│   │   │   ├── segformer.rs
│   │   │   ├── segment_anything/
│   │   │   │   ├── image_encoder.rs
│   │   │   │   ├── mask_decoder.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── prompt_encoder.rs
│   │   │   │   ├── sam.rs
│   │   │   │   ├── tiny_vit.rs
│   │   │   │   └── transformer.rs
│   │   │   ├── siglip.rs
│   │   │   ├── smol/
│   │   │   │   ├── README.md
│   │   │   │   ├── mod.rs
│   │   │   │   ├── quantized_smollm3.rs
│   │   │   │   └── smollm3.rs
│   │   │   ├── snac.rs
│   │   │   ├── stable_diffusion/
│   │   │   │   ├── attention.rs
│   │   │   │   ├── clip.rs
│   │   │   │   ├── ddim.rs
│   │   │   │   ├── ddpm.rs
│   │   │   │   ├── embeddings.rs
│   │   │   │   ├── euler_ancestral_discrete.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── resnet.rs
│   │   │   │   ├── schedulers.rs
│   │   │   │   ├── unet_2d.rs
│   │   │   │   ├── unet_2d_blocks.rs
│   │   │   │   ├── uni_pc.rs
│   │   │   │   ├── utils.rs
│   │   │   │   └── vae.rs
│   │   │   ├── stable_lm.rs
│   │   │   ├── starcoder2.rs
│   │   │   ├── stella_en_v5.rs
│   │   │   ├── t5.rs
│   │   │   ├── trocr.rs
│   │   │   ├── vgg.rs
│   │   │   ├── vit.rs
│   │   │   ├── voxtral/
│   │   │   │   ├── audio.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── model.rs
│   │   │   │   └── voxtral_llama.rs
│   │   │   ├── whisper/
│   │   │   │   ├── audio.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── model.rs
│   │   │   │   └── quantized_model.rs
│   │   │   ├── with_tracing.rs
│   │   │   ├── wuerstchen/
│   │   │   │   ├── attention_processor.rs
│   │   │   │   ├── common.rs
│   │   │   │   ├── ddpm.rs
│   │   │   │   ├── diffnext.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── paella_vq.rs
│   │   │   │   └── prior.rs
│   │   │   ├── xlm_roberta.rs
│   │   │   ├── yi.rs
│   │   │   └── z_image/
│   │   │       ├── mod.rs
│   │   │       ├── preprocess.rs
│   │   │       ├── sampling.rs
│   │   │       ├── scheduler.rs
│   │   │       ├── text_encoder.rs
│   │   │       ├── transformer.rs
│   │   │       └── vae.rs
│   │   ├── object_detection.rs
│   │   ├── pipelines/
│   │   │   ├── mod.rs
│   │   │   └── text_generation.rs
│   │   ├── quantized_nn.rs
│   │   ├── quantized_var_builder.rs
│   │   └── utils.rs
│   └── tests/
│       ├── generation_tests.rs
│       └── nms_tests.rs
├── candle-ug/
│   ├── Cargo.toml
│   └── src/
│       └── lib.rs
├── candle-wasm-examples/
│   ├── bert/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── bertWorker.js
│   │   ├── build-lib.sh
│   │   ├── lib-example.html
│   │   ├── src/
│   │   │   ├── bin/
│   │   │   │   └── m.rs
│   │   │   └── lib.rs
│   │   └── utils.js
│   ├── blip/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── blipWorker.js
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   └── src/
│   │       ├── bin/
│   │       │   └── m.rs
│   │       ├── lib.rs
│   │       └── token_output_stream.rs
│   ├── chat-template/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   └── src/
│   │       └── lib.rs
│   ├── llama2-c/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   ├── lib-example.html
│   │   ├── llama2cWorker.js
│   │   └── src/
│   │       ├── app.rs
│   │       ├── bin/
│   │       │   ├── app.rs
│   │       │   ├── m.rs
│   │       │   └── worker.rs
│   │       ├── lib.rs
│   │       ├── model.rs
│   │       └── worker.rs
│   ├── moondream/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── code.js
│   │   ├── index.html
│   │   ├── moondreamWorker.js
│   │   └── src/
│   │       ├── bin/
│   │       │   └── m.rs
│   │       └── lib.rs
│   ├── phi/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   ├── phiWorker.js
│   │   └── src/
│   │       ├── bin/
│   │       │   └── m.rs
│   │       └── lib.rs
│   ├── quant-qwen3/
│   │   ├── .cargo/
│   │   │   └── config.toml
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── index.html
│   │   ├── serve.py
│   │   └── src/
│   │       ├── lib.rs
│   │       ├── m.rs
│   │       └── profiler.rs
│   ├── segment-anything/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── lib-example.html
│   │   ├── samWorker.js
│   │   └── src/
│   │       ├── bin/
│   │       │   └── m.rs
│   │       └── lib.rs
│   ├── t5/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── T5ModelConditionalGeneration.js
│   │   ├── T5ModelEncoderWorker.js
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   ├── src/
│   │   │   ├── bin/
│   │   │   │   ├── m-quantized.rs
│   │   │   │   └── m.rs
│   │   │   └── lib.rs
│   │   └── utils.js
│   ├── whisper/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build-lib.sh
│   │   ├── index.html
│   │   ├── lib-example.html
│   │   ├── main.js
│   │   ├── src/
│   │   │   ├── app.rs
│   │   │   ├── audio.rs
│   │   │   ├── bin/
│   │   │   │   ├── app.rs
│   │   │   │   ├── m.rs
│   │   │   │   └── worker.rs
│   │   │   ├── languages.rs
│   │   │   ├── lib.rs
│   │   │   └── worker.rs
│   │   └── whisperWorker.js
│   └── yolo/
│       ├── Cargo.toml
│       ├── README.md
│       ├── build-lib.sh
│       ├── index.html
│       ├── lib-example.html
│       ├── src/
│       │   ├── app.rs
│       │   ├── bin/
│       │   │   ├── app.rs
│       │   │   ├── m.rs
│       │   │   └── worker.rs
│       │   ├── coco_classes.rs
│       │   ├── lib.rs
│       │   ├── model.rs
│       │   └── worker.rs
│       └── yoloWorker.js
├── candle-wasm-tests/
│   ├── Cargo.toml
│   ├── README.md
│   ├── src/
│   │   └── lib.rs
│   ├── tests/
│   │   └── quantized_tests.rs
│   └── webdriver.json
└── tensor-tools/
    ├── Cargo.toml
    └── src/
        └── main.rs

Download .txt

Showing preview only (849K chars total). Download the full file or copy to clipboard to get everything.

SYMBOL INDEX (10803 symbols across 637 files)

FILE: candle-book/src/lib.rs
  function book_hub_1 (line 13) | async fn book_hub_1() {
  function book_hub_2 (line 30) | fn book_hub_2() {
  function book_training_1 (line 111) | fn book_training_1() -> Result<()>{

FILE: candle-book/src/simplified.rs
  constant VOTE_DIM (line 24) | const VOTE_DIM: usize = 2;
  constant RESULTS (line 25) | const RESULTS: usize = 1;
  constant EPOCHS (line 26) | const EPOCHS: usize = 10;
  constant LAYER1_OUT_SIZE (line 27) | const LAYER1_OUT_SIZE: usize = 4;
  constant LAYER2_OUT_SIZE (line 28) | const LAYER2_OUT_SIZE: usize = 2;
  constant LEARNING_RATE (line 29) | const LEARNING_RATE: f64 = 0.05;
  type Dataset (line 32) | pub struct Dataset {
  type MultiLevelPerceptron (line 39) | struct MultiLevelPerceptron {
    method new (line 46) | fn new(vs: VarBuilder) -> Result<Self> {
    method forward (line 53) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  function simplified (line 68) | async fn simplified() -> anyhow::Result<()> {
  function train (line 154) | fn train(m: Dataset, dev: &Device) -> anyhow::Result<MultiLevelPerceptro...

FILE: candle-core/benches/benchmarks/affine.rs
  function run (line 7) | fn run(a: &Tensor) {
  function run_affine_benchmark (line 11) | fn run_affine_benchmark(c: &mut Criterion, device: &Device, dtype: DType...
  function criterion_benchmark (line 35) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/binary.rs
  function run (line 7) | fn run(lhs: &Tensor, rhs: &Tensor) -> Tensor {
  function run_unary_benchmark (line 11) | fn run_unary_benchmark(c: &mut Criterion, device: &Device, dtype: DType,...
  function criterion_benchmark (line 47) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/broadcast.rs
  function run (line 7) | fn run(w: &Tensor, bias: &Tensor) {
  function run_benchmark (line 11) | fn run_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name:...
  function criterion_benchmark (line 38) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/conv_transpose2d.rs
  function run (line 7) | fn run(
  function run_benchmark (line 19) | fn run_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name:...
  function criterion_benchmark (line 51) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/copy.rs
  function run_copy_mask_benchmark (line 7) | fn run_copy_mask_benchmark<D: WithDType>(c: &mut Criterion, device: &Dev...
  function criterion_benchmark (line 32) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/matmul.rs
  constant MATMUL_SHAPES (line 8) | const MATMUL_SHAPES: &[(&str, &[usize], &[usize])] = &[
  function run (line 23) | fn run(a: &Tensor, b: &Tensor) {
  function calculate_flops (line 27) | fn calculate_flops(shape_a: &[usize], shape_b: &[usize]) -> usize {
  function run_bench (line 39) | fn run_bench(c: &mut Criterion, device: &Device, name: &str, shape_a: &[...
  function criterion_benchmark (line 61) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/mod.rs
  type BenchDevice (line 15) | pub(crate) trait BenchDevice {
    method sync (line 16) | fn sync(&self) -> Result<()>;
    method bench_name (line 18) | fn bench_name<S: Into<String>>(&self, name: S) -> String;
    method sync (line 22) | fn sync(&self) -> Result<()> {
    method bench_name (line 43) | fn bench_name<S: Into<String>>(&self, name: S) -> String {
  type BenchDeviceHandler (line 61) | struct BenchDeviceHandler {
    method new (line 66) | pub fn new() -> Result<Self> {

FILE: candle-core/benches/benchmarks/qmatmul.rs
  function run (line 10) | fn run(matmul: &QMatMul, x: &Tensor) {
  function run_bench (line 14) | fn run_bench(c: &mut Criterion, device: &Device, dtype: GgmlDType) {
  function criterion_benchmark (line 51) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/random.rs
  function rand_uniform (line 7) | fn rand_uniform(a: &Tensor) {
  function rand_normal (line 11) | fn rand_normal(a: &Tensor) {
  function run_random_bench (line 15) | fn run_random_bench(c: &mut Criterion, device: &Device) {
  function criterion_benchmark (line 57) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/reduce.rs
  function run_sum (line 8) | fn run_sum(a: &Tensor) {
  function run_arg_min (line 11) | fn run_arg_min(a: &Tensor) {
  function criterion_benchmark (line 15) | fn criterion_benchmark(c: &mut Criterion) {
  function run_reduce (line 37) | fn run_reduce<T: candle_core::FloatDType>(
  function run_arg_reduce (line 98) | fn run_arg_reduce<T: candle_core::FloatDType>(

FILE: candle-core/benches/benchmarks/unary.rs
  function run_sqrt (line 7) | fn run_sqrt(a: &Tensor) {
  function run_unary_benchmark (line 11) | fn run_unary_benchmark(c: &mut Criterion, device: &Device, dtype: DType,...
  function run_cast (line 40) | fn run_cast(a: &Tensor, dtype: DType) {
  function run_cast_benchmark (line 44) | fn run_cast_benchmark(
  function criterion_benchmark (line 79) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/benches/benchmarks/where_cond.rs
  function run (line 7) | fn run(a: &Tensor, b: &Tensor, c: &Tensor) {
  function create_cond_arr (line 11) | const fn create_cond_arr<const N: usize>() -> [u8; N] {
  constant B (line 21) | const B: usize = 1;
  constant M (line 22) | const M: usize = 1024;
  constant K (line 23) | const K: usize = 1024;
  constant SIZE (line 24) | const SIZE: usize = B * M * K;
  function run_where_cond_benchmark (line 28) | fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: D...
  function criterion_benchmark (line 56) | fn criterion_benchmark(c: &mut Criterion) {

FILE: candle-core/examples/basics.rs
  function main (line 10) | fn main() -> Result<()> {

FILE: candle-core/examples/cuda_basics.rs
  function main (line 10) | fn main() -> Result<()> {

FILE: candle-core/examples/cuda_sum_benchmark.rs
  function cos_sin (line 12) | fn cos_sin(n: usize, device: &Device) -> Result<Tensor> {
  function main (line 22) | fn main() -> Result<()> {

FILE: candle-core/examples/metal_basics.rs
  function main (line 10) | fn main() -> Result<()> {

FILE: candle-core/src/accelerate.rs
  function sgemm_ffi (line 11) | pub fn sgemm_ffi(
  function dgemm_ffi (line 27) | pub fn dgemm_ffi(
  function vvexpf (line 43) | pub fn vvexpf(dst: *mut c_float, src: *const c_float, len: *const c_int);
  function vvexp (line 44) | pub fn vvexp(dst: *mut c_double, src: *const c_double, len: *const c_int);
  function vvsqrtf (line 45) | pub fn vvsqrtf(dst: *mut c_float, src: *const c_float, len: *const c_int);
  function vvsqrt (line 46) | pub fn vvsqrt(dst: *mut c_double, src: *const c_double, len: *const c_int);
  function vvsinf (line 47) | pub fn vvsinf(dst: *mut c_float, src: *const c_float, len: *const c_int);
  function vvsin (line 48) | pub fn vvsin(dst: *mut c_double, src: *const c_double, len: *const c_int);
  function vvcosf (line 49) | pub fn vvcosf(dst: *mut c_float, src: *const c_float, len: *const c_int);
  function vvcos (line 50) | pub fn vvcos(dst: *mut c_double, src: *const c_double, len: *const c_int);
  function vvlogf (line 51) | pub fn vvlogf(dst: *mut c_float, src: *const c_float, len: *const c_int);
  function vvlog (line 52) | pub fn vvlog(dst: *mut c_double, src: *const c_double, len: *const c_int);
  function vvtanhf (line 53) | pub fn vvtanhf(dst: *mut c_float, src: *const c_float, len: *const c_int);
  function vvtanh (line 54) | pub fn vvtanh(dst: *mut c_double, src: *const c_double, len: *const c_int);
  function vDSP_vaddD (line 56) | pub fn vDSP_vaddD(
  function vDSP_vadd (line 65) | pub fn vDSP_vadd(
  function vDSP_vsubD (line 74) | pub fn vDSP_vsubD(
  function vDSP_vsub (line 83) | pub fn vDSP_vsub(
  function vDSP_vmulD (line 92) | pub fn vDSP_vmulD(
  function vDSP_vmul (line 101) | pub fn vDSP_vmul(
  function vDSP_vdivD (line 110) | pub fn vDSP_vdivD(
  function vDSP_vdiv (line 119) | pub fn vDSP_vdiv(
  function vDSP_vminD (line 128) | pub fn vDSP_vminD(
  function vDSP_vmin (line 137) | pub fn vDSP_vmin(
  function vDSP_vmaxD (line 146) | pub fn vDSP_vmaxD(
  function vDSP_vmax (line 155) | pub fn vDSP_vmax(
  function sgemm (line 169) | pub unsafe fn sgemm(
  function dgemm (line 203) | pub unsafe fn dgemm(
  function vs_exp (line 236) | pub fn vs_exp(a: &[f32], y: &mut [f32]) {
  function vd_exp (line 246) | pub fn vd_exp(a: &[f64], y: &mut [f64]) {
  function vs_sqrt (line 256) | pub fn vs_sqrt(a: &[f32], y: &mut [f32]) {
  function vd_sqrt (line 266) | pub fn vd_sqrt(a: &[f64], y: &mut [f64]) {
  function vs_sin (line 276) | pub fn vs_sin(a: &[f32], y: &mut [f32]) {
  function vd_sin (line 286) | pub fn vd_sin(a: &[f64], y: &mut [f64]) {
  function vs_cos (line 295) | pub fn vs_cos(a: &[f32], y: &mut [f32]) {
  function vd_cos (line 305) | pub fn vd_cos(a: &[f64], y: &mut [f64]) {
  function vs_tanh (line 314) | pub fn vs_tanh(a: &[f32], y: &mut [f32]) {
  function vd_tanh (line 324) | pub fn vd_tanh(a: &[f64], y: &mut [f64]) {
  function vs_ln (line 334) | pub fn vs_ln(a: &[f32], y: &mut [f32]) {
  function vd_ln (line 344) | pub fn vd_ln(a: &[f64], y: &mut [f64]) {
  function vs_sqr (line 354) | pub fn vs_sqr(a: &[f32], y: &mut [f32]) {
  function vd_sqr (line 364) | pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
  function vs_tanh_inplace (line 374) | pub fn vs_tanh_inplace(y: &mut [f32]) {
  function vd_tanh_inplace (line 379) | pub fn vd_tanh_inplace(y: &mut [f64]) {
  function vs_exp_inplace (line 384) | pub fn vs_exp_inplace(y: &mut [f32]) {
  function vd_exp_inplace (line 389) | pub fn vd_exp_inplace(y: &mut [f64]) {
  function vs_gelu (line 394) | pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
  function vd_gelu (line 405) | pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
  function vs_silu (line 416) | pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
  function vd_silu (line 427) | pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {

FILE: candle-core/src/backend.rs
  type BackendStorage (line 6) | pub trait BackendStorage: Sized {
    method try_clone (line 9) | fn try_clone(&self, _: &Layout) -> Result<Self>;
    method dtype (line 11) | fn dtype(&self) -> DType;
    method device (line 13) | fn device(&self) -> &Self::Device;
    method to_cpu_storage (line 16) | fn to_cpu_storage(&self) -> Result<CpuStorage>;
    method affine (line 18) | fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self>;
    method powf (line 20) | fn powf(&self, _: &Layout, _: f64) -> Result<Self>;
    method elu (line 22) | fn elu(&self, _: &Layout, _: f64) -> Result<Self>;
    method reduce_op (line 24) | fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Se...
    method cmp (line 26) | fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Se...
    method to_dtype (line 28) | fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self>;
    method unary_impl (line 30) | fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self>;
    method binary_impl (line 32) | fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) ...
    method where_cond (line 34) | fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &L...
    method conv1d (line 36) | fn conv1d(
    method conv_transpose1d (line 44) | fn conv_transpose1d(
    method conv2d (line 52) | fn conv2d(
    method conv_transpose2d (line 60) | fn conv_transpose2d(
    method avg_pool2d (line 68) | fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize))...
    method max_pool2d (line 69) | fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize))...
    method upsample_nearest1d (line 70) | fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self>;
    method upsample_nearest2d (line 71) | fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result...
    method upsample_bilinear2d (line 72) | fn upsample_bilinear2d(
    method gather (line 82) | fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result...
    method scatter_set (line 84) | fn scatter_set(
    method scatter_add_set (line 94) | fn scatter_add_set(
    method index_select (line 104) | fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> ...
    method index_add (line 105) | fn index_add(
    method matmul (line 115) | fn matmul(
    method copy_strided_src (line 123) | fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Resu...
    method copy2d (line 127) | fn copy2d(
    method const_set (line 138) | fn const_set(&mut self, _: crate::scalar::Scalar, _: &Layout) -> Resul...
  type BackendDevice (line 141) | pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
    method new (line 145) | fn new(_: usize) -> Result<Self>;
    method location (line 147) | fn location(&self) -> crate::DeviceLocation;
    method same_device (line 149) | fn same_device(&self, _: &Self) -> bool;
    method zeros_impl (line 151) | fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::St...
    method alloc_uninit (line 157) | unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result...
    method storage_from_slice (line 159) | fn storage_from_slice<T: crate::WithDType>(&self, _: &[T]) -> Result<S...
    method storage_from_cpu_storage (line 161) | fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Sto...
    method storage_from_cpu_storage_owned (line 163) | fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self...
    method rand_uniform (line 165) | fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<...
    method rand_normal (line 167) | fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<S...
    method set_seed (line 169) | fn set_seed(&self, _: u64) -> Result<()>;
    method get_current_seed (line 170) | fn get_current_seed(&self) -> Result<u64>;
    method synchronize (line 173) | fn synchronize(&self) -> Result<()>;

FILE: candle-core/src/backprop.rs
  function broadcast_back (line 8) | fn broadcast_back(arg: &Tensor, node: &Tensor, reduced_dims: &[usize]) -...
  method sorted_nodes (line 35) | pub fn sorted_nodes(&self) -> Vec<&Tensor> {
  method backward (line 165) | pub fn backward(&self) -> Result<GradStore> {
  type GradStore (line 733) | pub struct GradStore(HashMap<TensorId, Tensor>);
    method new (line 737) | fn new() -> Self {
    method get_id (line 742) | pub fn get_id(&self, id: TensorId) -> Option<&Tensor> {
    method get (line 747) | pub fn get(&self, tensor: &Tensor) -> Option<&Tensor> {
    method remove (line 752) | pub fn remove(&mut self, tensor: &Tensor) -> Option<Tensor> {
    method insert (line 757) | pub fn insert(&mut self, tensor: &Tensor, grad: Tensor) -> Option<Tens...
    method insert_id (line 762) | pub fn insert_id(&mut self, id: TensorId, grad: Tensor) -> Option<Tens...
    method or_insert (line 768) | fn or_insert(&mut self, tensor: &Tensor) -> Result<&mut Tensor> {
    method get_ids (line 781) | pub fn get_ids(&self) -> impl Iterator<Item = &TensorId> {

FILE: candle-core/src/conv.rs
  type ParamsConv1D (line 6) | pub struct ParamsConv1D {
    method l_out (line 21) | pub(crate) fn l_out(&self) -> usize {
    method out_dims (line 25) | pub(crate) fn out_dims(&self) -> Vec<usize> {
  type ParamsConvTranspose1D (line 32) | pub struct ParamsConvTranspose1D {
    method l_out (line 45) | pub(crate) fn l_out(&self) -> usize {
    method out_dims (line 52) | pub(crate) fn out_dims(&self) -> Vec<usize> {
  type CudnnFwdAlgo (line 59) | pub enum CudnnFwdAlgo {
  type ParamsConv2D (line 72) | pub struct ParamsConv2D {
    method out_h (line 87) | pub(crate) fn out_h(&self) -> usize {
    method out_w (line 91) | pub(crate) fn out_w(&self) -> usize {
    method out_dims (line 95) | pub(crate) fn out_dims(&self) -> Vec<usize> {
  type ParamsConvTranspose2D (line 101) | pub struct ParamsConvTranspose2D {
    method out_h (line 116) | pub(crate) fn out_h(&self) -> usize {
    method out_w (line 121) | pub(crate) fn out_w(&self) -> usize {
    method out_dims (line 126) | pub(crate) fn out_dims(&self) -> Vec<usize> {
  method conv1d_single_group (line 132) | fn conv1d_single_group(&self, kernel: &Self, params: &ParamsConv1D) -> R...
  method conv1d (line 148) | pub fn conv1d(
  method conv1d_with_algo (line 160) | pub fn conv1d_with_algo(
  method conv_transpose1d_single_group (line 207) | fn conv_transpose1d_single_group(
  method conv_transpose1d (line 231) | pub fn conv_transpose1d(
  method conv2d_single_group (line 273) | fn conv2d_single_group(&self, kernel: &Self, params: &ParamsConv2D) -> R...
  method conv2d (line 289) | pub fn conv2d(
  method conv2d_with_algo (line 300) | pub fn conv2d_with_algo(
  method conv_transpose2d (line 344) | pub fn conv_transpose2d(

FILE: candle-core/src/convert.rs
  type Error (line 7) | type Error = Error;
  function try_from (line 8) | fn try_from(tensor: &Tensor) -> Result<Self, Self::Error> {
  type Error (line 14) | type Error = Error;
  function try_from (line 15) | fn try_from(tensor: &Tensor) -> Result<Self, Self::Error> {
  type Error (line 21) | type Error = Error;
  function try_from (line 22) | fn try_from(tensor: &Tensor) -> Result<Self, Self::Error> {
  type Error (line 28) | type Error = Error;
  function try_from (line 29) | fn try_from(tensor: Tensor) -> Result<Self, Self::Error> {
  type Error (line 35) | type Error = Error;
  function try_from (line 36) | fn try_from(tensor: Tensor) -> Result<Self, Self::Error> {
  type Error (line 42) | type Error = Error;
  function try_from (line 43) | fn try_from(tensor: Tensor) -> Result<Self, Self::Error> {
  type Error (line 49) | type Error = Error;
  method try_from (line 50) | fn try_from(v: &[T]) -> Result<Self, Self::Error> {
  type Error (line 56) | type Error = Error;
  method try_from (line 57) | fn try_from(v: Vec<T>) -> Result<Self, Self::Error> {
  method write_bytes (line 102) | pub fn write_bytes<W: std::io::Write>(&self, f: &mut W) -> crate::Result...

FILE: candle-core/src/cpu/avx.rs
  type CurrentCpu (line 9) | pub struct CurrentCpu {}
    type Unit (line 16) | type Unit = __m256;
    type Array (line 17) | type Array = [__m256; ARR];
    constant STEP (line 19) | const STEP: usize = STEP;
    constant EPR (line 20) | const EPR: usize = EPR;
    method n (line 22) | fn n() -> usize {
    method zero (line 26) | unsafe fn zero() -> Self::Unit {
    method zero_array (line 30) | unsafe fn zero_array() -> Self::Array {
    method from_f32 (line 34) | unsafe fn from_f32(v: f32) -> Self::Unit {
    method load (line 38) | unsafe fn load(mem_addr: *const f32) -> Self::Unit {
    method vec_add (line 42) | unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
    method vec_fma (line 46) | unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self...
    method vec_store (line 50) | unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit) {
    method vec_reduce (line 54) | unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
  constant STEP (line 11) | const STEP: usize = 32;
  constant EPR (line 12) | const EPR: usize = 8;
  constant ARR (line 13) | const ARR: usize = STEP / EPR;
  type CurrentCpuF16 (line 71) | pub struct CurrentCpuF16 {}
    type Unit (line 73) | type Unit = __m256;
    type Array (line 74) | type Array = [__m256; ARR];
    constant STEP (line 76) | const STEP: usize = STEP;
    constant EPR (line 77) | const EPR: usize = EPR;
    method n (line 79) | fn n() -> usize {
    method zero (line 83) | unsafe fn zero() -> Self::Unit {
    method zero_array (line 87) | unsafe fn zero_array() -> Self::Array {
    method from_f32 (line 91) | unsafe fn from_f32(v: f32) -> Self::Unit {
    method load (line 96) | unsafe fn load(mem_addr: *const f16) -> Self::Unit {
    method load (line 101) | unsafe fn load(mem_addr: *const f16) -> Self::Unit {
    method vec_add (line 109) | unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
    method vec_fma (line 113) | unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self...
    method vec_store (line 118) | unsafe fn vec_store(mem_addr: *mut f16, a: Self::Unit) {
    method vec_store (line 123) | unsafe fn vec_store(mem_addr: *mut f16, a: Self::Unit) {
    method vec_reduce (line 131) | unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
  type CurrentCpuBF16 (line 150) | pub struct CurrentCpuBF16 {}
    type Unit (line 152) | type Unit = __m256;
    type Array (line 153) | type Array = [__m256; ARR];
    constant STEP (line 155) | const STEP: usize = STEP;
    constant EPR (line 156) | const EPR: usize = EPR;
    method n (line 158) | fn n() -> usize {
    method zero (line 162) | unsafe fn zero() -> Self::Unit {
    method zero_array (line 166) | unsafe fn zero_array() -> Self::Array {
    method from_f32 (line 170) | unsafe fn from_f32(v: f32) -> Self::Unit {
    method load (line 175) | unsafe fn load(mem_addr: *const bf16) -> Self::Unit {
    method load (line 180) | unsafe fn load(mem_addr: *const bf16) -> Self::Unit {
    method vec_add (line 188) | unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
    method vec_fma (line 192) | unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self...
    method vec_store (line 197) | unsafe fn vec_store(mem_addr: *mut bf16, a: Self::Unit) {
    method vec_store (line 202) | unsafe fn vec_store(mem_addr: *mut bf16, a: Self::Unit) {
    method vec_reduce (line 210) | unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {

FILE: candle-core/src/cpu/erf.rs
  function polynomial (line 19) | pub fn polynomial(z: f64, coeff: &[f64]) -> f64 {
  function erf_f64 (line 35) | pub fn erf_f64(x: f64) -> f64 {
  function erf_f32 (line 39) | pub fn erf_f32(x: f32) -> f32 {
  function erf_inv (line 45) | pub fn erf_inv(x: f64) -> f64 {
  function erfc_f64 (line 61) | pub fn erfc_f64(x: f64) -> f64 {
  function erfc_f32 (line 65) | pub fn erfc_f32(x: f32) -> f32 {
  function erfc_inv (line 71) | pub fn erfc_inv(x: f64) -> f64 {
  constant ERF_INV_IMPL_AN (line 89) | const ERF_INV_IMPL_AN: &[f64] = &[
  constant ERF_INV_IMPL_AD (line 102) | const ERF_INV_IMPL_AD: &[f64] = &[
  constant ERF_INV_IMPL_BN (line 117) | const ERF_INV_IMPL_BN: &[f64] = &[
  constant ERF_INV_IMPL_BD (line 131) | const ERF_INV_IMPL_BD: &[f64] = &[
  constant ERF_INV_IMPL_CN (line 145) | const ERF_INV_IMPL_CN: &[f64] = &[
  constant ERF_INV_IMPL_CD (line 161) | const ERF_INV_IMPL_CD: &[f64] = &[
  constant ERF_INV_IMPL_DN (line 174) | const ERF_INV_IMPL_DN: &[f64] = &[
  constant ERF_INV_IMPL_DD (line 188) | const ERF_INV_IMPL_DD: &[f64] = &[
  constant ERF_INV_IMPL_EN (line 200) | const ERF_INV_IMPL_EN: &[f64] = &[
  constant ERF_INV_IMPL_ED (line 214) | const ERF_INV_IMPL_ED: &[f64] = &[
  constant ERF_INV_IMPL_FN (line 226) | const ERF_INV_IMPL_FN: &[f64] = &[
  constant ERF_INV_IMPL_FD (line 239) | const ERF_INV_IMPL_FD: &[f64] = &[
  constant ERF_INV_IMPL_GN (line 251) | const ERF_INV_IMPL_GN: &[f64] = &[
  constant ERF_INV_IMPL_GD (line 264) | const ERF_INV_IMPL_GD: &[f64] = &[
  function erf_inv_impl (line 277) | fn erf_inv_impl(p: f64, q: f64, s: f64) -> f64 {

FILE: candle-core/src/cpu/kernels.rs
  type VecOps (line 1) | pub trait VecOps: num_traits::NumAssign + Copy {
    method min (line 2) | fn min(self, rhs: Self) -> Self;
    method max (line 3) | fn max(self, rhs: Self) -> Self;
    method vec_dot (line 12) | unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, ...
    method vec_reduce_sum (line 26) | unsafe fn vec_reduce_sum(xs: *const Self, res: *mut Self, len: usize) {
    method vec_reduce_max (line 40) | unsafe fn vec_reduce_max(xs: *const Self, res: *mut Self, len: usize) {
    method vec_reduce_min (line 54) | unsafe fn vec_reduce_min(xs: *const Self, res: *mut Self, len: usize) {
    method min (line 64) | fn min(self, other: Self) -> Self {
    method max (line 69) | fn max(self, other: Self) -> Self {
    method vec_dot (line 74) | unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, ...
    method vec_reduce_sum (line 79) | unsafe fn vec_reduce_sum(xs: *const Self, res: *mut Self, len: usize) {
    method min (line 86) | fn min(self, other: Self) -> Self {
    method max (line 91) | fn max(self, other: Self) -> Self {
    method vec_dot (line 96) | unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, ...
    method min (line 105) | fn min(self, other: Self) -> Self {
    method max (line 110) | fn max(self, other: Self) -> Self {
    method min (line 116) | fn min(self, other: Self) -> Self {
    method max (line 121) | fn max(self, other: Self) -> Self {
    method vec_dot (line 126) | unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, ...
    method min (line 134) | fn min(self, other: Self) -> Self {
    method max (line 139) | fn max(self, other: Self) -> Self {
    method min (line 145) | fn min(self, other: Self) -> Self {
    method max (line 150) | fn max(self, other: Self) -> Self {
    method min (line 156) | fn min(self, other: Self) -> Self {
    method max (line 161) | fn max(self, other: Self) -> Self {
    method min (line 167) | fn min(self, other: Self) -> Self {
    method max (line 172) | fn max(self, other: Self) -> Self {
    method min (line 178) | fn min(self, other: Self) -> Self {
    method max (line 183) | fn max(self, other: Self) -> Self {
    method min (line 190) | fn min(self, other: Self) -> Self {
    method max (line 195) | fn max(self, other: Self) -> Self {
  function par_for_each (line 201) | pub fn par_for_each(n_threads: usize, func: impl Fn(usize) + Send + Sync) {
  function par_range (line 215) | pub fn par_range(lo: usize, up: usize, n_threads: usize, func: impl Fn(u...

FILE: candle-core/src/cpu/mod.rs
  type Cpu (line 7) | trait Cpu<const ARR: usize> {
    constant STEP (line 10) | const STEP: usize;
    constant EPR (line 11) | const EPR: usize;
    method n (line 13) | fn n() -> usize;
    method zero (line 14) | unsafe fn zero() -> Self::Unit;
    method zero_array (line 15) | unsafe fn zero_array() -> Self::Array;
    method load (line 16) | unsafe fn load(mem_addr: *const f32) -> Self::Unit;
    method vec_add (line 17) | unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit;
    method vec_fma (line 18) | unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self...
    method vec_reduce (line 19) | unsafe fn vec_reduce(x: Self::Array, y: *mut f32);
    method from_f32 (line 20) | unsafe fn from_f32(v: f32) -> Self::Unit;
    method vec_store (line 21) | unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit);
  type CpuF16 (line 25) | trait CpuF16<const ARR: usize> {
    constant STEP (line 28) | const STEP: usize;
    constant EPR (line 29) | const EPR: usize;
    method n (line 31) | fn n() -> usize;
    method zero (line 32) | unsafe fn zero() -> Self::Unit;
    method zero_array (line 33) | unsafe fn zero_array() -> Self::Array;
    method load (line 34) | unsafe fn load(mem_addr: *const f16) -> Self::Unit;
    method vec_add (line 35) | unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit;
    method vec_fma (line 36) | unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self...
    method vec_reduce (line 37) | unsafe fn vec_reduce(x: Self::Array, y: *mut f32);
    method from_f32 (line 38) | unsafe fn from_f32(v: f32) -> Self::Unit;
    method vec_store (line 39) | unsafe fn vec_store(mem_addr: *mut f16, a: Self::Unit);
  type CpuBF16 (line 43) | trait CpuBF16<const ARR: usize> {
    constant STEP (line 46) | const STEP: usize;
    constant EPR (line 47) | const EPR: usize;
    method n (line 49) | fn n() -> usize;
    method zero (line 50) | unsafe fn zero() -> Self::Unit;
    method zero_array (line 51) | unsafe fn zero_array() -> Self::Array;
    method load (line 52) | unsafe fn load(mem_addr: *const bf16) -> Self::Unit;
    method vec_add (line 53) | unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit;
    method vec_fma (line 54) | unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self...
    method vec_reduce (line 55) | unsafe fn vec_reduce(x: Self::Array, y: *mut f32);
    method from_f32 (line 56) | unsafe fn from_f32(v: f32) -> Self::Unit;
    method vec_store (line 57) | unsafe fn vec_store(mem_addr: *mut bf16, a: Self::Unit);
  function vec_dot_f32 (line 89) | pub(crate) unsafe fn vec_dot_f32(a_row: *const f32, b_row: *const f32, c...
  function vec_dot_f32 (line 119) | pub(crate) unsafe fn vec_dot_f32(a_row: *const f32, b_row: *const f32, c...
  function vec_sum (line 132) | pub(crate) unsafe fn vec_sum(row: *const f32, b: *mut f32, k: usize) {
  function vec_sum (line 159) | pub(crate) unsafe fn vec_sum(row: *const f32, b: *mut f32, k: usize) {
  function vec_dot_f16 (line 168) | pub(crate) unsafe fn vec_dot_f16(a_row: *const f16, b_row: *const f16, c...
  function vec_dot_bf16 (line 196) | pub(crate) unsafe fn vec_dot_bf16(a_row: *const bf16, b_row: *const bf16...
  function vec_dot_f16 (line 224) | pub(crate) unsafe fn vec_dot_f16(a_row: *const f16, b_row: *const f16, c...
  function vec_dot_bf16 (line 235) | pub(crate) unsafe fn vec_dot_bf16(a_row: *const bf16, b_row: *const bf16...

FILE: candle-core/src/cpu/neon.rs
  type CurrentCpu (line 8) | pub struct CurrentCpu {}
    method reduce_one (line 16) | unsafe fn reduce_one(x: float32x4_t) -> f32 {
    method reduce_one (line 21) | unsafe fn reduce_one(x: float32x4_t) -> f32 {
    type Unit (line 27) | type Unit = float32x4_t;
    type Array (line 28) | type Array = [float32x4_t; ARR];
    constant STEP (line 30) | const STEP: usize = STEP;
    constant EPR (line 31) | const EPR: usize = EPR;
    method n (line 33) | fn n() -> usize {
    method zero (line 37) | unsafe fn zero() -> Self::Unit {
    method from_f32 (line 41) | unsafe fn from_f32(x: f32) -> Self::Unit {
    method zero_array (line 45) | unsafe fn zero_array() -> Self::Array {
    method load (line 49) | unsafe fn load(mem_addr: *const f32) -> Self::Unit {
    method vec_add (line 53) | unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
    method vec_fma (line 57) | unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self...
    method vec_store (line 61) | unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit) {
    method vec_reduce (line 65) | unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
  constant STEP (line 10) | const STEP: usize = 16;
  constant EPR (line 11) | const EPR: usize = 4;
  constant ARR (line 12) | const ARR: usize = STEP / EPR;

FILE: candle-core/src/cpu/simd128.rs
  type CurrentCpu (line 4) | pub struct CurrentCpu {}
    type Unit (line 11) | type Unit = v128;
    type Array (line 12) | type Array = [v128; ARR];
    constant STEP (line 14) | const STEP: usize = STEP;
    constant EPR (line 15) | const EPR: usize = EPR;
    method n (line 17) | fn n() -> usize {
    method zero (line 21) | unsafe fn zero() -> Self::Unit {
    method zero_array (line 25) | unsafe fn zero_array() -> Self::Array {
    method from_f32 (line 29) | unsafe fn from_f32(v: f32) -> Self::Unit {
    method load (line 33) | unsafe fn load(mem_addr: *const f32) -> Self::Unit {
    method vec_add (line 37) | unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
    method vec_fma (line 41) | unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self...
    method vec_store (line 45) | unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit) {
    method vec_reduce (line 49) | unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
  constant STEP (line 6) | const STEP: usize = 16;
  constant EPR (line 7) | const EPR: usize = 4;
  constant ARR (line 8) | const ARR: usize = STEP / EPR;

FILE: candle-core/src/cpu_backend/conv2d.rs
  type Conv2D (line 12) | pub(super) struct Conv2D<'a>(pub(super) &'a crate::conv::ParamsConv2D);
  type Conv2dImpl (line 15) | enum Conv2dImpl {
  constant DEFAULT_CONV2D_IMPL (line 21) | const DEFAULT_CONV2D_IMPL: Conv2dImpl = Conv2dImpl::TiledIm2Col;
  constant OP (line 24) | const OP: &'static str = "conv2d";
  method f (line 25) | fn f<T: WithDType + num_traits::Num + Copy + 'static>(
  function conv2d_1x1 (line 56) | fn conv2d_1x1<T: WithDType + num_traits::Num + Copy + 'static>(
  function conv2d_tiled (line 126) | fn conv2d_tiled<T: WithDType + num_traits::Num + Copy + 'static>(
  function conv2d_direct (line 280) | fn conv2d_direct<T: WithDType + num_traits::Num + Copy + 'static>(
  function alloc_uninit_vec (line 382) | fn alloc_uninit_vec<T: WithDType + Copy + 'static>(size: usize) -> Vec<T> {
  function conv2d_im2col_gemm (line 391) | fn conv2d_im2col_gemm<T: WithDType + num_traits::Num + Copy + 'static>(

FILE: candle-core/src/cpu_backend/mod.rs
  constant USE_IM2COL_CONV1D (line 16) | const USE_IM2COL_CONV1D: bool = true;
  constant USE_COL2IM_CONV1D_TR (line 17) | const USE_COL2IM_CONV1D_TR: bool = true;
  type CpuStorage (line 22) | pub enum CpuStorage {
    method as_slice (line 1659) | pub fn as_slice<D: WithDType>(&self) -> Result<&[D]> {
    method concat (line 1663) | pub fn concat(storages: &[CpuStorage]) -> Result<CpuStorage> {
  type CpuStorageRef (line 41) | pub enum CpuStorageRef<'a> {
  type CpuDevice (line 60) | pub struct CpuDevice;
  type Cmp (line 62) | struct Cmp(CmpOp);
  constant OP (line 64) | const OP: &'static str = "cmp";
  method f (line 66) | fn f<T: WithDType>(
  type WCond (line 85) | struct WCond<'a, T: IntDType>(&'a [T], &'a Layout);
  constant OP (line 88) | const OP: &'static str = "where";
  method f (line 90) | fn f<T: WithDType>(&self, t: &[T], t_l: &Layout, f: &[T], f_l: &Layout) ...
  type ReduceIndex (line 122) | struct ReduceIndex {
    method fold_impl (line 131) | fn fold_impl<T, U, F, G>(&self, src: &[T], src_l: &Layout, f: F, g: G)...
  method f (line 208) | fn f<T: WithDType, W: Fn(Vec<T>) -> CpuStorage>(
  type ReduceSum (line 231) | struct ReduceSum<'a> {
  function fold_impl (line 239) | fn fold_impl<T>(&self, src: &[T], src_l: &Layout, start_elt: T) -> Resul...
  method f (line 304) | fn f<T: WithDType>(&self, src: &[T], src_l: &Layout) -> Result<Vec<T>> {
  type Affine (line 309) | struct Affine(f64, f64);
  method f (line 312) | fn f<T: WithDType>(&self, vs: &[T], layout: &Layout) -> Result<Vec<T>> {
  type AvgPool2D (line 319) | struct AvgPool2D((usize, usize), (usize, usize));
  method f (line 322) | fn f<T: WithDType>(&self, src: &[T], layout: &Layout) -> Result<Vec<T>> {
  type MaxPool2D (line 360) | struct MaxPool2D((usize, usize), (usize, usize));
  method f (line 363) | fn f<T: WithDType>(&self, src: &[T], layout: &Layout) -> Result<Vec<T>> {
  type UpsampleNearest1D (line 402) | struct UpsampleNearest1D(usize);
  method f (line 405) | fn f<T: WithDType>(&self, src: &[T], layout: &Layout) -> Result<Vec<T>> {
  type UpsampleNearest2D (line 432) | struct UpsampleNearest2D(usize, usize);
  method f (line 435) | fn f<T: WithDType>(&self, src: &[T], layout: &Layout) -> Result<Vec<T>> {
  type UpsampleBilinear2D (line 469) | struct UpsampleBilinear2D {
  method f (line 478) | fn f<T: WithDType>(&self, src: &[T], layout: &Layout) -> Result<Vec<T>> {
  type Gather (line 588) | struct Gather<'a, I: IntDType> {
  method f (line 595) | fn f<T: WithDType>(&self, src: &[T], src_l: &Layout) -> Result<Vec<T>> {
  type IndexSelect (line 646) | struct IndexSelect<'a, T: IntDType> {
  method f (line 653) | fn f<T: WithDType>(&self, src: &[T], layout: &Layout) -> Result<Vec<T>> {
  type ElemUpdate (line 704) | trait ElemUpdate {
    method f (line 705) | fn f<T: WithDType>(dst: &mut T, src: T);
    method f (line 712) | fn f<T: WithDType>(dst: &mut T, src: T) {
    method f (line 718) | fn f<T: WithDType>(dst: &mut T, src: T) {
  type Set (line 708) | struct Set;
  type Add (line 709) | struct Add;
  type Scatter (line 723) | struct Scatter<'a, I: IntDType, M: ElemUpdate> {
  function new (line 731) | fn new(ids: &'a [I], ids_l: &'a Layout, dim: usize) -> Self {
  constant OP (line 742) | const OP: &'static str = "scatter";
  method f (line 743) | fn f<T: WithDType>(
  type IndexAdd (line 804) | struct IndexAdd<'a, I: IntDType> {
  constant OP (line 810) | const OP: &'static str = "index-add";
  method f (line 813) | fn f<T: WithDType>(&self, v1: &[T], l1: &Layout, src: &[T], src_l: &Layo...
  function copy2d_ (line 876) | fn copy2d_<T: Copy>(
  function copy_strided_src_ (line 895) | fn copy_strided_src_<T: Copy>(src: &[T], dst: &mut [T], dst_offset: usiz...
  type Conv1D (line 933) | struct Conv1D<'a>(&'a crate::conv::ParamsConv1D);
  constant OP (line 936) | const OP: &'static str = "conv1d";
  method f (line 937) | fn f<T: WithDType>(&self, inp: &[T], inp_l: &Layout, k: &[T], k_l: &Layo...
  type Im2Col1D (line 995) | struct Im2Col1D {
    method l_out (line 1003) | fn l_out(&self, l: usize) -> usize {
  method f (line 1009) | fn f<T: WithDType>(&self, vs: &[T], layout: &Layout) -> Result<Vec<T>> {
  type Im2Col (line 1054) | struct Im2Col {
    method hw_out (line 1063) | fn hw_out(&self, h: usize, w: usize) -> (usize, usize) {
  method f (line 1071) | fn f<T: WithDType>(&self, vs: &[T], layout: &Layout) -> Result<Vec<T>> {
  type Col2Im1D (line 1129) | struct Col2Im1D {
  method f (line 1134) | fn f<T: WithDType>(&self, col: &[T], l: &Layout) -> Result<Vec<T>> {
  type ConvTranspose1D (line 1157) | struct ConvTranspose1D<'a>(&'a crate::conv::ParamsConvTranspose1D);
  constant OP (line 1160) | const OP: &'static str = "conv_transpose1d";
  method f (line 1161) | fn f<T: WithDType>(&self, inp: &[T], inp_l: &Layout, k: &[T], k_l: &Layo...
  type ConvTranspose2D (line 1226) | struct ConvTranspose2D<'a>(&'a crate::conv::ParamsConvTranspose2D);
  constant OP (line 1229) | const OP: &'static str = "conv_transpose2d";
  method f (line 1230) | fn f<T: WithDType>(&self, inp: &[T], inp_l: &Layout, k: &[T], k_l: &Layo...
  type MatMul (line 1316) | struct MatMul((usize, usize, usize, usize));
    method striding_error (line 1319) | fn striding_error(&self, lhs_l: &Layout, rhs_l: &Layout, msg: &'static...
    method ab_skip (line 1329) | fn ab_skip(&self, lhs_l: &Layout, rhs_l: &Layout) -> Result<(usize, us...
  constant OP (line 1355) | const OP: &'static str = "mat_mul";
  method f (line 1358) | fn f<T: 'static + WithDType + num_traits::Num + Copy>(
  method f (line 1441) | fn f<T: 'static + WithDType + num_traits::Num + Copy>(
  method f (line 1532) | fn f<T: 'static + WithDType + num_traits::Num + Copy>(
  function elu (line 1650) | fn elu<T: num_traits::Float>(v: T, alpha: T) -> T {
  type Device (line 1826) | type Device = CpuDevice;
  method dtype (line 1828) | fn dtype(&self) -> DType {
  method to_dtype (line 1847) | fn to_dtype(&self, layout: &Layout, dtype: DType) -> Result<Self> {
  method reduce_op (line 2269) | fn reduce_op(&self, op: ReduceOp, layout: &Layout, reduce_dims: &[usize]...
  method cmp (line 2325) | fn cmp(&self, op: CmpOp, rhs: &Self, lhs_l: &Layout, rhs_l: &Layout) -> ...
  method affine (line 2329) | fn affine(&self, layout: &Layout, mul: f64, add: f64) -> Result<Self> {
  method avg_pool2d (line 2333) | fn avg_pool2d(
  method max_pool2d (line 2342) | fn max_pool2d(
  method upsample_nearest1d (line 2351) | fn upsample_nearest1d(&self, layout: &Layout, sz: usize) -> Result<Self> {
  method upsample_nearest2d (line 2355) | fn upsample_nearest2d(&self, layout: &Layout, h: usize, w: usize) -> Res...
  method upsample_bilinear2d (line 2359) | fn upsample_bilinear2d(
  method powf (line 2378) | fn powf(&self, layout: &Layout, e: f64) -> Result<Self> {
  method elu (line 2414) | fn elu(&self, layout: &Layout, alpha: f64) -> Result<Self> {
  method unary_impl (line 2449) | fn unary_impl<B: UnaryOpT>(&self, layout: &Layout) -> Result<Self> {
  method binary_impl (line 2518) | fn binary_impl<B: BinaryOpT>(
  method copy2d (line 2605) | fn copy2d(
  method copy_strided_src (line 2666) | fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &La...
  method where_cond (line 2703) | fn where_cond(
  method conv1d (line 2721) | fn conv1d(
  method conv_transpose1d (line 2767) | fn conv_transpose1d(
  method conv2d (line 2822) | fn conv2d(
  method conv_transpose2d (line 2832) | fn conv_transpose2d(
  method index_select (line 2842) | fn index_select(&self, ids: &Self, l: &Layout, ids_l: &Layout, dim: usiz...
  method gather (line 2851) | fn gather(&self, l: &Layout, ids: &Self, ids_l: &Layout, dim: usize) -> ...
  method scatter_set (line 2860) | fn scatter_set(
  method scatter_add_set (line 2877) | fn scatter_add_set(
  method index_add (line 2896) | fn index_add(
  method matmul (line 2945) | fn matmul(
  method device (line 2955) | fn device(&self) -> &Self::Device {
  method try_clone (line 2959) | fn try_clone(&self, _: &Layout) -> Result<Self> {
  method to_cpu_storage (line 2963) | fn to_cpu_storage(&self) -> Result<CpuStorage> {
  method const_set (line 2967) | fn const_set(&mut self, s: crate::scalar::Scalar, l: &Layout) -> Result<...
  type Storage (line 3027) | type Storage = CpuStorage;
  method location (line 3029) | fn location(&self) -> crate::DeviceLocation {
  method same_device (line 3033) | fn same_device(&self, _: &Self) -> bool {
  method storage_from_slice (line 3037) | fn storage_from_slice<T: crate::WithDType>(&self, s: &[T]) -> Result<Sel...
  method storage_from_cpu_storage (line 3041) | fn storage_from_cpu_storage(&self, s: &CpuStorage) -> Result<Self::Stora...
  method storage_from_cpu_storage_owned (line 3045) | fn storage_from_cpu_storage_owned(&self, s: CpuStorage) -> Result<Self::...
  method new (line 3049) | fn new(_: usize) -> Result<Self> {
  method set_seed (line 3053) | fn set_seed(&self, _seed: u64) -> Result<()> {
  method get_current_seed (line 3057) | fn get_current_seed(&self) -> Result<u64> {
  method rand_uniform (line 3061) | fn rand_uniform(&self, shape: &Shape, dtype: DType, min: f64, max: f64) ...
  method rand_normal (line 3124) | fn rand_normal(&self, shape: &Shape, dtype: DType, mean: f64, std: f64) ...
  method alloc_uninit (line 3187) | unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Cpu...
  method zeros_impl (line 3251) | fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<CpuStorage> {
  method synchronize (line 3271) | fn synchronize(&self) -> Result<()> {

FILE: candle-core/src/cpu_backend/utils.rs
  type C (line 5) | type C = super::CpuStorage;
  type Map1 (line 6) | pub trait Map1 {
    method f (line 7) | fn f<T: WithDType>(&self, vs: &[T], layout: &Layout) -> Result<Vec<T>>;
    method map (line 9) | fn map(&self, vs: &C, layout: &Layout) -> Result<C> {
  type Map1Any (line 30) | pub trait Map1Any {
    method f (line 31) | fn f<T: WithDType, W: Fn(Vec<T>) -> C>(&self, vs: &[T], layout: &Layou...
    method map (line 33) | fn map(&self, vs: &C, layout: &Layout) -> Result<C> {
  type Map2 (line 54) | pub trait Map2 {
    constant OP (line 55) | const OP: &'static str;
    method f (line 56) | fn f<T: WithDType>(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout...
    method map (line 58) | fn map(&self, v1: &C, l1: &Layout, v2: &C, l2: &Layout) -> Result<C> {
  type Map2InPlace (line 80) | pub trait Map2InPlace {
    constant OP (line 81) | const OP: &'static str;
    method f (line 82) | fn f<T: WithDType>(&self, v1: &mut [T], l1: &Layout, v2: &[T], l2: &La...
    method map (line 84) | fn map(&self, v1: &mut C, l1: &Layout, v2: &C, l2: &Layout) -> Result<...
  type Map2U8 (line 107) | pub trait Map2U8 {
    constant OP (line 108) | const OP: &'static str;
    method f (line 109) | fn f<T: WithDType>(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout...
    method map (line 111) | fn map(&self, v1: &C, l1: &Layout, v2: &C, l2: &Layout) -> Result<C> {
  function binary_map (line 133) | pub fn binary_map<T: Copy, U: Copy, F: FnMut(T, T) -> U>(
  function binary_map_vec (line 213) | pub fn binary_map_vec<T: Copy, F: FnMut(T, T) -> T, FV: FnMut(&[T], &[T]...
  function unary_map (line 324) | pub fn unary_map<T: Copy, U: Copy, F: FnMut(T) -> U>(
  function unary_map_vec (line 359) | pub fn unary_map_vec<T: Copy, U: Copy, F: FnMut(T) -> U, FV: FnMut(&[T],...

FILE: candle-core/src/cuda_backend/cudnn.rs
  function from (line 16) | fn from(err: cudarc::cudnn::CudnnError) -> Self {
  function from (line 22) | fn from(err: cudarc::driver::DriverError) -> Self {
  function launch_conv2d (line 27) | pub(crate) fn launch_conv2d<
  function launch_conv1d (line 126) | pub(crate) fn launch_conv1d<

FILE: candle-core/src/cuda_backend/device.rs
  type DeviceId (line 15) | pub struct DeviceId(usize);
    method new (line 18) | fn new() -> Self {
  type CudaRng (line 26) | struct CudaRng(cudarc::curand::CudaRng);
  type ModuleStore (line 29) | pub struct ModuleStore {
  type CudaDevice (line 34) | pub struct CudaDevice {
    method fmt (line 46) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    method alloc (line 53) | pub unsafe fn alloc<T: cudarc::driver::DeviceRepr>(
    method alloc_zeros (line 60) | pub fn alloc_zeros<T: cudarc::driver::DeviceRepr + cudarc::driver::Val...
    method memcpy_htod (line 67) | pub fn memcpy_htod<
    method clone_dtoh (line 79) | pub fn clone_dtoh<T: cudarc::driver::DeviceRepr, Src: cudarc::driver::...
    method memcpy_dtod (line 86) | pub fn memcpy_dtod<
    method memcpy_dtoh (line 98) | pub fn memcpy_dtoh<
    method clone_htod (line 110) | pub fn clone_htod<T: cudarc::driver::DeviceRepr, Src: cudarc::driver::...
    method cuda_stream (line 154) | pub fn cuda_stream(&self) -> Arc<cudarc::driver::CudaStream> {
    method disable_event_tracking (line 168) | pub unsafe fn disable_event_tracking(&self) {
    method is_event_tracking (line 172) | pub fn is_event_tracking(&self) -> bool {
    method compile (line 177) | pub fn compile(
    method id (line 198) | pub fn id(&self) -> DeviceId {
    method get_or_load_custom_func (line 202) | pub fn get_or_load_custom_func(
    method get_or_load_func (line 227) | pub fn get_or_load_func(&self, fn_name: &str, mdl: &kernels::Module) -...
    method cublas_handle (line 247) | pub fn cublas_handle(&self) -> Arc<cudarc::cublas::CudaBlas> {
    method new_with_stream (line 253) | pub fn new_with_stream(ordinal: usize) -> Result<Self> {
  type CudaFunc (line 118) | pub struct CudaFunc {
    type Target (line 124) | type Target = CudaFunction;
    method deref (line 126) | fn deref(&self) -> &Self::Target {
    method into_cuda_function (line 132) | pub fn into_cuda_function(self) -> CudaFunction {
    method builder (line 148) | pub fn builder(&self) -> cudarc::driver::LaunchArgs<'_> {
  type Storage (line 275) | type Storage = CudaStorage;
  method new (line 277) | fn new(ordinal: usize) -> Result<Self> {
  method set_seed (line 297) | fn set_seed(&self, seed: u64) -> Result<()> {
  method get_current_seed (line 306) | fn get_current_seed(&self) -> Result<u64> {
  method location (line 310) | fn location(&self) -> crate::DeviceLocation {
  method same_device (line 316) | fn same_device(&self, rhs: &Self) -> bool {
  method zeros_impl (line 320) | fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
  method rand_uniform (line 375) | fn rand_uniform(&self, shape: &Shape, dtype: DType, lo: f64, up: f64) ->...
  method rand_normal (line 423) | fn rand_normal(&self, shape: &Shape, dtype: DType, mean: f64, std: f64) ...
  method alloc_uninit (line 474) | unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Sel...
  method storage_from_slice (line 529) | fn storage_from_slice<T: crate::WithDType>(&self, s: &[T]) -> Result<Sel...
  method storage_from_cpu_storage (line 588) | fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<CudaS...
  method storage_from_cpu_storage_owned (line 647) | fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result<...
  method synchronize (line 706) | fn synchronize(&self) -> Result<()> {

FILE: candle-core/src/cuda_backend/error.rs
  type CudaError (line 5) | pub enum CudaError {
  function from (line 49) | fn from(val: CudaError) -> Self {
  type WrapErr (line 54) | pub trait WrapErr<O> {
    method w (line 55) | fn w(self) -> std::result::Result<O, crate::Error>;
  function w (line 59) | fn w(self) -> std::result::Result<O, crate::Error> {

FILE: candle-core/src/cuda_backend/mod.rs
  type SlicePtrOrNull (line 23) | pub enum SlicePtrOrNull<T> {
  function builder_arg (line 29) | pub fn builder_arg<'a, 'b: 'a>(&'b self, builder: &mut cudarc::driver::L...
  function builder_arg (line 38) | pub fn builder_arg<'a, 'b: 'a>(&'b self, builder: &mut cudarc::driver::L...
  function params_from_layout (line 56) | pub fn params_from_layout(dev: &CudaDevice, l: &Layout) -> Result<Self> {
  type CudaStorageSlice (line 67) | pub enum CudaStorageSlice {
  type Clone (line 85) | struct Clone;
  method f (line 87) | fn f<T: DeviceRepr>(
  function kernel_name (line 97) | pub fn kernel_name<T: WithDType>(root: &str) -> String {
  type Affine (line 102) | struct Affine(f64, f64);
  method f (line 104) | fn f<T: DeviceRepr + WithDType>(
  type Elu (line 133) | struct Elu(f64);
  method f (line 135) | fn f<T: DeviceRepr + WithDType>(
  type Im2Col1D (line 164) | struct Im2Col1D {
    method l_out (line 173) | fn l_out(&self, l: usize) -> usize {
  method f (line 179) | fn f<T: DeviceRepr + WithDType>(
  type Im2Col (line 212) | struct Im2Col {
    method hw_out (line 222) | fn hw_out(&self, h: usize, w: usize) -> (usize, usize) {
  method f (line 230) | fn f<T: DeviceRepr + WithDType>(
  type Powf (line 264) | struct Powf(f64);
  method f (line 266) | fn f<T: DeviceRepr + WithDType>(
  type FastReduce (line 294) | struct FastReduce<'a>(&'a [usize], ReduceOp);
  method f (line 296) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits, W: Fn(CudaSlice<T>) ->...
  method f (line 377) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  function slice_ptr (line 404) | fn slice_ptr<T: DeviceRepr>(v: &CudaSlice<T>, lo: usize) -> (u64, cudarc...
  type IndexSelect (line 410) | struct IndexSelect<'a>(&'a CudaStorage, &'a Layout, usize);
  method f (line 412) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type Gather (line 463) | struct Gather<'a>(&'a CudaStorage, &'a Layout, usize);
  method f (line 465) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type IndexAdd (line 516) | struct IndexAdd<'a>(&'a CudaStorage, &'a Layout, usize);
  method f (line 518) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type Scatter (line 570) | struct Scatter<'a>(&'a CudaStorage, &'a Layout, usize);
  method f (line 572) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type ScatterAdd (line 622) | struct ScatterAdd<'a>(&'a CudaStorage, &'a Layout, usize);
  method f (line 624) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type Conv1D (line 674) | struct Conv1D<'a>(&'a crate::conv::ParamsConv1D);
  method f (line 676) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type Conv2D (line 718) | struct Conv2D<'a>(&'a crate::conv::ParamsConv2D);
  method f (line 720) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type Col2Im1D (line 761) | struct Col2Im1D {
  method f (line 766) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type ConvTranspose1D (line 789) | struct ConvTranspose1D<'a>(&'a crate::conv::ParamsConvTranspose1D);
  method f (line 791) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type ConvTranspose2D (line 837) | struct ConvTranspose2D<'a>(&'a crate::conv::ParamsConvTranspose2D);
  method f (line 839) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type PoolOp (line 886) | enum PoolOp {
  type Pool2D (line 891) | struct Pool2D {
  method f (line 900) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type UpsampleNearest2D (line 943) | struct UpsampleNearest2D(usize, usize);
  method f (line 945) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type UpsampleBilinear2D (line 983) | struct UpsampleBilinear2D {
  method f (line 992) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type WhereCond (line 1035) | struct WhereCond<'a>(&'a CudaStorage, &'a Layout);
  method f (line 1037) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  method f (line 1092) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  type Cmp (line 1127) | struct Cmp(CmpOp);
  method f (line 1129) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
  function slice_src_and_dst (line 1172) | fn slice_src_and_dst<'a, T>(
  type CudaStorage (line 1192) | pub struct CudaStorage {
    method wrap_cuda_slice (line 1249) | pub fn wrap_cuda_slice<T: CudaDType>(slice: CudaSlice<T>, device: Cuda...
    method as_cuda_slice (line 1253) | pub fn as_cuda_slice<T: CudaDType>(&self) -> Result<&CudaSlice<T>> {
    method as_cuda_slice_mut (line 1257) | pub fn as_cuda_slice_mut<T: CudaDType>(&mut self) -> Result<&mut CudaS...
    method transfer_to_device (line 1261) | pub fn transfer_to_device(&self, dst: &CudaDevice) -> Result<Self> {
  type CudaDType (line 1197) | pub trait CudaDType: Sized {
    method as_cuda_slice (line 1198) | fn as_cuda_slice(s: &CudaStorage) -> Result<&CudaSlice<Self>>;
    method as_cuda_slice_mut (line 1199) | fn as_cuda_slice_mut(s: &mut CudaStorage) -> Result<&mut CudaSlice<Sel...
    method wrap_cuda_slice (line 1200) | fn wrap_cuda_slice(s: CudaSlice<Self>, dev: CudaDevice) -> CudaStorage;
  function gemm_config (line 1343) | fn gemm_config<T>(
  type Device (line 1436) | type Device = CudaDevice;
  method try_clone (line 1438) | fn try_clone(&self, layout: &Layout) -> Result<Self> {
  method dtype (line 1444) | fn dtype(&self) -> DType {
  method device (line 1463) | fn device(&self) -> &CudaDevice {
  method const_set (line 1467) | fn const_set(&mut self, s: crate::scalar::Scalar, layout: &Layout) -> Re...
  method to_dtype (line 1507) | fn to_dtype(&self, layout: &Layout, dtype: DType) -> Result<Self> {
  method affine (line 1648) | fn affine(&self, layout: &Layout, mul: f64, add: f64) -> Result<Self> {
  method powf (line 1654) | fn powf(&self, layout: &Layout, e: f64) -> Result<Self> {
  method elu (line 1660) | fn elu(&self, layout: &Layout, alpha: f64) -> Result<Self> {
  method reduce_op (line 1666) | fn reduce_op(&self, op: ReduceOp, layout: &Layout, sum_dims: &[usize]) -...
  method cmp (line 1672) | fn cmp(&self, op: CmpOp, rhs: &Self, lhs_l: &Layout, rhs_l: &Layout) -> ...
  method unary_impl (line 1678) | fn unary_impl<U: UnaryOpT>(&self, layout: &Layout) -> Result<Self> {
  method binary_impl (line 1684) | fn binary_impl<B: BinaryOpT>(
  method to_cpu_storage (line 1695) | fn to_cpu_storage(&self) -> Result<CpuStorage> {
  method where_cond (line 1748) | fn where_cond(
  method conv1d (line 1762) | fn conv1d(
  method conv1d (line 1813) | fn conv1d(
  method conv_transpose1d (line 1883) | fn conv_transpose1d(
  method conv2d (line 1943) | fn conv2d(
  method conv2d (line 1998) | fn conv2d(
  method conv_transpose2d (line 2068) | fn conv_transpose2d(
  method avg_pool2d (line 2081) | fn avg_pool2d(&self, l: &Layout, k: (usize, usize), stride: (usize, usiz...
  method max_pool2d (line 2094) | fn max_pool2d(&self, l: &Layout, k: (usize, usize), stride: (usize, usiz...
  method upsample_nearest1d (line 2107) | fn upsample_nearest1d(&self, _: &Layout, _out_sz: usize) -> Result<Self> {
  method upsample_nearest2d (line 2111) | fn upsample_nearest2d(&self, l: &Layout, out_w: usize, out_h: usize) -> ...
  method upsample_bilinear2d (line 2117) | fn upsample_bilinear2d(
  method index_select (line 2138) | fn index_select(&self, ids: &Self, l: &Layout, ids_l: &Layout, dim: usiz...
  method gather (line 2143) | fn gather(&self, l: &Layout, ids: &Self, ids_l: &Layout, dim: usize) -> ...
  method scatter_set (line 2148) | fn scatter_set(
  method scatter_add_set (line 2160) | fn scatter_add_set(
  method index_add (line 2172) | fn index_add(
  method matmul (line 2188) | fn matmul(
  method copy2d (line 2244) | fn copy2d(
  method copy_strided_src (line 2292) | fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &La...
  function gemm_reduced_precision_f32 (line 2482) | pub fn gemm_reduced_precision_f32() -> bool {
  function set_gemm_reduced_precision_f32 (line 2488) | pub fn set_gemm_reduced_precision_f32(b: bool) {
  function gemm_reduced_precision_f16 (line 2494) | pub fn gemm_reduced_precision_f16() -> bool {
  function set_gemm_reduced_precision_f16 (line 2500) | pub fn set_gemm_reduced_precision_f16(b: bool) {
  function gemm_reduced_precision_bf16 (line 2506) | pub fn gemm_reduced_precision_bf16() -> bool {
  function set_gemm_reduced_precision_bf16 (line 2512) | pub fn set_gemm_reduced_precision_bf16(b: bool) {
  function gemm_strided_batched_f32 (line 2516) | unsafe fn gemm_strided_batched_f32(
  function gemm_strided_batched_f16 (line 2566) | unsafe fn gemm_strided_batched_f16(
  function gemm_strided_batched_bf16 (line 2625) | unsafe fn gemm_strided_batched_bf16(

FILE: candle-core/src/cuda_backend/utils.rs
  type S (line 8) | pub type S = super::CudaStorageSlice;
  type Map1 (line 10) | pub trait Map1 {
    method f (line 11) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
    method map (line 18) | fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result<S> {
  type Map2 (line 38) | pub trait Map2 {
    method f (line 39) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
    method map (line 48) | fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice...
  type Map3 (line 66) | pub trait Map3 {
    method f (line 68) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
    method map (line 80) | fn map(
  type Map2InPlace (line 107) | pub trait Map2InPlace {
    method f (line 108) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
    method map (line 117) | fn map(
  type Map1Any (line 141) | pub trait Map1Any {
    method f (line 142) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits, W: Fn(CudaSlice<T>) ...
    method map (line 150) | fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result<S> {
  type Map2Any (line 170) | pub trait Map2Any {
    method f (line 171) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
    method map (line 180) | fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice...

FILE: candle-core/src/custom_op.rs
  type CustomOp1 (line 7) | pub trait CustomOp1 {
    method name (line 9) | fn name(&self) -> &'static str;
    method cpu_fwd (line 13) | fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(Cp...
    method cuda_fwd (line 17) | fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result...
    method metal_fwd (line 25) | fn metal_fwd(
    method bwd (line 38) | fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Res...
  type CustomOp2 (line 43) | pub trait CustomOp2 {
    method name (line 44) | fn name(&self) -> &'static str;
    method cpu_fwd (line 48) | fn cpu_fwd(
    method cuda_fwd (line 58) | fn cuda_fwd(
    method metal_fwd (line 72) | fn metal_fwd(
    method bwd (line 84) | fn bwd(
  type CustomOp3 (line 95) | pub trait CustomOp3 {
    method name (line 96) | fn name(&self) -> &'static str;
    method cpu_fwd (line 100) | fn cpu_fwd(
    method cuda_fwd (line 112) | fn cuda_fwd(
    method metal_fwd (line 128) | fn metal_fwd(
    method bwd (line 142) | fn bwd(
  method apply_op1_no_bwd (line 156) | pub fn apply_op1_no_bwd<C: CustomOp1>(&self, c: &C) -> Result<Self> {
  method apply_op2_no_bwd (line 162) | pub fn apply_op2_no_bwd<C: CustomOp2>(&self, rhs: &Self, c: &C) -> Resul...
  method apply_op3_no_bwd (line 170) | pub fn apply_op3_no_bwd<C: CustomOp3>(&self, t2: &Self, t3: &Self, c: &C...
  method apply_op1_arc (line 183) | pub fn apply_op1_arc(&self, c: Arc<Box<dyn CustomOp1 + Send + Sync>>) ->...
  method apply_op1 (line 191) | pub fn apply_op1<C: 'static + CustomOp1 + Send + Sync>(&self, c: C) -> R...
  method apply_op2_arc (line 196) | pub fn apply_op2_arc(
  method apply_op2 (line 211) | pub fn apply_op2<C: 'static + CustomOp2 + Send + Sync>(&self, r: &Self, ...
  method apply_op3_arc (line 216) | pub fn apply_op3_arc(
  method apply_op3 (line 236) | pub fn apply_op3<C: 'static + CustomOp3 + Send + Sync>(
  type InplaceOp1 (line 250) | pub trait InplaceOp1 {
    method name (line 252) | fn name(&self) -> &'static str;
    method cpu_fwd (line 256) | fn cpu_fwd(&self, storage: &mut CpuStorage, layout: &Layout) -> Result...
    method cuda_fwd (line 260) | fn cuda_fwd(&self, _storage: &mut CudaStorage, _layout: &Layout) -> Re...
    method metal_fwd (line 268) | fn metal_fwd(&self, _storage: &mut MetalStorage, _layout: &Layout) -> ...
    method name (line 421) | fn name(&self) -> &'static str {
    method cpu_fwd (line 425) | fn cpu_fwd(&self, _: &mut CpuStorage, _: &Layout) -> Result<()> {
    method metal_fwd (line 430) | fn metal_fwd(&self, sto: &mut MetalStorage, layout: &Layout) -> Result...
    method cuda_fwd (line 462) | fn cuda_fwd(&self, sto: &mut CudaStorage, layout: &Layout) -> Result<(...
  type InplaceOp2 (line 275) | pub trait InplaceOp2 {
    method name (line 276) | fn name(&self) -> &'static str;
    method cpu_fwd (line 280) | fn cpu_fwd(&self, s1: &mut CpuStorage, l1: &Layout, s2: &CpuStorage, l...
    method cuda_fwd (line 285) | fn cuda_fwd(&self, _: &mut CudaStorage, _: &Layout, _: &CudaStorage, _...
    method metal_fwd (line 293) | fn metal_fwd(
  type InplaceOp3 (line 306) | pub trait InplaceOp3 {
    method name (line 307) | fn name(&self) -> &'static str;
    method cpu_fwd (line 311) | fn cpu_fwd(
    method cuda_fwd (line 323) | fn cuda_fwd(
    method metal_fwd (line 339) | fn metal_fwd(
  method inplace_op1 (line 356) | pub fn inplace_op1<C: InplaceOp1>(&self, c: &C) -> Result<()> {
  method inplace_op2 (line 361) | pub fn inplace_op2<C: InplaceOp2>(&self, rhs: &Self, c: &C) -> Result<()> {
  method inplace_op3 (line 367) | pub fn inplace_op3<C: InplaceOp3>(&self, t2: &Self, t3: &Self, c: &C) ->...
  type UgIOp1 (line 380) | pub struct UgIOp1 {
    method new (line 392) | pub fn new(

FILE: candle-core/src/device.rs
  type DeviceLocation (line 8) | pub enum DeviceLocation {
  type Device (line 16) | pub enum Device {
    method new_cuda (line 234) | pub fn new_cuda(ordinal: usize) -> Result<Self> {
    method as_cuda_device (line 238) | pub fn as_cuda_device(&self) -> Result<&crate::CudaDevice> {
    method as_metal_device (line 246) | pub fn as_metal_device(&self) -> Result<&crate::MetalDevice> {
    method new_cuda_with_stream (line 254) | pub fn new_cuda_with_stream(ordinal: usize) -> Result<Self> {
    method new_metal (line 258) | pub fn new_metal(ordinal: usize) -> Result<Self> {
    method set_seed (line 262) | pub fn set_seed(&self, seed: u64) -> Result<()> {
    method get_current_seed (line 270) | pub fn get_current_seed(&self) -> Result<u64> {
    method same_device (line 278) | pub fn same_device(&self, rhs: &Self) -> bool {
    method location (line 287) | pub fn location(&self) -> DeviceLocation {
    method is_cpu (line 295) | pub fn is_cpu(&self) -> bool {
    method is_cuda (line 299) | pub fn is_cuda(&self) -> bool {
    method is_metal (line 303) | pub fn is_metal(&self) -> bool {
    method supports_bf16 (line 307) | pub fn supports_bf16(&self) -> bool {
    method bf16_default_to_f32 (line 315) | pub fn bf16_default_to_f32(&self) -> DType {
    method cuda_if_available (line 323) | pub fn cuda_if_available(ordinal: usize) -> Result<Self> {
    method metal_if_available (line 331) | pub fn metal_if_available(ordinal: usize) -> Result<Self> {
    method rand_uniform_f64 (line 339) | pub(crate) fn rand_uniform_f64(
    method rand_uniform (line 368) | pub(crate) fn rand_uniform<T: crate::FloatDType>(
    method rand_normal_f64 (line 377) | pub(crate) fn rand_normal_f64(
    method rand_normal (line 406) | pub(crate) fn rand_normal<T: crate::FloatDType>(
    method zeros (line 415) | pub(crate) fn zeros(&self, shape: &Shape, dtype: DType) -> Result<Stor...
    method alloc_uninit (line 432) | pub(crate) unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) ...
    method storage_from_slice (line 449) | pub(crate) fn storage_from_slice<D: WithDType>(&self, data: &[D]) -> R...
    method storage (line 463) | pub(crate) fn storage<A: NdArray>(&self, array: A) -> Result<Storage> {
    method storage_owned (line 479) | pub(crate) fn storage_owned<S: WithDType>(&self, data: Vec<S>) -> Resu...
    method synchronize (line 495) | pub fn synchronize(&self) -> Result<()> {
  type NdArray (line 22) | pub trait NdArray {
    method shape (line 23) | fn shape(&self) -> Result<Shape>;
    method to_cpu_storage (line 25) | fn to_cpu_storage(&self) -> CpuStorage;
    method shape (line 29) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 33) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 39) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 43) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 49) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 53) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 59) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 63) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 71) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 75) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 89) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 93) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 107) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 111) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 117) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 131) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 138) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 152) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 163) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 178) | fn to_cpu_storage(&self) -> CpuStorage {
    method shape (line 197) | fn shape(&self) -> Result<Shape> {
    method to_cpu_storage (line 212) | fn to_cpu_storage(&self) -> CpuStorage {

FILE: candle-core/src/display.rs
  method fmt_dt (line 9) | fn fmt_dt<T: WithDType + std::fmt::Display>(
  method fmt (line 55) | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
  type PrinterOptions (line 81) | pub struct PrinterOptions {
    method const_default (line 94) | const fn const_default() -> Self {
  function print_options (line 105) | pub fn print_options() -> &'static std::sync::Mutex<PrinterOptions> {
  function set_print_options (line 109) | pub fn set_print_options(options: PrinterOptions) {
  function set_print_options_default (line 113) | pub fn set_print_options_default() {
  function set_print_options_short (line 117) | pub fn set_print_options_short() {
  function set_print_options_full (line 127) | pub fn set_print_options_full() {
  function set_line_width (line 137) | pub fn set_line_width(line_width: usize) {
  function set_precision (line 141) | pub fn set_precision(precision: usize) {
  function set_edge_items (line 145) | pub fn set_edge_items(edge_items: usize) {
  function set_threshold (line 149) | pub fn set_threshold(threshold: usize) {
  function set_sci_mode (line 153) | pub fn set_sci_mode(sci_mode: Option<bool>) {
  type FmtSize (line 157) | struct FmtSize {
    method new (line 162) | fn new() -> Self {
    method final_size (line 166) | fn final_size(self) -> usize {
    method write_str (line 172) | fn write_str(&mut self, s: &str) -> std::fmt::Result {
  type TensorFormatter (line 178) | trait TensorFormatter {
    method fmt (line 181) | fn fmt<T: std::fmt::Write>(&self, v: Self::Elem, max_w: usize, f: &mut...
    method max_width (line 183) | fn max_width(&self, to_display: &Tensor) -> usize {
    method write_newline_indent (line 195) | fn write_newline_indent(i: usize, f: &mut std::fmt::Formatter) -> std:...
    method fmt_tensor (line 203) | fn fmt_tensor(
    type Elem (line 369) | type Elem = S;
    method fmt (line 371) | fn fmt<T: std::fmt::Write>(&self, v: Self::Elem, max_w: usize, f: &mut...
    type Elem (line 414) | type Elem = S;
    method fmt (line 416) | fn fmt<T: std::fmt::Write>(&self, v: Self::Elem, max_w: usize, f: &mut...
  type FloatFormatter (line 299) | struct FloatFormatter<S: WithDType> {
  function new (line 310) | fn new(t: &Tensor, po: &PrinterOptions) -> Result<Self> {
  type IntFormatter (line 398) | struct IntFormatter<S: WithDType> {
  function new (line 403) | fn new() -> Self {
  function get_summarized_data (line 421) | fn get_summarized_data(t: &Tensor, edge_items: usize) -> Result<Tensor> {
  method fmt (line 454) | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {

FILE: candle-core/src/dtype.rs
  type DType (line 8) | pub enum DType {
    type Err (line 51) | type Err = DTypeParseError;
    method from_str (line 52) | fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
    method as_str (line 75) | pub fn as_str(&self) -> &'static str {
    method size_in_bytes (line 95) | pub fn size_in_bytes(&self) -> usize {
    method is_int (line 114) | pub fn is_int(&self) -> bool {
    method is_float (line 129) | pub fn is_float(&self) -> bool {
  type DTypeParseError (line 40) | pub struct DTypeParseError(String);
    method fmt (line 43) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type WithDType (line 145) | pub trait WithDType:
    constant DTYPE (line 157) | const DTYPE: DType;
    method from_f64 (line 159) | fn from_f64(v: f64) -> Self;
    method to_f64 (line 160) | fn to_f64(self) -> f64;
    method to_scalar (line 161) | fn to_scalar(self) -> crate::scalar::Scalar;
    method cpu_storage_ref (line 162) | fn cpu_storage_ref(data: &[Self]) -> CpuStorageRef<'_>;
    method to_cpu_storage_owned (line 163) | fn to_cpu_storage_owned(data: Vec<Self>) -> CpuStorage;
    method to_cpu_storage (line 165) | fn to_cpu_storage(data: &[Self]) -> CpuStorage {
    method cpu_storage_as_slice (line 169) | fn cpu_storage_as_slice(s: &CpuStorage) -> Result<&[Self]>;
    method cpu_storage_data (line 170) | fn cpu_storage_data(s: CpuStorage) -> Result<Vec<Self>>;
  type IntDType (line 238) | pub trait IntDType: WithDType + num_traits::Bounded {
    method is_true (line 239) | fn is_true(&self) -> bool;
    method as_usize (line 240) | fn as_usize(&self) -> usize;
    method is_true (line 244) | fn is_true(&self) -> bool {
    method as_usize (line 247) | fn as_usize(&self) -> usize {
    method is_true (line 253) | fn is_true(&self) -> bool {
    method as_usize (line 256) | fn as_usize(&self) -> usize {
    method is_true (line 262) | fn is_true(&self) -> bool {
    method as_usize (line 265) | fn as_usize(&self) -> usize {
    method is_true (line 271) | fn is_true(&self) -> bool {
    method as_usize (line 274) | fn as_usize(&self) -> usize {
    method is_true (line 280) | fn is_true(&self) -> bool {
    method as_usize (line 283) | fn as_usize(&self) -> usize {
  type FloatDType (line 288) | pub trait FloatDType: WithDType {}

FILE: candle-core/src/dummy_cuda_backend.rs
  type CudaDevice (line 8) | pub struct CudaDevice;
    method new_with_stream (line 28) | pub fn new_with_stream(_: usize) -> Result<Self> {
    method id (line 31) | pub fn id(&self) -> DeviceId {
    type Storage (line 235) | type Storage = CudaStorage;
    method new (line 236) | fn new(_: usize) -> Result<Self> {
    method set_seed (line 240) | fn set_seed(&self, _: u64) -> Result<()> {
    method get_current_seed (line 244) | fn get_current_seed(&self) -> Result<u64> {
    method location (line 248) | fn location(&self) -> crate::DeviceLocation {
    method same_device (line 252) | fn same_device(&self, _: &Self) -> bool {
    method zeros_impl (line 256) | fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::St...
    method alloc_uninit (line 260) | unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result...
    method storage_from_slice (line 264) | fn storage_from_slice<T: crate::WithDType>(&self, _: &[T]) -> Result<S...
    method storage_from_cpu_storage (line 268) | fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Sto...
    method storage_from_cpu_storage_owned (line 272) | fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self...
    method rand_uniform (line 276) | fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<...
    method rand_normal (line 280) | fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<S...
    method synchronize (line 284) | fn synchronize(&self) -> Result<()> {
  type CudaStorage (line 11) | pub struct CudaStorage;
    method transfer_to_device (line 14) | pub fn transfer_to_device(&self, _dst: &CudaDevice) -> Result<Self> {
    type Device (line 37) | type Device = CudaDevice;
    method try_clone (line 39) | fn try_clone(&self, _: &Layout) -> Result<Self> {
    method dtype (line 43) | fn dtype(&self) -> DType {
    method device (line 47) | fn device(&self) -> &Self::Device {
    method const_set (line 51) | fn const_set(&mut self, _: crate::scalar::Scalar, _: &Layout) -> Resul...
    method to_cpu_storage (line 55) | fn to_cpu_storage(&self) -> Result<CpuStorage> {
    method affine (line 59) | fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
    method powf (line 63) | fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
    method elu (line 67) | fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
    method reduce_op (line 71) | fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Se...
    method cmp (line 75) | fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Se...
    method to_dtype (line 79) | fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self> {
    method unary_impl (line 83) | fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
    method binary_impl (line 87) | fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) ...
    method where_cond (line 91) | fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &L...
    method conv1d (line 95) | fn conv1d(
    method conv_transpose1d (line 105) | fn conv_transpose1d(
    method conv2d (line 115) | fn conv2d(
    method conv_transpose2d (line 125) | fn conv_transpose2d(
    method index_select (line 135) | fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> ...
    method gather (line 138) | fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result...
    method scatter_set (line 142) | fn scatter_set(
    method scatter_add_set (line 154) | fn scatter_add_set(
    method index_add (line 166) | fn index_add(
    method matmul (line 178) | fn matmul(
    method copy_strided_src (line 188) | fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Resu...
    method copy2d (line 192) | fn copy2d(
    method avg_pool2d (line 205) | fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize))...
    method max_pool2d (line 209) | fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize))...
    method upsample_nearest1d (line 213) | fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
    method upsample_nearest2d (line 217) | fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result...
    method upsample_bilinear2d (line 221) | fn upsample_bilinear2d(
  type DeviceId (line 25) | pub struct DeviceId(usize);
  function gemm_reduced_precision_f16 (line 291) | pub fn gemm_reduced_precision_f16() -> bool {
  function set_gemm_reduced_precision_f16 (line 297) | pub fn set_gemm_reduced_precision_f16(_: bool) {}
  function gemm_reduced_precision_bf16 (line 301) | pub fn gemm_reduced_precision_bf16() -> bool {
  function set_gemm_reduced_precision_bf16 (line 307) | pub fn set_gemm_reduced_precision_bf16(_: bool) {}
  function gemm_reduced_precision_f32 (line 311) | pub fn gemm_reduced_precision_f32() -> bool {
  function set_gemm_reduced_precision_f32 (line 317) | pub fn set_gemm_reduced_precision_f32(_b: bool) {}

FILE: candle-core/src/dummy_dtype.rs
  type F6E2M3 (line 11) | pub struct F6E2M3;
    method fmt (line 247) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type F6E3M2 (line 16) | pub struct F6E3M2;
    method fmt (line 253) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type F4 (line 21) | pub struct F4;
    method fmt (line 259) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type F8E8M0 (line 26) | pub struct F8E8M0;
    method fmt (line 265) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {

FILE: candle-core/src/dummy_metal_backend.rs
  type MetalDevice (line 6) | pub struct MetalDevice;
    type Storage (line 228) | type Storage = MetalStorage;
    method new (line 229) | fn new(_: usize) -> Result<Self> {
    method set_seed (line 233) | fn set_seed(&self, _: u64) -> Result<()> {
    method get_current_seed (line 237) | fn get_current_seed(&self) -> Result<u64> {
    method location (line 241) | fn location(&self) -> crate::DeviceLocation {
    method same_device (line 245) | fn same_device(&self, _: &Self) -> bool {
    method zeros_impl (line 249) | fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::St...
    method alloc_uninit (line 253) | unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result...
    method storage_from_slice (line 257) | fn storage_from_slice<T: crate::WithDType>(&self, _: &[T]) -> Result<S...
    method storage_from_cpu_storage (line 261) | fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Sto...
    method storage_from_cpu_storage_owned (line 265) | fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self...
    method rand_uniform (line 269) | fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<...
    method rand_normal (line 273) | fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<S...
    method synchronize (line 277) | fn synchronize(&self) -> Result<()> {
  type MetalStorage (line 9) | pub struct MetalStorage;
    type Device (line 30) | type Device = MetalDevice;
    method try_clone (line 32) | fn try_clone(&self, _: &Layout) -> Result<Self> {
    method dtype (line 36) | fn dtype(&self) -> DType {
    method device (line 40) | fn device(&self) -> &Self::Device {
    method const_set (line 44) | fn const_set(&mut self, _: crate::scalar::Scalar, _: &Layout) -> Resul...
    method to_cpu_storage (line 48) | fn to_cpu_storage(&self) -> Result<CpuStorage> {
    method affine (line 52) | fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
    method powf (line 56) | fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
    method elu (line 60) | fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
    method reduce_op (line 64) | fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Se...
    method cmp (line 68) | fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Se...
    method to_dtype (line 72) | fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self> {
    method unary_impl (line 76) | fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
    method binary_impl (line 80) | fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) ...
    method where_cond (line 84) | fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &L...
    method conv1d (line 88) | fn conv1d(
    method conv_transpose1d (line 98) | fn conv_transpose1d(
    method conv2d (line 108) | fn conv2d(
    method conv_transpose2d (line 118) | fn conv_transpose2d(
    method index_select (line 128) | fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> ...
    method gather (line 131) | fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result...
    method scatter_set (line 135) | fn scatter_set(
    method scatter_add_set (line 147) | fn scatter_add_set(
    method index_add (line 159) | fn index_add(
    method matmul (line 171) | fn matmul(
    method copy_strided_src (line 181) | fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Resu...
    method copy2d (line 185) | fn copy2d(
    method avg_pool2d (line 198) | fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize))...
    method max_pool2d (line 202) | fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize))...
    method upsample_nearest1d (line 206) | fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
    method upsample_nearest2d (line 210) | fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result...
    method upsample_bilinear2d (line 214) | fn upsample_bilinear2d(
  type MetalError (line 12) | pub enum MetalError {
    method from (line 18) | fn from(e: String) -> Self {

FILE: candle-core/src/error.rs
  type MatMulUnexpectedStriding (line 7) | pub struct MatMulUnexpectedStriding {
  type Error (line 22) | pub enum Error {
    method fmt (line 15) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    method wrap (line 251) | pub fn wrap(err: impl std::fmt::Display + Send + Sync + 'static) -> Se...
    method msg (line 255) | pub fn msg(err: impl std::fmt::Display) -> Self {
    method debug (line 259) | pub fn debug(err: impl std::fmt::Debug) -> Self {
    method bt (line 263) | pub fn bt(self) -> Self {
    method with_path (line 275) | pub fn with_path<P: AsRef<std::path::Path>>(self, p: P) -> Self {
    method context (line 282) | pub fn context(self, c: impl std::fmt::Display + Send + Sync + 'static...
  type Result (line 248) | pub type Result<T> = std::result::Result<T, Error>;
  function zip (line 303) | pub fn zip<T, U>(r1: Result<T>, r2: Result<U>) -> Result<(T, U)> {
  type Sealed (line 312) | pub trait Sealed {}
  type Context (line 321) | pub trait Context<T, E>: private::Sealed {
    method context (line 323) | fn context<C>(self, context: C) -> std::result::Result<T, Error>
    method with_context (line 329) | fn with_context<C, F>(self, f: F) -> std::result::Result<T, Error>
  function context (line 339) | fn context<C>(self, context: C) -> std::result::Result<T, Error>
  function with_context (line 355) | fn with_context<C, F>(self, context: F) -> std::result::Result<T, Error>
  function context (line 372) | fn context<C>(self, context: C) -> std::result::Result<T, Error>
  function with_context (line 384) | fn with_context<C, F>(self, context: F) -> std::result::Result<T, Error>

FILE: candle-core/src/indexer.rs
  method index (line 27) | fn index(&self, indexers: &[TensorIndexer]) -> Result<Self, Error> {
  type TensorIndexer (line 66) | pub enum TensorIndexer {
    method from (line 77) | fn from(index: usize) -> Self {
    method from (line 83) | fn from(index: &[u32]) -> Self {
    method from (line 92) | fn from(index: Vec<u32>) -> Self {
    method from (line 102) | fn from(tensor: &Tensor) -> Self {
    method from (line 116) | fn from(range: T) -> Self {
  type RB (line 107) | trait RB: RangeBounds<usize> {}
  type IndexOp (line 134) | pub trait IndexOp<T> {
    method i (line 137) | fn i(&self, index: T) -> Result<Tensor, Error>;
  method i (line 171) | fn i(&self, index: T) -> Result<Tensor, Error> {
  method i (line 207) | fn i(&self, (a,): (A,)) -> Result<Tensor, Error> {
  method i (line 233) | fn i(&self, (a, b): (A, B)) -> Result<Tensor, Error> {

FILE: candle-core/src/layout.rs
  type Layout (line 5) | pub struct Layout {
    method new (line 13) | pub fn new(shape: Shape, stride: Vec<usize>, start_offset: usize) -> S...
    method contiguous_with_offset (line 21) | pub fn contiguous_with_offset<S: Into<Shape>>(shape: S, start_offset: ...
    method contiguous (line 31) | pub fn contiguous<S: Into<Shape>>(shape: S) -> Self {
    method dims (line 35) | pub fn dims(&self) -> &[usize] {
    method dim (line 40) | pub fn dim<D: crate::shape::Dim>(&self, dim: D) -> Result<usize> {
    method shape (line 45) | pub fn shape(&self) -> &Shape {
    method stride (line 49) | pub fn stride(&self) -> &[usize] {
    method start_offset (line 53) | pub fn start_offset(&self) -> usize {
    method contiguous_offsets (line 59) | pub fn contiguous_offsets(&self) -> Option<(usize, usize)> {
    method is_contiguous (line 71) | pub fn is_contiguous(&self) -> bool {
    method is_fortran_contiguous (line 76) | pub fn is_fortran_contiguous(&self) -> bool {
    method narrow (line 80) | pub fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<S...
    method transpose (line 109) | pub fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
    method permute (line 130) | pub fn permute(&self, idxs: &[usize]) -> Result<Self> {
    method broadcast_as (line 155) | pub fn broadcast_as<S: Into<Shape>>(&self, shape: S) -> Result<Self> {
    method strided_index (line 190) | pub(crate) fn strided_index(&self) -> crate::StridedIndex<'_> {
    method strided_blocks (line 194) | pub(crate) fn strided_blocks(&self) -> crate::StridedBlocks<'_> {
    method offsets_b (line 224) | pub(crate) fn offsets_b(&self) -> Option<ContiguousOffsetsWithBroadcas...
  type ContiguousOffsetsWithBroadcast (line 273) | pub struct ContiguousOffsetsWithBroadcast {

FILE: candle-core/src/lib.rs
  type ToUsize2 (line 131) | pub trait ToUsize2 {
    method to_usize2 (line 132) | fn to_usize2(self) -> (usize, usize);
    method to_usize2 (line 136) | fn to_usize2(self) -> (usize, usize) {
    method to_usize2 (line 142) | fn to_usize2(self) -> (usize, usize) {
  type Module (line 148) | pub trait Module {
    method forward (line 149) | fn forward(&self, xs: &Tensor) -> Result<Tensor>;
    method forward (line 153) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
    method forward (line 159) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type ModuleT (line 169) | pub trait ModuleT {
    method forward_t (line 170) | fn forward_t(&self, xs: &Tensor, train: bool) -> Result<Tensor>;
    method forward_t (line 174) | fn forward_t(&self, xs: &Tensor, _train: bool) -> Result<Tensor> {

FILE: candle-core/src/metal_backend/device.rs
  type DeviceId (line 22) | pub struct DeviceId(usize);
    method new (line 25) | pub(crate) fn new() -> Self {
  type MetalDevice (line 34) | pub struct MetalDevice {
    method fmt (line 81) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    type Target (line 87) | type Target = Device;
    method deref (line 89) | fn deref(&self) -> &Self::Target {
    method compile (line 96) | pub fn compile(
    method id (line 118) | pub fn id(&self) -> DeviceId {
    method metal_device (line 122) | pub fn metal_device(&self) -> &Device {
    method drop_unused_buffers (line 126) | fn drop_unused_buffers(&self) -> Result<()> {
    method command_encoder (line 139) | pub fn command_encoder(&self) -> Result<ComputeCommandEncoder> {
    method blit_command_encoder (line 148) | pub fn blit_command_encoder(&self) -> Result<BlitCommandEncoder> {
    method wait_until_completed (line 157) | pub fn wait_until_completed(&self) -> Result<()> {
    method kernels (line 163) | pub fn kernels(&self) -> &Kernels {
    method device (line 167) | pub fn device(&self) -> &Device {
    method new_buffer (line 172) | pub fn new_buffer(
    method new_private_buffer (line 185) | pub fn new_private_buffer(
    method new_buffer_with_data (line 203) | pub fn new_buffer_with_data<T>(&self, data: &[T]) -> Result<Arc<Buffer...
    method allocate_zeros (line 218) | pub fn allocate_zeros(&self, size_in_bytes: usize) -> Result<Arc<Buffe...
    method allocate_buffer (line 228) | pub fn allocate_buffer(&self, size: usize) -> Result<Arc<Buffer>> {
    method capture (line 247) | pub fn capture<P: AsRef<Path>>(&self, path: P) -> Result<()> {
  constant RESOURCE_OPTIONS (line 69) | pub const RESOURCE_OPTIONS: MTLResourceOptions =
  constant PRIVATE_RESOURCE_OPTIONS (line 76) | pub const PRIVATE_RESOURCE_OPTIONS: MTLResourceOptions = MTLResourceOpti...
  constant PRIVATE_RESOURCE_OPTIONS (line 78) | pub const PRIVATE_RESOURCE_OPTIONS: MTLResourceOptions = MTLResourceOpti...
  function buf_size (line 269) | fn buf_size(size: usize) -> usize {
  function find_available_buffer (line 273) | fn find_available_buffer(size: usize, buffers: &BufferMap) -> Option<Arc...

FILE: candle-core/src/metal_backend/mod.rs
  function buffer_o (line 19) | pub fn buffer_o<'a>(buffer: &'a Buffer, l: &Layout, dtype: DType) -> Buf...
  type LockError (line 28) | pub enum LockError {
  type MetalError (line 52) | pub enum MetalError {
    method from (line 36) | fn from(value: TryLockError<T>) -> Self {
    method from (line 45) | fn from(p: PoisonError<T>) -> Self {
    method from (line 68) | fn from(e: String) -> Self {
  type MetalStorage (line 74) | pub struct MetalStorage {
    method new (line 1827) | pub fn new(buffer: Arc<Buffer>, device: MetalDevice, count: usize, dty...
    method buffer (line 1836) | pub fn buffer(&self) -> &Buffer {
    method binary (line 1840) | pub fn binary(
    method to_cpu (line 1910) | pub(crate) fn to_cpu<T: Clone>(&self) -> Result<Vec<T>> {
  type Device (line 86) | type Device = MetalDevice;
  method try_clone (line 88) | fn try_clone(&self, _: &Layout) -> Result<Self> {
  method dtype (line 92) | fn dtype(&self) -> DType {
  method device (line 96) | fn device(&self) -> &Self::Device {
  method to_cpu_storage (line 100) | fn to_cpu_storage(&self) -> Result<CpuStorage> {
  method affine (line 118) | fn affine(&self, layout: &Layout, mul: f64, add: f64) -> Result<Self> {
  method powf (line 179) | fn powf(&self, layout: &Layout, pow: f64) -> Result<Self> {
  method elu (line 232) | fn elu(&self, layout: &Layout, alpha: f64) -> Result<Self> {
  method reduce_op (line 285) | fn reduce_op(&self, op: ReduceOp, layout: &Layout, sum_dims: &[usize]) -...
  method cmp (line 425) | fn cmp(&self, op: CmpOp, rhs: &Self, lhs_l: &Layout, rhs_l: &Layout) -> ...
  method const_set (line 437) | fn const_set(&mut self, s: crate::scalar::Scalar, l: &Layout) -> Result<...
  method to_dtype (line 529) | fn to_dtype(&self, layout: &Layout, dtype: DType) -> Result<Self> {
  method unary_impl (line 647) | fn unary_impl<B: UnaryOpT>(&self, layout: &Layout) -> Result<Self> {
  method binary_impl (line 812) | fn binary_impl<B: BinaryOpT>(
  method where_cond (line 821) | fn where_cond(
  method conv1d (line 879) | fn conv1d(
  method conv_transpose1d (line 956) | fn conv_transpose1d(
  method conv2d (line 1071) | fn conv2d(
  method conv_transpose2d (line 1155) | fn conv_transpose2d(
  method avg_pool2d (line 1220) | fn avg_pool2d(
  method max_pool2d (line 1263) | fn max_pool2d(
  method upsample_nearest1d (line 1306) | fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
  method upsample_nearest2d (line 1310) | fn upsample_nearest2d(&self, inp_l: &Layout, out_w: usize, out_h: usize)...
  method upsample_bilinear2d (line 1350) | fn upsample_bilinear2d(
  method gather (line 1405) | fn gather(&self, src_l: &Layout, ids: &Self, ids_l: &Layout, dim: usize)...
  method scatter_set (line 1453) | fn scatter_set(
  method scatter_add_set (line 1503) | fn scatter_add_set(
  method index_select (line 1553) | fn index_select(&self, ids: &Self, src_l: &Layout, ids_l: &Layout, dim: ...
  method index_add (line 1612) | fn index_add(
  method matmul (line 1675) | fn matmul(
  method copy2d (line 1719) | fn copy2d(
  method copy_strided_src (line 1779) | fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &La...
  type Storage (line 1925) | type Storage = MetalStorage;
  method new (line 1927) | fn new(ordinal: usize) -> Result<Self> {
  method location (line 1952) | fn location(&self) -> crate::DeviceLocation {
  method same_device (line 1958) | fn same_device(&self, rhs: &Self) -> bool {
  method alloc_uninit (line 1962) | unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Met...
  method zeros_impl (line 1972) | fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<MetalStorage> {
  method storage_from_slice (line 1983) | fn storage_from_slice<T: crate::WithDType>(&self, s: &[T]) -> Result<Sel...
  method storage_from_cpu_storage (line 2005) | fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<Self:...
  method storage_from_cpu_storage_owned (line 2032) | fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result<...
  method rand_uniform (line 2036) | fn rand_uniform(
  method rand_normal (line 2073) | fn rand_normal(
  method set_seed (line 2110) | fn set_seed(&self, seed: u64) -> Result<()> {
  method get_current_seed (line 2123) | fn get_current_seed(&self) -> Result<u64> {
  method synchronize (line 2127) | fn synchronize(&self) -> Result<()> {
  function read_to_vec (line 2132) | fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {

FILE: candle-core/src/mkl.rs
  function vsTanh (line 7) | pub fn vsTanh(n: c_int, a: *const c_float, y: *mut c_float);
  function vdTanh (line 8) | pub fn vdTanh(n: c_int, a: *const c_double, y: *mut c_double);
  function vsExp (line 9) | pub fn vsExp(n: c_int, a: *const c_float, y: *mut c_float);
  function vdExp (line 10) | pub fn vdExp(n: c_int, a: *const c_double, y: *mut c_double);
  function vsLn (line 11) | pub fn vsLn(n: c_int, a: *const c_float, y: *mut c_float);
  function vdLn (line 12) | pub fn vdLn(n: c_int, a: *const c_double, y: *mut c_double);
  function vsSin (line 13) | pub fn vsSin(n: c_int, a: *const c_float, y: *mut c_float);
  function vdSin (line 14) | pub fn vdSin(n: c_int, a: *const c_double, y: *mut c_double);
  function vsCos (line 15) | pub fn vsCos(n: c_int, a: *const c_float, y: *mut c_float);
  function vdCos (line 16) | pub fn vdCos(n: c_int, a: *const c_double, y: *mut c_double);
  function vsSqrt (line 17) | pub fn vsSqrt(n: c_int, a: *const c_float, y: *mut c_float);
  function vdSqrt (line 18) | pub fn vdSqrt(n: c_int, a: *const c_double, y: *mut c_double);
  function vsAdd (line 20) | pub fn vsAdd(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_f...
  function vdAdd (line 21) | pub fn vdAdd(n: c_int, a: *const c_double, b: *const c_double, y: *mut c...
  function vsSub (line 22) | pub fn vsSub(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_f...
  function vdSub (line 23) | pub fn vdSub(n: c_int, a: *const c_double, b: *const c_double, y: *mut c...
  function vsMul (line 24) | pub fn vsMul(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_f...
  function vdMul (line 25) | pub fn vdMul(n: c_int, a: *const c_double, b: *const c_double, y: *mut c...
  function vsDiv (line 26) | pub fn vsDiv(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_f...
  function vdDiv (line 27) | pub fn vdDiv(n: c_int, a: *const c_double, b: *const c_double, y: *mut c...
  function vsFmax (line 28) | pub fn vsFmax(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_...
  function vdFmax (line 29) | pub fn vdFmax(n: c_int, a: *const c_double, b: *const c_double, y: *mut ...
  function vsFmin (line 30) | pub fn vsFmin(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_...
  function vdFmin (line 31) | pub fn vdFmin(n: c_int, a: *const c_double, b: *const c_double, y: *mut ...
  function sgemm_ (line 33) | pub fn sgemm_(
  function dgemm_ (line 48) | pub fn dgemm_(
  function hgemm_ (line 63) | pub fn hgemm_(
  function sgemm (line 83) | pub unsafe fn sgemm(
  function dgemm (line 117) | pub unsafe fn dgemm(
  function hgemm (line 151) | pub unsafe fn hgemm(
  function vs_exp (line 184) | pub fn vs_exp(a: &[f32], y: &mut [f32]) {
  function vd_exp (line 194) | pub fn vd_exp(a: &[f64], y: &mut [f64]) {
  function vs_ln (line 204) | pub fn vs_ln(a: &[f32], y: &mut [f32]) {
  function vd_ln (line 214) | pub fn vd_ln(a: &[f64], y: &mut [f64]) {
  function vs_sin (line 224) | pub fn vs_sin(a: &[f32], y: &mut [f32]) {
  function vd_sin (line 234) | pub fn vd_sin(a: &[f64], y: &mut [f64]) {
  function vs_cos (line 244) | pub fn vs_cos(a: &[f32], y: &mut [f32]) {
  function vd_cos (line 254) | pub fn vd_cos(a: &[f64], y: &mut [f64]) {
  function vs_sqrt (line 264) | pub fn vs_sqrt(a: &[f32], y: &mut [f32]) {
  function vd_sqrt (line 274) | pub fn vd_sqrt(a: &[f64], y: &mut [f64]) {
  function vs_sqr (line 284) | pub fn vs_sqr(a: &[f32], y: &mut [f32]) {
  function vd_sqr (line 294) | pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
  function vs_tanh (line 304) | pub fn vs_tanh(a: &[f32], y: &mut [f32]) {
  function vd_tanh (line 314) | pub fn vd_tanh(a: &[f64], y: &mut [f64]) {
  function vs_tanh_inplace (line 327) | pub fn vs_tanh_inplace(y: &mut [f32]) {
  function vd_tanh_inplace (line 332) | pub fn vd_tanh_inplace(y: &mut [f64]) {
  function vs_exp_inplace (line 337) | pub fn vs_exp_inplace(y: &mut [f32]) {
  function vd_exp_inplace (line 342) | pub fn vd_exp_inplace(y: &mut [f64]) {
  function vs_gelu (line 347) | pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
  function vd_gelu (line 358) | pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
  function vs_silu (line 369) | pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
  function vd_silu (line 380) | pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {

FILE: candle-core/src/npy.rs
  constant NPY_MAGIC_STRING (line 36) | const NPY_MAGIC_STRING: &[u8] = b"\x93NUMPY";
  constant NPY_SUFFIX (line 37) | const NPY_SUFFIX: &str = ".npy";
  function read_header (line 39) | fn read_header<R: Read>(reader: &mut R) -> Result<String> {
  type Header (line 64) | struct Header {
    method shape (line 71) | fn shape(&self) -> Shape {
    method to_string (line 75) | fn to_string(&self) -> Result<String> {
    method parse (line 109) | fn parse(header: &str) -> Result<Header> {
  method from_reader (line 207) | pub(crate) fn from_reader<R: std::io::Read>(
  method read_npy (line 273) | pub fn read_npy<T: AsRef<Path>>(path: T) -> Result<Self> {
  method read_npz (line 284) | pub fn read_npz<T: AsRef<Path>>(path: T) -> Result<Vec<(String, Self)>> {
  method read_npz_by_name (line 306) | pub fn read_npz_by_name<T: AsRef<Path>>(path: T, names: &[&str]) -> Resu...
  method write (line 329) | fn write<T: Write>(&self, f: &mut T) -> Result<()> {
  method write_npy (line 349) | pub fn write_npy<T: AsRef<Path>>(&self, path: T) -> Result<()> {
  method write_npz (line 355) | pub fn write_npz<S: AsRef<str>, T: AsRef<Tensor>, P: AsRef<Path>>(
  type NpzTensors (line 372) | pub struct NpzTensors {
    method new (line 380) | pub fn new<T: AsRef<Path>>(path: T) -> Result<Self> {
    method names (line 399) | pub fn names(&self) -> Vec<&String> {
    method get_shape_and_dtype (line 405) | pub fn get_shape_and_dtype(&self, name: &str) -> Result<(Shape, DType)> {
    method get (line 418) | pub fn get(&self, name: &str) -> Result<Option<Tensor>> {
  function parse (line 442) | fn parse() {

FILE: candle-core/src/op.rs
  type CmpOp (line 10) | pub enum CmpOp {
  type ReduceOp (line 20) | pub enum ReduceOp {
    method name (line 29) | pub(crate) fn name(&self) -> &'static str {
  type BinaryOp (line 42) | pub enum BinaryOp {
  type UnaryOp (line 53) | pub enum UnaryOp {
  type Op (line 76) | pub enum Op {
  type UnaryOpT (line 192) | pub trait UnaryOpT {
    constant NAME (line 193) | const NAME: &'static str;
    constant KERNEL (line 194) | const KERNEL: &'static str;
    constant V (line 195) | const V: Self;
    method bf16 (line 196) | fn bf16(v1: bf16) -> bf16;
    method f16 (line 197) | fn f16(v1: f16) -> f16;
    method f32 (line 198) | fn f32(v1: f32) -> f32;
    method f64 (line 199) | fn f64(v1: f64) -> f64;
    method u8 (line 200) | fn u8(v1: u8) -> u8;
    method u32 (line 201) | fn u32(v1: u32) -> u32;
    method i16 (line 202) | fn i16(v1: i16) -> i16;
    method i32 (line 203) | fn i32(v1: i32) -> i32;
    method i64 (line 204) | fn i64(v1: i64) -> i64;
    method f8e4m3 (line 205) | fn f8e4m3(v1: f8e4m3) -> f8e4m3;
    constant BF16_VEC (line 209) | const BF16_VEC: bool = false;
    method bf16_vec (line 210) | fn bf16_vec(_xs: &[bf16], _ys: &mut [bf16]) {}
    constant F16_VEC (line 211) | const F16_VEC: bool = false;
    method f16_vec (line 212) | fn f16_vec(_xs: &[f16], _ys: &mut [f16]) {}
    constant F32_VEC (line 213) | const F32_VEC: bool = false;
    method f32_vec (line 214) | fn f32_vec(_xs: &[f32], _ys: &mut [f32]) {}
    constant F64_VEC (line 215) | const F64_VEC: bool = false;
    method f64_vec (line 216) | fn f64_vec(_xs: &[f64], _ys: &mut [f64]) {}
    constant NAME (line 525) | const NAME: &'static str = "gelu";
    constant V (line 526) | const V: Self = Gelu;
    method bf16 (line 528) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 539) | fn f16(v: f16) -> f16 {
    method f32 (line 550) | fn f32(v: f32) -> f32 {
    method f64 (line 554) | fn f64(v: f64) -> f64 {
    method u8 (line 558) | fn u8(_: u8) -> u8 {
    method u32 (line 562) | fn u32(_: u32) -> u32 {
    method i16 (line 566) | fn i16(_: i16) -> i16 {
    method i32 (line 570) | fn i32(_: i32) -> i32 {
    method i64 (line 574) | fn i64(_: i64) -> i64 {
    method f8e4m3 (line 578) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant KERNEL (line 588) | const KERNEL: &'static str = "ugelu";
    constant F32_VEC (line 591) | const F32_VEC: bool = true;
    method f32_vec (line 595) | fn f32_vec(xs: &[f32], ys: &mut [f32]) {
    constant F64_VEC (line 600) | const F64_VEC: bool = true;
    method f64_vec (line 604) | fn f64_vec(xs: &[f64], ys: &mut [f64]) {
    constant F32_VEC (line 609) | const F32_VEC: bool = true;
    method f32_vec (line 613) | fn f32_vec(xs: &[f32], ys: &mut [f32]) {
    constant F64_VEC (line 618) | const F64_VEC: bool = true;
    method f64_vec (line 622) | fn f64_vec(xs: &[f64], ys: &mut [f64]) {
    constant NAME (line 630) | const NAME: &'static str = "erf";
    constant KERNEL (line 631) | const KERNEL: &'static str = "uerf";
    constant V (line 632) | const V: Self = Erf;
    method bf16 (line 634) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 638) | fn f16(v: f16) -> f16 {
    method f32 (line 642) | fn f32(v: f32) -> f32 {
    method f64 (line 646) | fn f64(v: f64) -> f64 {
    method u8 (line 650) | fn u8(_: u8) -> u8 {
    method u32 (line 654) | fn u32(_: u32) -> u32 {
    method i16 (line 658) | fn i16(_: i16) -> i16 {
    method i32 (line 662) | fn i32(_: i32) -> i32 {
    method i64 (line 666) | fn i64(_: i64) -> i64 {
    method f8e4m3 (line 670) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant NAME (line 677) | const NAME: &'static str = "silu";
    constant V (line 678) | const V: Self = Silu;
    method bf16 (line 680) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 684) | fn f16(v: f16) -> f16 {
    method f32 (line 688) | fn f32(v: f32) -> f32 {
    method f64 (line 692) | fn f64(v: f64) -> f64 {
    method u8 (line 696) | fn u8(_: u8) -> u8 {
    method u32 (line 700) | fn u32(_: u32) -> u32 {
    method i16 (line 704) | fn i16(_: i16) -> i16 {
    method i32 (line 708) | fn i32(_: i32) -> i32 {
    method i64 (line 712) | fn i64(_: i64) -> i64 {
    method f8e4m3 (line 716) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant KERNEL (line 719) | const KERNEL: &'static str = "usilu";
    constant F32_VEC (line 722) | const F32_VEC: bool = true;
    method f32_vec (line 726) | fn f32_vec(xs: &[f32], ys: &mut [f32]) {
    constant F64_VEC (line 731) | const F64_VEC: bool = true;
    method f64_vec (line 735) | fn f64_vec(xs: &[f64], ys: &mut [f64]) {
    constant F32_VEC (line 740) | const F32_VEC: bool = true;
    method f32_vec (line 744) | fn f32_vec(xs: &[f32], ys: &mut [f32]) {
    constant F64_VEC (line 749) | const F64_VEC: bool = true;
    method f64_vec (line 753) | fn f64_vec(xs: &[f64], ys: &mut [f64]) {
    constant NAME (line 759) | const NAME: &'static str = "abs";
    constant KERNEL (line 760) | const KERNEL: &'static str = "uabs";
    constant V (line 761) | const V: Self = Abs;
    method bf16 (line 763) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 767) | fn f16(v: f16) -> f16 {
    method f32 (line 771) | fn f32(v: f32) -> f32 {
    method f64 (line 775) | fn f64(v: f64) -> f64 {
    method u8 (line 779) | fn u8(v: u8) -> u8 {
    method u32 (line 783) | fn u32(v: u32) -> u32 {
    method i16 (line 787) | fn i16(v: i16) -> i16 {
    method i32 (line 791) | fn i32(v: i32) -> i32 {
    method i64 (line 795) | fn i64(v: i64) -> i64 {
    method f8e4m3 (line 799) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant NAME (line 805) | const NAME: &'static str = "ceil";
    constant KERNEL (line 806) | const KERNEL: &'static str = "uceil";
    constant V (line 807) | const V: Self = Ceil;
    method bf16 (line 809) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 813) | fn f16(v: f16) -> f16 {
    method f32 (line 817) | fn f32(v: f32) -> f32 {
    method f64 (line 821) | fn f64(v: f64) -> f64 {
    method u8 (line 825) | fn u8(v: u8) -> u8 {
    method u32 (line 829) | fn u32(v: u32) -> u32 {
    method i16 (line 833) | fn i16(v: i16) -> i16 {
    method i32 (line 837) | fn i32(v: i32) -> i32 {
    method i64 (line 841) | fn i64(v: i64) -> i64 {
    method f8e4m3 (line 845) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant NAME (line 851) | const NAME: &'static str = "floor";
    constant KERNEL (line 852) | const KERNEL: &'static str = "ufloor";
    constant V (line 853) | const V: Self = Floor;
    method bf16 (line 855) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 859) | fn f16(v: f16) -> f16 {
    method f32 (line 863) | fn f32(v: f32) -> f32 {
    method f64 (line 867) | fn f64(v: f64) -> f64 {
    method u8 (line 871) | fn u8(v: u8) -> u8 {
    method u32 (line 875) | fn u32(v: u32) -> u32 {
    method i16 (line 879) | fn i16(v: i16) -> i16 {
    method i32 (line 883) | fn i32(v: i32) -> i32 {
    method i64 (line 887) | fn i64(v: i64) -> i64 {
    method f8e4m3 (line 891) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant NAME (line 897) | const NAME: &'static str = "round";
    constant KERNEL (line 898) | const KERNEL: &'static str = "uround";
    constant V (line 899) | const V: Self = Round;
    method bf16 (line 901) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 905) | fn f16(v: f16) -> f16 {
    method f32 (line 909) | fn f32(v: f32) -> f32 {
    method f64 (line 913) | fn f64(v: f64) -> f64 {
    method u8 (line 917) | fn u8(v: u8) -> u8 {
    method u32 (line 921) | fn u32(v: u32) -> u32 {
    method i16 (line 925) | fn i16(v: i16) -> i16 {
    method i32 (line 929) | fn i32(v: i32) -> i32 {
    method i64 (line 933) | fn i64(v: i64) -> i64 {
    method f8e4m3 (line 937) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant NAME (line 943) | const NAME: &'static str = "gelu_erf";
    constant KERNEL (line 944) | const KERNEL: &'static str = "ugelu_erf";
    constant V (line 945) | const V: Self = GeluErf;
    method bf16 (line 947) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 951) | fn f16(v: f16) -> f16 {
    method f32 (line 955) | fn f32(v: f32) -> f32 {
    method f64 (line 959) | fn f64(v: f64) -> f64 {
    method u8 (line 963) | fn u8(_: u8) -> u8 {
    method u32 (line 967) | fn u32(_: u32) -> u32 {
    method i16 (line 971) | fn i16(_: i16) -> i16 {
    method i32 (line 975) | fn i32(_: i32) -> i32 {
    method i64 (line 979) | fn i64(_: i64) -> i64 {
    method f8e4m3 (line 983) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant NAME (line 989) | const NAME: &'static str = "relu";
    constant KERNEL (line 990) | const KERNEL: &'static str = "urelu";
    constant V (line 991) | const V: Self = Relu;
    method bf16 (line 993) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 997) | fn f16(v: f16) -> f16 {
    method f32 (line 1001) | fn f32(v: f32) -> f32 {
    method f64 (line 1005) | fn f64(v: f64) -> f64 {
    method u8 (line 1009) | fn u8(v: u8) -> u8 {
    method u32 (line 1013) | fn u32(v: u32) -> u32 {
    method i16 (line 1017) | fn i16(v: i16) -> i16 {
    method i32 (line 1021) | fn i32(v: i32) -> i32 {
    method i64 (line 1025) | fn i64(v: i64) -> i64 {
    method f8e4m3 (line 1029) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
    constant NAME (line 1099) | const NAME: &'static str = "sign";
    constant KERNEL (line 1100) | const KERNEL: &'static str = "usign";
    constant V (line 1101) | const V: Self = Sign;
    method bf16 (line 1103) | fn bf16(v: bf16) -> bf16 {
    method f16 (line 1107) | fn f16(v: f16) -> f16 {
    method f32 (line 1111) | fn f32(v: f32) -> f32 {
    method f64 (line 1115) | fn f64(v: f64) -> f64 {
    method u8 (line 1119) | fn u8(v: u8) -> u8 {
    method u32 (line 1123) | fn u32(v: u32) -> u32 {
    method i16 (line 1127) | fn i16(v: i16) -> i16 {
    method i32 (line 1131) | fn i32(v: i32) -> i32 {
    method i64 (line 1135) | fn i64(v: i64) -> i64 {
    method f8e4m3 (line 1139) | fn f8e4m3(v: f8e4m3) -> f8e4m3 {
  type BinaryOpT (line 219) | pub trait BinaryOpT {
    constant NAME (line 220) | const NAME: &'static str;
    constant KERNEL (line 221) | const KERNEL: &'static str;
    constant V (line 222) | const V: Self;
    method bf16 (line 223) | fn bf16(v1: bf16, v2: bf16) -> bf16;
    method f16 (line 224) | fn f16(v1: f16, v2: f16) -> f16;
    method f32 (line 225) | fn f32(v1: f32, v2: f32) -> f32;
    method f64 (line 226) | fn f64(v1: f64, v2: f64) -> f64;
    method u8 (line 227) | fn u8(v1: u8, v2: u8) -> u8;
    method u32 (line 228) | fn u32(v1: u32, v2: u32) -> u32;
    method i16 (line 229) | fn i16(v1: i16, v2: i16) -> i16;
    method i32 (line 230) | fn i32(v1: i32, v2: i32) -> i32;
    method i64 (line 231) | fn i64(v1: i64, v2: i64) -> i64;
    method f8e4m3 (line 232) | fn f8e4m3(v1: f8e4m3, v2: f8e4m3) -> f8e4m3;
    constant BF16_VEC (line 234) | const BF16_VEC: bool = false;
    method bf16_vec (line 235) | fn bf16_vec(_xs1: &[bf16], _xs2: &[bf16], _ys: &mut [bf16]) {}
    constant F16_VEC (line 236) | const F16_VEC: bool = false;
    method f16_vec (line 237) | fn f16_vec(_xs1: &[f16], _xs2: &[f16], _ys: &mut [f16]) {}
    constant F32_VEC (line 238) | const F32_VEC: bool = false;
    method f32_vec (line 239) | fn f32_vec(_xs1: &[f32], _xs2: &[f32], _ys: &mut [f32]) {}
    constant F64_VEC (line 240) | const F64_VEC: bool = false;
    method f64_vec (line 241) | fn f64_vec(_xs1: &[f64], _xs2: &[f64], _ys: &mut [f64]) {}
    constant U8_VEC (line 242) | const U8_VEC: bool = false;
    method u8_vec (line 243) | fn u8_vec(_xs1: &[u8], _xs2: &[u8], _ys: &mut [u8]) {}
    constant U32_VEC (line 244) | const U32_VEC: bool = false;
    method u32_vec (line 245) | fn u32_vec(_xs1: &[u32], _xs2: &[u32], _ys: &mut [u32]) {}
    constant I64_VEC (line 246) | const I64_VEC: bool = false;
    method i64_vec (line 247) | fn i64_vec(_xs1: &[i64], _xs2: &[i64], _ys: &mut [i64]) {}
  type Add (line 250) | pub struct Add;
  type Div (line 251) | pub struct Div;
  type Mul (line 252) | pub struct Mul;
  type Sub (line 253) | pub struct Sub;
  type Maximum (line 254) | pub struct Maximum;
  type Minimum (line 255) | pub struct Minimum;
  type Exp (line 256) | pub struct Exp;
  type Log (line 257) | pub struct Log;
  type Sin (line 258) | pub struct Sin;
  type Cos (line 259) | pub struct Cos;
  type Abs (line 260) | pub struct Abs;
  type Neg (line 261) | pub struct Neg;
  type Recip (line 262) | pub struct Recip;
  type Sqr (line 263) | pub struct Sqr;
  type Sqrt (line 264) | pub struct Sqrt;
  type Gelu (line 265) | pub struct Gelu;
  type GeluErf (line 266) | pub struct GeluErf;
  type Erf (line 267) | pub struct Erf;
  type Relu (line 268) | pub struct Relu;
  type Silu (line 269) | pub struct Silu;
  type Tanh (line 270) | pub struct Tanh;
  type Floor (line 271) | pub struct Floor;
  type Ceil (line 272) | pub struct Ceil;
  type Round (line 273) | pub struct Round;
  type Sign (line 274) | pub struct Sign;
  constant SQRT_TWO_OVER_PI_F32 (line 517) | const SQRT_TWO_OVER_PI_F32: f32 = 0.79788456080286535587989211986876373;
  constant SQRT_TWO_OVER_PI_F64 (line 519) | const SQRT_TWO_OVER_PI_F64: f64 = 0.79788456080286535587989211986876373;
  type BackpropOp (line 1037) | pub struct BackpropOp(Option<Op>);
    method none (line 1040) | pub fn none() -> Self {
    method new1 (line 1044) | pub(crate) fn new1(arg: &Tensor, f: impl Fn(Tensor) -> Op) -> Self {
    method new2 (line 1053) | pub(crate) fn new2(arg1: &Tensor, arg2: &Tensor, f: impl Fn(Tensor, Te...
    method new3 (line 1062) | pub(crate) fn new3(
    method new (line 1076) | pub(crate) fn new<A: AsRef<Tensor>>(args: &[A], f: impl Fn(Vec<Tensor>...
    method is_none (line 1086) | pub(crate) fn is_none(&self) -> bool {
    type Target (line 1092) | type Target = Option<Op>;
    method deref (line 1093) | fn deref(&self) -> &Self::Target {

FILE: candle-core/src/pickle.rs
  constant VERBOSE (line 9) | const VERBOSE: bool = false;
  type OpCode (line 14) | pub enum OpCode {
    type Error (line 53) | type Error = u8;
    method try_from (line 54) | fn try_from(value: u8) -> std::result::Result<Self, Self::Error> {
  function read_to_newline (line 94) | fn read_to_newline<R: BufRead>(r: &mut R) -> Result<Vec<u8>> {
  type Object (line 105) | pub enum Object {
    method unicode (line 134) | pub fn unicode(self) -> OResult<String> {
    method reduce (line 141) | pub fn reduce(self) -> OResult<(Self, Self)> {
    method none (line 148) | pub fn none(self) -> OResult<()> {
    method persistent_load (line 155) | pub fn persistent_load(self) -> OResult<Self> {
    method bool (line 162) | pub fn bool(self) -> OResult<bool> {
    method int (line 169) | pub fn int(self) -> OResult<i32> {
    method int_or_long (line 176) | pub fn int_or_long(self) -> OResult<i64> {
    method tuple (line 184) | pub fn tuple(self) -> OResult<Vec<Self>> {
    method dict (line 191) | pub fn dict(self) -> OResult<Vec<(Self, Self)>> {
    method class (line 198) | pub fn class(self) -> OResult<(String, String)> {
    method into_tensor_info (line 208) | pub fn into_tensor_info(
  type OResult (line 131) | type OResult<T> = std::result::Result<T, Object>;
  type Error (line 259) | type Error = Object;
  method try_from (line 260) | fn try_from(value: Object) -> std::result::Result<Self, Self::Error> {
  type Error (line 269) | type Error = Object;
  function try_from (line 270) | fn try_from(value: Object) -> std::result::Result<Self, Self::Error> {
  type Error (line 279) | type Error = Object;
  function try_from (line 280) | fn try_from(value: Object) -> std::result::Result<Self, Self::Error> {
  type Stack (line 296) | pub struct Stack {
    method empty (line 302) | pub fn empty() -> Self {
    method stack (line 309) | pub fn stack(&self) -> &[Object] {
    method read_loop (line 313) | pub fn read_loop<R: BufRead>(&mut self, r: &mut R) -> Result<()> {
    method finalize (line 322) | pub fn finalize(mut self) -> Result<Object> {
    method push (line 326) | fn push(&mut self, obj: Object) {
    method pop (line 330) | fn pop(&mut self) -> Result<Object> {
    method build (line 338) | fn build(&mut self) -> Result<()> {
    method reduce (line 355) | fn reduce(&mut self) -> Result<()> {
    method last (line 383) | fn last(&mut self) -> Result<&mut Object> {
    method memo_get (line 390) | fn memo_get(&self, id: u32) -> Result<Object> {
    method memo_put (line 400) | fn memo_put(&mut self, id: u32) -> Result<()> {
    method persistent_load (line 406) | fn persistent_load(&self, id: Object) -> Result<Object> {
    method new_obj (line 410) | fn new_obj(&self, class: Object, args: Object) -> Result<Object> {
    method pop_to_marker (line 417) | fn pop_to_marker(&mut self) -> Result<Vec<Object>> {
    method read (line 437) | pub fn read<R: BufRead>(&mut self, r: &mut R) -> Result<bool> {
  method from (line 619) | fn from(value: Object) -> Self {
  function rebuild_args (line 626) | fn rebuild_args(args: Object) -> Result<(Layout, DType, String, usize)> {
  type TensorInfo (line 656) | pub struct TensorInfo {
  function read_pth_tensor_info (line 670) | pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
  type PthTensors (line 743) | pub struct PthTensors {
    method new (line 751) | pub fn new<P: AsRef<std::path::Path>>(path: P, key: Option<&str>) -> R...
    method tensor_infos (line 761) | pub fn tensor_infos(&self) -> &HashMap<String, TensorInfo> {
    method get (line 765) | pub fn get(&self, name: &str) -> Result<Option<Tensor>> {
  function read_all_with_key (line 820) | pub fn read_all_with_key<P: AsRef<std::path::Path>>(
  function read_all (line 839) | pub fn read_all<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<(Strin...

FILE: candle-core/src/quantized/avx.rs
  function sum_i16_pairs_float (line 13) | pub(crate) unsafe fn sum_i16_pairs_float(x: __m256i) -> __m256 {
  function mul_sum_us8_pairs_float (line 20) | pub(crate) unsafe fn mul_sum_us8_pairs_float(ax: __m256i, sy: __m256i) -...
  function hsum_float_8 (line 26) | pub(crate) unsafe fn hsum_float_8(x: __m256) -> f32 {
  function bytes_from_nibbles_32 (line 35) | pub(crate) unsafe fn bytes_from_nibbles_32(rsi: *const u8) -> __m256i {
  function mul_sum_i8_pairs_float (line 43) | pub(crate) unsafe fn mul_sum_i8_pairs_float(x: __m256i, y: __m256i) -> _...
  function vec_dot_q4_0_q8_0 (line 50) | pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ...
  function vec_dot_q8_0_q8_0 (line 71) | pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ...
  function get_scale_shuffle (line 90) | unsafe fn get_scale_shuffle(i: usize) -> __m128i {
  function get_scale_shuffle_k4 (line 102) | unsafe fn get_scale_shuffle_k4(i: usize) -> __m256i {
  function get_scale_shuffle_q3k (line 119) | unsafe fn get_scale_shuffle_q3k(i: usize) -> __m256i {
  function vec_dot_q6k_q8k (line 131) | pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]...
  function mm256_set_m128i (line 219) | unsafe fn mm256_set_m128i(a: __m128i, b: __m128i) -> __m256i {
  function vec_dot_q2k_q8k (line 224) | pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]...
  function vec_dot_q3k_q8k (line 307) | pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]...
  function vec_dot_q4k_q8k (line 443) | pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]...
  function vec_dot_q5k_q8k (line 528) | pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]...
  function vec_dot_q8k_q8k (line 641) | pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]...

FILE: candle-core/src/quantized/cuda.rs
  type PaddedCudaSlice (line 10) | struct PaddedCudaSlice {
  type QCudaStorage (line 16) | pub struct QCudaStorage {
    method indexed_moe_forward (line 492) | pub fn indexed_moe_forward(
    method zeros (line 529) | pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -...
    method dtype (line 544) | pub fn dtype(&self) -> GgmlDType {
    method device (line 548) | pub fn device(&self) -> &CudaDevice {
    method dequantize (line 552) | pub fn dequantize(&self, elem_count: usize) -> Result<CudaStorage> {
    method dequantize_f16 (line 605) | pub fn dequantize_f16(&self, elem_count: usize) -> Result<CudaStorage> {
    method quantize (line 609) | pub fn quantize(&mut self, src: &CudaStorage) -> Result<()> {
    method quantize_imatrix (line 632) | pub fn quantize_imatrix(
    method quantize_imatrix_onto (line 660) | pub fn quantize_imatrix_onto(
    method quantize_onto (line 689) | pub fn quantize_onto(&mut self, src: &crate::CpuStorage) -> Result<()> {
    method storage_size_in_bytes (line 713) | pub fn storage_size_in_bytes(&self) -> usize {
    method fwd (line 717) | pub fn fwd(
    method data (line 740) | pub fn data(&self) -> Result<Vec<u8>> {
    method device_ptr (line 747) | pub fn device_ptr(&self) -> Result<*const u8> {
    method dequantize_matmul_vec (line 754) | fn dequantize_matmul_vec(
    method dequantize_matmul (line 794) | fn dequantize_matmul(
  function set_force_dmmv (line 24) | pub fn set_force_dmmv(f: bool) {
  constant WARP_SIZE (line 28) | pub const WARP_SIZE: usize = 32;
  constant MMQ_X_Q4_0_AMPERE (line 29) | pub const MMQ_X_Q4_0_AMPERE: usize = 4;
  constant MMQ_Y_Q4_0_AMPERE (line 30) | pub const MMQ_Y_Q4_0_AMPERE: usize = 32;
  constant NWARPS_Q4_0_AMPERE (line 31) | pub const NWARPS_Q4_0_AMPERE: usize = 4;
  constant GGML_CUDA_MMV_X (line 32) | pub const GGML_CUDA_MMV_X: usize = 32;
  constant GGML_CUDA_MMV_Y (line 33) | pub const GGML_CUDA_MMV_Y: usize = 1;
  constant CUDA_QUANTIZE_BLOCK_SIZE (line 34) | pub const CUDA_QUANTIZE_BLOCK_SIZE: usize = 256;
  constant CUDA_DEQUANTIZE_BLOCK_SIZE (line 35) | pub const CUDA_DEQUANTIZE_BLOCK_SIZE: usize = 256;
  constant MATRIX_ROW_PADDING (line 36) | pub const MATRIX_ROW_PADDING: usize = 512;
  function ceil_div (line 38) | fn ceil_div(p: usize, q: usize) -> usize {
  function pad (line 42) | fn pad(p: usize, q: usize) -> usize {
  function quantize_q8_1 (line 46) | fn quantize_q8_1(
  function dequantize_f32 (line 103) | fn dequantize_f32(
  function dequantize_f16 (line 163) | fn dequantize_f16(
  function dequantize_mul_mat_vec (line 223) | fn dequantize_mul_mat_vec(
  function mul_mat_vec_via_q8_1 (line 269) | fn mul_mat_vec_via_q8_1(
  function mul_mat_via_q8_1 (line 340) | fn mul_mat_via_q8_1(
  function indexed_moe_forward_fused_q8_1_input (line 410) | fn indexed_moe_forward_fused_q8_1_input(
  function load_quantized (line 842) | pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
  function cuda_quantize_q8_1 (line 868) | fn cuda_quantize_q8_1() -> Result<()> {
  function cuda_mmv_q8_1 (line 882) | fn cuda_mmv_q8_1() -> Result<()> {
  function cuda_mm_q8_1 (line 921) | fn cuda_mm_q8_1() -> Result<()> {
  function cuda_mm_q8_1_pad (line 962) | fn cuda_mm_q8_1_pad() -> Result<()> {

FILE: candle-core/src/quantized/dummy_cuda.rs
  type QCudaStorage (line 5) | pub struct QCudaStorage {
    method zeros (line 11) | pub fn zeros(_: &CudaDevice, _: usize, _: GgmlDType) -> Result<Self> {
    method dtype (line 15) | pub fn dtype(&self) -> GgmlDType {
    method device (line 19) | pub fn device(&self) -> &CudaDevice {
    method dequantize (line 23) | pub fn dequantize(&self, _elem_count: usize) -> Result<CudaStorage> {
    method dequantize_f16 (line 27) | pub fn dequantize_f16(&self, _elem_count: usize) -> Result<CudaStorage> {
    method quantize (line 31) | pub fn quantize(&mut self, _src: &CudaStorage) -> Result<()> {
    method quantize_imatrix (line 35) | pub fn quantize_imatrix(
    method quantize_imatrix_onto (line 44) | pub fn quantize_imatrix_onto(
    method quantize_onto (line 53) | pub fn quantize_onto(&mut self, _src: &crate::CpuStorage) -> Result<()> {
    method device_ptr (line 57) | pub fn device_ptr(&self) -> Result<*const u8> {
    method storage_size_in_bytes (line 61) | pub fn storage_size_in_bytes(&self) -> usize {
    method fwd (line 65) | pub fn fwd(
    method data (line 74) | pub fn data(&self) -> Result<Vec<u8>> {
    method indexed_moe_forward (line 78) | pub fn indexed_moe_forward(
  function load_quantized (line 90) | pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(

FILE: candle-core/src/quantized/dummy_metal.rs
  type QMetalStorage (line 5) | pub struct QMetalStorage {
    method zeros (line 11) | pub fn zeros(_: &MetalDevice, _: usize, _: GgmlDType) -> Result<Self> {
    method dtype (line 15) | pub fn dtype(&self) -> GgmlDType {
    method device (line 19) | pub fn device(&self) -> &MetalDevice {
    method dequantize (line 23) | pub fn dequantize(&self, _elem_count: usize) -> Result<MetalStorage> {
    method quantize (line 27) | pub fn quantize(&mut self, _src: &MetalStorage) -> Result<()> {
    method quantize_imatrix (line 31) | pub fn quantize_imatrix(
    method quantize_imatrix_onto (line 40) | pub fn quantize_imatrix_onto(
    method quantize_onto (line 49) | pub fn quantize_onto(&mut self, _src: &crate::CpuStorage) -> Result<()> {
    method storage_size_in_bytes (line 53) | pub fn storage_size_in_bytes(&self) -> usize {
    method fwd (line 57) | pub fn fwd(
    method data (line 66) | pub fn data(&self) -> Result<Vec<u8>> {
    method indexed_moe_forward (line 70) | pub fn indexed_moe_forward(
  function load_quantized (line 82) | pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(

FILE: candle-core/src/quantized/ggml_file.rs
  type Magic (line 10) | enum Magic {
    type Error (line 19) | type Error = crate::Error;
    method try_from (line 20) | fn try_from(value: u32) -> Result<Self> {
  type VersionedMagic (line 34) | pub enum VersionedMagic {
    method read (line 43) | fn read<R: std::io::Read>(reader: &mut R) -> Result<Self> {
    method align32 (line 60) | fn align32(&self) -> bool {
  type HParams (line 69) | pub struct HParams {
    method read (line 80) | fn read<R: std::io::Read>(reader: &mut R) -> Result<Self> {
  type Vocab (line 101) | pub struct Vocab {
    method read (line 106) | fn read<R: std::io::Read>(reader: &mut R, n_vocab: usize) -> Result<Se...
  function from_raw_data (line 120) | fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
  function qtensor_from_ggml (line 138) | pub fn qtensor_from_ggml(
  function read_one_tensor (line 191) | fn read_one_tensor<R: std::io::Seek + std::io::Read>(
  type Content (line 225) | pub struct Content {
    method read (line 234) | pub fn read<R: std::io::Seek + std::io::Read>(
    method remove (line 260) | pub fn remove(&mut self, name: &str) -> Result<super::QTensor> {

FILE: candle-core/src/quantized/gguf_file.rs
  constant DEFAULT_ALIGNMENT (line 10) | pub const DEFAULT_ALIGNMENT: u64 = 32;
  type Magic (line 13) | enum Magic {
    type Error (line 18) | type Error = crate::Error;
    method try_from (line 19) | fn try_from(value: u32) -> Result<Self> {
  type VersionedMagic (line 29) | pub enum VersionedMagic {
    method read (line 36) | fn read<R: std::io::Read>(reader: &mut R) -> Result<Self> {
  type TensorInfo (line 51) | pub struct TensorInfo {
    method read (line 58) | pub fn read<R: std::io::Seek + std::io::Read>(
  type Content (line 85) | pub struct Content {
    method read (line 396) | pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Resul...
    method tensor (line 470) | pub fn tensor<R: std::io::Seek + std::io::Read>(
  function read_string (line 92) | fn read_string<R: std::io::Read>(reader: &mut R, magic: &VersionedMagic)...
  type ValueType (line 110) | pub enum ValueType {
    method from_u32 (line 356) | fn from_u32(v: u32) -> Result<Self> {
    method to_u32 (line 376) | fn to_u32(self) -> u32 {
  type Value (line 143) | pub enum Value {
    method value_type (line 160) | pub fn value_type(&self) -> ValueType {
    method to_u8 (line 178) | pub fn to_u8(&self) -> Result<u8> {
    method to_i8 (line 185) | pub fn to_i8(&self) -> Result<i8> {
    method to_u16 (line 192) | pub fn to_u16(&self) -> Result<u16> {
    method to_i16 (line 199) | pub fn to_i16(&self) -> Result<i16> {
    method to_u32 (line 206) | pub fn to_u32(&self) -> Result<u32> {
    method to_i32 (line 213) | pub fn to_i32(&self) -> Result<i32> {
    method to_u64 (line 221) | pub fn to_u64(&self) -> Result<u64> {
    method to_i64 (line 233) | pub fn to_i64(&self) -> Result<i64> {
    method to_f32 (line 240) | pub fn to_f32(&self) -> Result<f32> {
    method to_f64 (line 247) | pub fn to_f64(&self) -> Result<f64> {
    method to_bool (line 254) | pub fn to_bool(&self) -> Result<bool> {
    method to_vec (line 261) | pub fn to_vec(&self) -> Result<&Vec<Value>> {
    method to_string (line 268) | pub fn to_string(&self) -> Result<&String> {
    method read (line 275) | fn read<R: std::io::Read>(
    method write (line 316) | fn write<W: std::io::Write>(&self, w: &mut W) -> Result<()> {
  function write_string (line 484) | fn write_string<W: std::io::Write>(w: &mut W, str: &str) -> Result<()> {
  function write (line 491) | pub fn write<W: std::io::Seek + std::io::Write>(

FILE: candle-core/src/quantized/imatrix_file.rs
  function load_imatrix (line 10) | pub fn load_imatrix<P: AsRef<Path>>(fname: P) -> Result<HashMap<String, ...

FILE: candle-core/src/quantized/k_quants.rs
  constant QK_K (line 13) | pub const QK_K: usize = 256;
  constant K_SCALE_SIZE (line 14) | pub const K_SCALE_SIZE: usize = 12;
  constant QK4_0 (line 16) | pub const QK4_0: usize = 32;
  constant QK4_1 (line 17) | pub const QK4_1: usize = 32;
  constant QK5_0 (line 18) | pub const QK5_0: usize = 32;
  constant QK5_1 (line 19) | pub const QK5_1: usize = 32;
  constant QK8_0 (line 20) | pub const QK8_0: usize = 32;
  constant QK8_1 (line 21) | pub const QK8_1: usize = 32;
  type GgmlType (line 23) | pub trait GgmlType: Sized + Clone + Send + Sync {
    constant DTYPE (line 24) | const DTYPE: GgmlDType;
    constant BLCK_SIZE (line 25) | const BLCK_SIZE: usize;
    constant DIRECT_COPY (line 26) | const DIRECT_COPY: bool = false;
    method zeros (line 30) | fn zeros() -> Self {
    method to_float (line 33) | fn to_float(xs: &[Self], ys: &mut [f32]);
    method from_float (line 34) | fn from_float(xs: &[f32], ys: &mut [Self]);
    method from_float_imatrix (line 35) | fn from_float_imatrix(
    method direct_copy (line 47) | fn direct_copy(_xs: &[f32], _ys: &mut [Self]) {}
    method vec_dot (line 51) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32;
    method vec_dot_unopt (line 54) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32;
    constant DTYPE (line 173) | const DTYPE: GgmlDType = GgmlDType::Q4_0;
    constant BLCK_SIZE (line 174) | const BLCK_SIZE: usize = QK4_0;
    type VecDotType (line 175) | type VecDotType = BlockQ8_0;
    method to_float (line 178) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    method from_float (line 200) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method vec_dot (line 240) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 253) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    constant DTYPE (line 274) | const DTYPE: GgmlDType = GgmlDType::Q4_1;
    constant BLCK_SIZE (line 275) | const BLCK_SIZE: usize = QK4_1;
    type VecDotType (line 276) | type VecDotType = BlockQ8_1;
    method vec_dot (line 278) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 282) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 312) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method to_float (line 351) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 375) | const DTYPE: GgmlDType = GgmlDType::Q5_0;
    constant BLCK_SIZE (line 376) | const BLCK_SIZE: usize = QK5_0;
    type VecDotType (line 377) | type VecDotType = BlockQ8_0;
    method vec_dot (line 379) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 393) | fn vec_dot_unopt(_n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f...
    method from_float (line 416) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method to_float (line 455) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 481) | const DTYPE: GgmlDType = GgmlDType::Q5_1;
    constant BLCK_SIZE (line 482) | const BLCK_SIZE: usize = QK5_1;
    type VecDotType (line 483) | type VecDotType = BlockQ8_1;
    method vec_dot (line 485) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 489) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 523) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method to_float (line 566) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 594) | const DTYPE: GgmlDType = GgmlDType::Q8_0;
    constant BLCK_SIZE (line 595) | const BLCK_SIZE: usize = QK8_0;
    type VecDotType (line 596) | type VecDotType = BlockQ8_0;
    method to_float (line 599) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    method from_float (line 617) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method vec_dot (line 649) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 662) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    constant DTYPE (line 684) | const DTYPE: GgmlDType = GgmlDType::Q8_1;
    constant BLCK_SIZE (line 685) | const BLCK_SIZE: usize = QK8_1;
    type VecDotType (line 686) | type VecDotType = BlockQ8_1;
    method vec_dot (line 688) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 692) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 712) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method to_float (line 743) | fn to_float(_xs: &[Self], _ys: &mut [f32]) {
    constant DTYPE (line 749) | const DTYPE: GgmlDType = GgmlDType::Q2K;
    constant BLCK_SIZE (line 750) | const BLCK_SIZE: usize = QK_K;
    type VecDotType (line 751) | type VecDotType = BlockQ8K;
    method vec_dot (line 754) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 767) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 820) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method from_float_imatrix (line 884) | fn from_float_imatrix(xs: &[f32], ys: &mut [Self], imatrix_weights: &[...
    method to_float (line 942) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 982) | const DTYPE: GgmlDType = GgmlDType::Q3K;
    constant BLCK_SIZE (line 983) | const BLCK_SIZE: usize = QK_K;
    type VecDotType (line 984) | type VecDotType = BlockQ8K;
    method vec_dot (line 987) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 997) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 1119) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method from_float_imatrix (line 1200) | fn from_float_imatrix(xs: &[f32], ys: &mut [Self], imatrix_weights: &[...
    method to_float (line 1298) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 1354) | const DTYPE: GgmlDType = GgmlDType::Q4K;
    constant BLCK_SIZE (line 1355) | const BLCK_SIZE: usize = QK_K;
    type VecDotType (line 1356) | type VecDotType = BlockQ8K;
    method vec_dot (line 1359) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 1372) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 1454) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method from_float_imatrix (line 1514) | fn from_float_imatrix(xs: &[f32], ys: &mut [Self], imatrix_weights: &[...
    method to_float (line 1579) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 1611) | const DTYPE: GgmlDType = GgmlDType::Q5K;
    constant BLCK_SIZE (line 1612) | const BLCK_SIZE: usize = QK_K;
    type VecDotType (line 1613) | type VecDotType = BlockQ8K;
    method vec_dot (line 1616) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 1626) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 1716) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method from_float_imatrix (line 1791) | fn from_float_imatrix(xs: &[f32], ys: &mut [Self], imatrix_weights: &[...
    method to_float (line 1874) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 1912) | const DTYPE: GgmlDType = GgmlDType::Q6K;
    constant BLCK_SIZE (line 1913) | const BLCK_SIZE: usize = QK_K;
    type VecDotType (line 1914) | type VecDotType = BlockQ8K;
    method vec_dot (line 1917) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 1930) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 1989) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method from_float_imatrix (line 2061) | fn from_float_imatrix(xs: &[f32], ys: &mut [Self], imatrix_weights: &[...
    method to_float (line 2142) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 2177) | const DTYPE: GgmlDType = GgmlDType::Q8K;
    constant BLCK_SIZE (line 2178) | const BLCK_SIZE: usize = QK_K;
    type VecDotType (line 2179) | type VecDotType = BlockQ8K;
    method vec_dot (line 2182) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 2195) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 2214) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method to_float (line 2253) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    constant DTYPE (line 2353) | const DTYPE: GgmlDType = GgmlDType::F32;
    constant BLCK_SIZE (line 2354) | const BLCK_SIZE: usize = 1;
    constant DIRECT_COPY (line 2355) | const DIRECT_COPY: bool = true;
    type VecDotType (line 2356) | type VecDotType = f32;
    method vec_dot (line 2358) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 2362) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 2370) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method to_float (line 2381) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    method direct_copy (line 2392) | fn direct_copy(xs: &[f32], ys: &mut [Self]) {
    constant DTYPE (line 2398) | const DTYPE: GgmlDType = GgmlDType::F16;
    constant BLCK_SIZE (line 2399) | const BLCK_SIZE: usize = 1;
    constant DIRECT_COPY (line 2400) | const DIRECT_COPY: bool = true;
    type VecDotType (line 2401) | type VecDotType = f16;
    method vec_dot (line 2403) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 2407) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 2415) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method to_float (line 2426) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    method direct_copy (line 2437) | fn direct_copy(xs: &[f32], ys: &mut [Self]) {
    constant DTYPE (line 2443) | const DTYPE: GgmlDType = GgmlDType::BF16;
    constant BLCK_SIZE (line 2444) | const BLCK_SIZE: usize = 1;
    constant DIRECT_COPY (line 2445) | const DIRECT_COPY: bool = true;
    type VecDotType (line 2446) | type VecDotType = bf16;
    method vec_dot (line 2448) | fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method vec_dot_unopt (line 2452) | fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> f32 {
    method from_float (line 2460) | fn from_float(xs: &[f32], ys: &mut [Self]) {
    method to_float (line 2471) | fn to_float(xs: &[Self], ys: &mut [f32]) {
    method direct_copy (line 2482) | fn direct_copy(xs: &[f32], ys: &mut [Self]) {
  type BlockQ4_0 (line 59) | pub struct BlockQ4_0 {
  constant _ (line 63) | const _: () = assert!(std::mem::size_of::<BlockQ4_0>() == 18);
  type BlockQ4_1 (line 67) | pub struct BlockQ4_1 {
  constant _ (line 72) | const _: () = assert!(std::mem::size_of::<BlockQ4_1>() == 20);
  type BlockQ5_0 (line 76) | pub struct BlockQ5_0 {
  constant _ (line 81) | const _: () = assert!(std::mem::size_of::<BlockQ5_0>() == 22);
  type BlockQ5_1 (line 85) | pub struct BlockQ5_1 {
  constant _ (line 91) | const _: () = assert!(std::mem::size_of::<BlockQ5_1>() == 24);
  type BlockQ8_0 (line 95) | pub struct BlockQ8_0 {
  constant _ (line 99) | const _: () = assert!(std::mem::size_of::<BlockQ8_0>() == 34);
  type BlockQ8_1 (line 103) | pub struct BlockQ8_1 {
  constant _ (line 108) | const _: () = assert!(std::mem::size_of::<BlockQ8_1>() == 36);
  type BlockQ2K (line 112) | pub struct BlockQ2K {
  constant _ (line 118) | const _: () = assert!(QK_K / 16 + QK_K / 4 + 2 * 2 == std::mem::size_of:...
  type BlockQ3K (line 122) | pub struct BlockQ3K {
  constant _ (line 128) | const _: () = assert!(QK_K / 8 + QK_K / 4 + 12 + 2 == std::mem::size_of:...
  type BlockQ4K (line 133) | pub struct BlockQ4K {
  constant _ (line 139) | const _: () = assert!(QK_K / 2 + K_SCALE_SIZE + 2 * 2 == std::mem::size_...
  type BlockQ5K (line 143) | pub struct BlockQ5K {
  constant _ (line 150) | const _: () =
  type BlockQ6K (line 155) | pub struct BlockQ6K {
  constant _ (line 161) | const _: () = assert!(3 * QK_K / 4 + QK_K / 16 + 2 == std::mem::size_of:...
  type BlockQ8K (line 165) | pub struct BlockQ8K {
  constant _ (line 170) | const _: () = assert!(4 + QK_K + QK_K / 16 * 2 == std::mem::size_of::<Bl...
  function matmul (line 2268) | pub fn matmul<T: GgmlType>(
  function matmul_f16 (line 2317) | pub fn matmul_f16<T: GgmlType>(

FILE: candle-core/src/quantized/metal.rs
  type QMetalStorage (line 7) | pub struct QMetalStorage {
    method zeros (line 14) | pub fn zeros(device: &MetalDevice, elem_count: usize, dtype: GgmlDType...
    method dtype (line 24) | pub fn dtype(&self) -> GgmlDType {
    method device (line 28) | pub fn device(&self) -> &MetalDevice {
    method buffer (line 32) | pub fn buffer(&self) -> &Buffer {
    method dequantize (line 36) | pub fn dequantize(&self, elem_count: usize) -> Result<MetalStorage> {
    method quantize (line 119) | pub fn quantize(&mut self, src: &MetalStorage) -> Result<()> {
    method quantize_imatrix (line 131) | pub fn quantize_imatrix(
    method quantize_imatrix_onto (line 148) | pub fn quantize_imatrix_onto(
    method quantize_onto (line 169) | pub fn quantize_onto(&mut self, src: &crate::CpuStorage) -> Result<()> {
    method storage_size_in_bytes (line 185) | pub fn storage_size_in_bytes(&self) -> usize {
    method fwd_mv (line 189) | fn fwd_mv(
    method fwd (line 246) | pub fn fwd(
    method data (line 338) | pub fn data(&self) -> Result<Vec<u8>> {
  function load_quantized (line 351) | pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
  function read_to_vec (line 364) | fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
  function from (line 372) | fn from(value: GgmlDType) -> Self {

FILE: candle-core/src/quantized/mod.rs
  function as_t_slice (line 38) | fn as_t_slice<T>(data: Cow<'_, [u8]>) -> &[T] {
  type QTensor (line 54) | pub struct QTensor {
    method fmt (line 457) | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
    method new (line 477) | pub fn new<S: Into<Shape>>(storage: QStorage, shape: S) -> Result<Self> {
    method quantize (line 483) | pub fn quantize(src: &Tensor, dtype: GgmlDType) -> Result<Self> {
    method quantize_imatrix (line 503) | pub fn quantize_imatrix(
    method quantize_imatrix_onto (line 539) | pub fn quantize_imatrix_onto(
    method quantize_onto (line 582) | pub fn quantize_onto(src: &Tensor, dtype: GgmlDType, dev: &Device) -> ...
    method dtype (line 609) | pub fn dtype(&self) -> GgmlDType {
    method device (line 613) | pub fn device(&self) -> Device {
    method rank (line 617) | pub fn rank(&self) -> usize {
    method shape (line 621) | pub fn shape(&self) -> &Shape {
    method dequantize (line 625) | pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
    method dequantize_f16 (line 631) | pub fn dequantize_f16(&self, device: &Device) -> Result<Tensor> {
    method storage_size_in_bytes (line 648) | pub fn storage_size_in_bytes(&self) -> usize {
    method data (line 652) | pub fn data(&self) -> Result<Cow<'_, [u8]>> {
    method indexed_moe_forward (line 656) | pub fn indexed_moe_forward(&self, x: &Tensor, ids: &Tensor) -> Result<...
    method device_ptr (line 684) | pub fn device_ptr(&self) -> Result<*const u8> {
    method name (line 775) | fn name(&self) -> &'static str {
    method cpu_fwd (line 779) | fn cpu_fwd(
    method metal_fwd (line 834) | fn metal_fwd(
    method cuda_fwd (line 846) | fn cuda_fwd(
  method qzeros (line 60) | fn qzeros(&self, elem_count: usize, dtype: GgmlDType) -> Result<QStorage> {
  type QStorage (line 78) | pub enum QStorage {
    method from_data (line 85) | pub fn from_data(data: Cow<'_, [u8]>, device: &Device, dtype: GgmlDTyp...
    method block_size (line 125) | fn block_size(&self) -> usize {
    method dtype (line 133) | fn dtype(&self) -> GgmlDType {
    method device (line 141) | fn device(&self) -> Device {
    method size_in_bytes (line 149) | fn size_in_bytes(&self) -> usize {
    method quantize (line 157) | fn quantize(&mut self, src: &Storage) -> Result<()> {
    method quantize_imatrix (line 169) | fn quantize_imatrix(
    method quantize_onto (line 190) | fn quantize_onto(&mut self, src: &Storage) -> Result<()> {
    method quantize_imatrix_onto (line 202) | fn quantize_imatrix_onto(
    method dequantize (line 223) | fn dequantize(&self, elem_count: usize) -> Result<Storage> {
    method data (line 231) | fn data(&self) -> Result<Cow<'_, [u8]>> {
    method device_ptr (line 244) | pub fn device_ptr(&self) -> Result<*const u8> {
  type GgmlDType (line 255) | pub enum GgmlDType {
    method from_u32 (line 274) | pub(crate) fn from_u32(u: u32) -> Result<Self> {
    method to_u32 (line 297) | pub(crate) fn to_u32(self) -> u32 {
    method cpu_zeros (line 319) | pub fn cpu_zeros(&self, elem_count: usize) -> Box<dyn QuantizedType> {
    method from_data (line 339) | pub fn from_data(&self, data: Cow<'_, [u8]>) -> Box<dyn QuantizedType> {
    method type_size (line 360) | pub fn type_size(&self) -> usize {
    method block_size (line 382) | pub fn block_size(&self) -> usize {
  type QuantizedType (line 398) | pub trait QuantizedType: Send + Sync {
    method dtype (line 399) | fn dtype(&self) -> GgmlDType;
    method matmul_t (line 400) | fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut ...
    method matmul_t_f16 (line 401) | fn matmul_t_f16(&self, mkn: (usize, usize, usize), lhs: &[f16], dst: &...
    method dequantize (line 402) | fn dequantize(&self, elem_count: usize) -> Result<CpuStorage>;
    method storage_size_in_bytes (line 403) | fn storage_size_in_bytes(&self) -> usize;
    method as_ptr (line 404) | fn as_ptr(&self) -> *const u8;
    method block_size (line 405) | fn block_size(&self) -> usize;
    method from_float (line 407) | fn from_float(&mut self, xs: &[f32]);
    method from_float_imatrix (line 409) | fn from_float_imatrix(&mut self, xs: &[f32], imatrix_weights: &[f32], ...
    method size (line 410) | fn size(&self) -> usize;
    method matmul_t (line 414) | fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut ...
    method matmul_t_f16 (line 417) | fn matmul_t_f16(&self, mkn: (usize, usize, usize), lhs: &[f16], dst: &...
    method size (line 421) | fn size(&self) -> usize {
    method from_float (line 425) | fn from_float(&mut self, xs: &[f32]) {
    method from_float_imatrix (line 429) | fn from_float_imatrix(&mut self, xs: &[f32], imatrix_weights: &[f32], ...
    method dtype (line 433) | fn dtype(&self) -> GgmlDType {
    method block_size (line 437) | fn block_size(&self) -> usize {
    method dequantize (line 441) | fn dequantize(&self, elem_count: usize) -> Result<CpuStorage> {
    method storage_size_in_bytes (line 447) | fn storage_size_in_bytes(&self) -> usize {
    method as_ptr (line 451) | fn as_ptr(&self) -> *const u8 {
  function check_shape (line 462) | fn check_shape(shape: &Shape, block_size: usize) -> Result<()> {
  type QMatMul (line 695) | pub enum QMatMul {
    method from_arc (line 724) | pub fn from_arc(qtensor: std::sync::Arc<QTensor>) -> Result<Self> {
    method from_qtensor (line 741) | pub fn from_qtensor(qtensor: QTensor) -> Result<Self> {
    method dequantize_f16 (line 745) | pub fn dequantize_f16(&self) -> Result<Tensor> {
    method forward_via_f16 (line 753) | pub fn forward_via_f16(&self, xs: &Tensor) -> Result<Tensor> {
    method indexed_moe_forward (line 764) | pub fn indexed_moe_forward(&self, x: &Tensor, ids: &Tensor) -> Result<...
    method forward (line 860) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {

FILE: candle-core/src/quantized/neon.rs
  function vdotq_s32 (line 15) | unsafe fn vdotq_s32(a: int8x16_t, b: int8x16_t) -> int32x4_t {
  function vec_dot_q4_0_q8_0 (line 23) | pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ...
  function vec_dot_q8_0_q8_0 (line 65) | pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ...
  function vec_dot_q8k_q8k (line 98) | pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]...
  function vec_dot_q6k_q8k (line 123) | pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]...
  function vec_dot_q5k_q8k (line 232) | pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]...
  function vec_dot_q4k_q8k (line 317) | pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]...
  function vec_dot_q3k_q8k (line 398) | pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]...
  function vec_dot_q2k_q8k (line 522) | pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]...
  function multiply_accum_with_scale (line 605) | unsafe fn multiply_accum_with_scale(

FILE: candle-core/src/quantized/simd128.rs
  function vec_dot_q4_0_q8_0 (line 8) | pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ...
  function vec_dot_q8_0_q8_0 (line 54) | pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ...
  function vec_dot_q2k_q8k (line 94) | pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]...
  function vec_dot_q4k_q8k (line 179) | pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]...
  function vec_dot_q6k_q8k (line 269) | pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]...
  function vec_dot_q8k_q8k (line 392) | pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]...

FILE: candle-core/src/quantized/tokenizer.rs
  type TokenizerFromGguf (line 20) | pub trait TokenizerFromGguf: Sized {
    method from_gguf (line 21) | fn from_gguf(ct: &gguf_file::Content) -> Result<Self>;
    method from_gguf (line 202) | fn from_gguf(ct: &gguf_file::Content) -> Result<Self> {
  function metadata_value (line 24) | fn metadata_value<'a>(ct: &'a gguf_file::Content, key: &str) -> Result<&...
  function gguf_value_to_u32 (line 30) | fn gguf_value_to_u32(v: &gguf_file::Value) -> Result<u32> {
  function value_to_string_array (line 45) | fn value_to_string_array(v: &gguf_file::Value, name: &str) -> Result<Vec...
  function merges_from_value (line 58) | fn merges_from_value(v: &gguf_file::Value) -> Result<Vec<(String, String...
  type Pipeline (line 69) | struct Pipeline {
    method apply (line 77) | fn apply(self, tokenizer: &mut Tokenizer) {
  function pre_tokenizer_sequence (line 93) | fn pre_tokenizer_sequence(regex: &str, byte_level: ByteLevelPre) -> Resu...
  function pipeline_from_pre (line 103) | fn pipeline_from_pre(pre: &str) -> Result<Pipeline> {
  function template_processor (line 138) | fn template_processor(

FILE: candle-core/src/quantized/utils.rs
  function nearest_int (line 1) | pub(super) fn nearest_int(v: f32) -> i32 {
  function group_for_quantization (line 8) | pub(super) fn group_for_quantization<'a, 'b, T: super::k_quants::GgmlType>(
  function group_for_dequantization (line 30) | pub(super) fn group_for_dequantization<'a, 'b, T: super::k_quants::GgmlT...
  function get_scale_min_k4 (line 50) | pub(super) fn get_scale_min_k4(j: usize, q: &[u8]) -> (u8, u8) {
  function make_qx_quants (line 62) | pub(super) unsafe fn make_qx_quants(
  function make_qkx1_quants (line 227) | pub(super) fn make_qkx1_quants(nmax: i32, ntry: usize, x: &[f32]) -> (f3...
  function make_q3_quants (line 287) | pub(super) fn make_q3_quants(x: &[f32], nmax: i32, do_rmse: bool) -> f32 {
  function make_qkx3_quants (line 356) | pub(super) fn make_qkx3_quants(
  function make_qp_quants (line 470) | pub(super) fn make_qp_quants(

FILE: candle-core/src/safetensors.rs
  function from (line 23) | fn from(value: DType) -> Self {
  type Error (line 44) | type Error = Error;
  method try_from (line 45) | fn try_from(value: st::Dtype) -> Result<Self> {
  method dtype (line 67) | fn dtype(&self) -> st::Dtype {
  method shape (line 70) | fn shape(&self) -> &[usize] {
  method data (line 74) | fn data(&self) -> Cow<'_, [u8]> {
  method data_len (line 80) | fn data_len(&self) -> usize {
  function dtype (line 88) | fn dtype(&self) -> st::Dtype {
  function shape (line 91) | fn shape(&self) -> &[usize] {
  function data (line 95) | fn data(&self) -> Cow<'_, [u8]> {
  function data_len (line 101) | fn data_len(&self) -> usize {
  method save_safetensors (line 109) | pub fn save_safetensors<P: AsRef<Path>>(&self, name: &str, filename: P) ...
  function convert_slice (line 115) | fn convert_slice<T: WithDType>(data: &[u8], shape: &[usize], device: &De...
  function convert_slice_with_cast (line 140) | fn convert_slice_with_cast<T: Sized + Copy, U: WithDType, F: Fn(T) -> Re...
  function convert_with_cast_ (line 172) | fn convert_with_cast_<T: Sized + Copy, U: WithDType, F: Fn(T) -> Result<...
  function convert_ (line 180) | fn convert_<T: WithDType>(view: &st::TensorView<'_>, device: &Device) ->...
  function convert_back_ (line 184) | fn convert_back_<T: WithDType>(mut vs: Vec<T>) -> Vec<u8> {
  type Load (line 198) | pub trait Load {
    method load (line 199) | fn load(&self, device: &Device) -> Result<Tensor>;
    method load (line 203) | fn load(&self, device: &Device) -> Result<Tensor> {
  method from_raw_buffer (line 209) | pub fn from_raw_buffer(
  function convert (line 286) | fn convert(view: &st::TensorView<'_>, device: &Device) -> Result<Tensor> {
  function convert_dummy (line 312) | fn convert_dummy(view: &st::TensorView<'_>, device: &Device) -> Result<T...
  function convert_back (line 380) | fn convert_back(tensor: &Tensor) -> Result<Vec<u8>> {
  function load (line 400) | pub fn load<P: AsRef<Path>>(filename: P, device: &Device) -> Result<Hash...
  function load_buffer (line 405) | pub fn load_buffer(data: &[u8], device: &Device) -> Result<HashMap<Strin...
  function save (line 413) | pub fn save<K: AsRef<str> + Ord + std::fmt::Display, P: AsRef<Path>>(
  type SafeTensors_ (line 421) | struct SafeTensors_<'a>(SafeTensors<'a>);
  type MmapedSafetensors (line 423) | pub struct MmapedSafetensors {
    method new (line 434) | pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
    method multi (line 461) | pub unsafe fn multi<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
    method load (line 489) | pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
    method tensors (line 493) | pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
    method get (line 501) | pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
  type SliceSafetensors (line 518) | pub struct SliceSafetensors<'a> {
  function new (line 524) | pub fn new(buffer: &'a [u8]) -> Result<Self> {
  function load (line 529) | pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
  function tensors (line 533) | pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
  function get (line 537) | pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
  type BufferedSafetensors (line 542) | pub struct BufferedSafetensors {
    method new (line 548) | pub fn new(buffer: Vec<u8>) -> Result<Self> {
    method load (line 559) | pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
    method tensors (line 563) | pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
    method get (line 567) | pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
  type MmapedFile (line 572) | pub struct MmapedFile {
    method new (line 584) | pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
    method deserialize (line 596) | pub fn deserialize(&self) -> Result<SafeTensors<'_>> {
  function save_single_tensor (line 609) | fn save_single_tensor() {
  function save_load_multiple_tensors (line 618) | fn save_load_multiple_tensors() {
  function load_u8 (line 633) | fn load_u8() {

FILE: candle-core/src/scalar.rs
  type Scalar (line 8) | pub enum Scalar {
    method from (line 22) | fn from(value: T) -> Self {
    method zero (line 28) | pub fn zero(dtype: DType) -> Self {
    method one (line 46) | pub fn one(dtype: DType) -> Self {
    method dtype (line 64) | pub fn dtype(&self) -> DType {
    method to_f64 (line 79) | pub fn to_f64(&self) -> f64 {
  type TensorScalar (line 95) | pub enum TensorScalar {
  type TensorOrScalar (line 100) | pub trait TensorOrScalar {
    method to_tensor_scalar (line 101) | fn to_tensor_scalar(self) -> Result<TensorScalar>;
    method to_tensor_scalar (line 105) | fn to_tensor_scalar(self) -> Result<TensorScalar> {
    method to_tensor_scalar (line 111) | fn to_tensor_scalar(self) -> Result<TensorScalar> {

FILE: candle-core/src/shape.rs
  type Shape (line 6) | pub struct Shape(Vec<usize>);
    method fmt (line 11) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    method from (line 17) | fn from(dims: &[usize; C]) -> Self {
    method from (line 23) | fn from(dims: &[usize]) -> Self {
    method from (line 29) | fn from(shape: &Shape) -> Self {
    method from (line 35) | fn from(_: ()) -> Self {
    method from (line 41) | fn from(d1: usize) -> Self {
    method from (line 64) | fn from(dims: Vec<usize>) -> Self {
    method from_dims (line 106) | pub fn from_dims(dims: &[usize]) -> Self {
    method rank (line 111) | pub fn rank(&self) -> usize {
    method into_dims (line 115) | pub fn into_dims(self) -> Vec<usize> {
    method dims (line 120) | pub fn dims(&self) -> &[usize] {
    method dim (line 125) | pub fn dim<D: Dim>(&self, dim: D) -> Result<usize> {
    method elem_count (line 131) | pub fn elem_count(&self) -> usize {
    method stride_contiguous (line 137) | pub(crate) fn stride_contiguous(&self) -> Vec<usize> {
    method is_contiguous (line 153) | pub fn is_contiguous(&self, stride: &[usize]) -> bool {
    method is_fortran_contiguous (line 168) | pub fn is_fortran_contiguous(&self, stride: &[usize]) -> bool {
    method extend (line 184) | pub fn extend(mut self, additional_dims: &[usize]) -> Self {
    method broadcast_shape_binary_op (line 191) | pub fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) ...
    method broadcast_shape_matmul (line 229) | pub(crate) fn broadcast_shape_matmul(&self, rhs: &Self) -> Result<(Sha...
  constant SCALAR (line 8) | pub const SCALAR: Shape = Shape(vec![]);
  type Dim (line 253) | pub trait Dim {
    method to_index (line 254) | fn to_index(&self, shape: &Shape, op: &'static str) -> Result<usize>;
    method to_index_plus_one (line 255) | fn to_index_plus_one(&self, shape: &Shape, op: &'static str) -> Result...
    method to_index (line 259) | fn to_index(&self, shape: &Shape, op: &'static str) -> Result<usize> {
    method to_index_plus_one (line 273) | fn to_index_plus_one(&self, shape: &Shape, op: &'static str) -> Result...
    method to_index (line 312) | fn to_index(&self, shape: &Shape, op: &'static str) -> Result<usize> {
    method to_index_plus_one (line 322) | fn to_index_plus_one(&self, shape: &Shape, op: &'static str) -> Result...
  type D (line 289) | pub enum D {
    method out_of_range (line 296) | fn out_of_range(&self, shape: &Shape, op: &'static str) -> Error {
  type Dims (line 333) | pub trait Dims: Sized {
    method to_indexes_internal (line 334) | fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Resul...
    method to_indexes (line 336) | fn to_indexes(self, shape: &Shape, op: &'static str) -> Result<Vec<usi...
    method to_indexes_internal (line 361) | fn to_indexes_internal(self, _: &Shape, _: &'static str) -> Result<Vec...
    method to_indexes_internal (line 367) | fn to_indexes_internal(self, _: &Shape, _: &'static str) -> Result<Vec...
    method to_indexes_internal (line 373) | fn to_indexes_internal(self, _: &Shape, _: &'static str) -> Result<Vec...
    method to_indexes_internal (line 379) | fn to_indexes_internal(self, _: &Shape, _: &'static str) -> Result<Vec...
    method to_indexes_internal (line 385) | fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Resul...
    method to_indexes_internal (line 392) | fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Resul...
    method to_indexes_internal (line 399) | fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Resul...
    method to_indexes_internal (line 407) | fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Resul...
    method to_indexes_internal (line 416) | fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Resul...
    method to_indexes_internal (line 426) | fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Resul...
    method to_indexes_internal (line 437) | fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Resul...
  type ShapeWithOneHole (line 470) | pub trait ShapeWithOneHole {
    method into_shape (line 471) | fn into_shape(self, el_count: usize) -> Result<Shape>;
    method into_shape (line 475) | fn into_shape(self, _el_count: usize) -> Result<Shape> {
    method into_shape (line 481) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 497) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 504) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 511) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 518) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 525) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 532) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 540) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 548) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 556) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 564) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 572) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 580) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 588) | fn into_shape(self, el_count: usize) -> Result<Shape> {
    method into_shape (line 596) | fn into_shape(self, el_count: usize) -> Result<Shape> {
  function hole_size (line 486) | fn hole_size(el_count: usize, prod_d: usize, s: &dyn std::fmt::Debug) ->...
  function stride (line 608) | fn stride() {
  function test_from_tuple (line 620) | fn test_from_tuple() {

FILE: candle-core/src/sort.rs
  type ArgSort (line 5) | struct ArgSort {
    method asort (line 11) | fn asort<T: crate::WithDType>(&self, vs: &[T], layout: &crate::Layout)...
    method f (line 65) | fn f<T: DeviceRepr + WithDType + ValidAsZeroBits, W: Fn(CudaSlice<T>) ...
    method name (line 107) | fn name(&self) -> &'static str {
    method cpu_fwd (line 111) | fn cpu_fwd(
    method cuda_fwd (line 152) | fn cuda_fwd(
    method metal_fwd (line 169) | fn metal_fwd(
  function next_power_of_2 (line 246) | fn next_power_of_2(x: usize) -> usize {
  method arg_sort_last_dim (line 260) | pub fn arg_sort_last_dim(&self, asc: bool) -> Result<Tensor> {
  method sort_last_dim (line 280) | pub fn sort_last_dim(&self, asc: bool) -> Result<(Tensor, Tensor)> {

FILE: candle-core/src/storage.rs
  type Storage (line 10) | pub enum Storage {
    method try_clone (line 17) | pub fn try_clone(&self, layout: &Layout) -> Result<Self> {
    method device (line 31) | pub fn device(&self) -> Device {
    method dtype (line 39) | pub fn dtype(&self) -> DType {
    method same_device (line 47) | pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Resu...
    method same_dtype (line 67) | pub(crate) fn same_dtype(&self, rhs: &Self, op: &'static str) -> Resul...
    method const_set (line 77) | pub(crate) fn const_set(&mut self, v: Scalar, l: &Layout) -> Result<()> {
    method affine (line 85) | pub(crate) fn affine(&self, layout: &Layout, mul: f64, add: f64) -> Re...
    method powf (line 102) | pub(crate) fn powf(&self, layout: &Layout, alpha: f64) -> Result<Self> {
    method elu (line 119) | pub(crate) fn elu(&self, layout: &Layout, alpha: f64) -> Result<Self> {
    method cmp (line 136) | pub(crate) fn cmp(
    method reduce_op (line 171) | pub(crate) fn reduce_op(&self, op: ReduceOp, layout: &Layout, s: &[usi...
    method to_dtype (line 188) | pub(crate) fn to_dtype(&self, layout: &Layout, dtype: DType) -> Result...
    method apply_op1 (line 205) | pub(crate) fn apply_op1(&self, l: &Layout, c: &dyn CustomOp1) -> Resul...
    method apply_op2 (line 222) | pub(crate) fn apply_op2(
    method apply_op3 (line 247) | pub(crate) fn apply_op3(
    method inplace_op1 (line 275) | pub(crate) fn inplace_op1(&mut self, l: &Layout, c: &dyn InplaceOp1) -...
    method inplace_op2 (line 283) | pub(crate) fn inplace_op2(
    method inplace_op3 (line 299) | pub(crate) fn inplace_op3(
    method unary_impl (line 320) | pub(crate) fn unary_impl<B: op::UnaryOpT>(&self, layout: &Layout) -> R...
    method binary_impl (line 337) | pub(crate) fn binary_impl<B: op::BinaryOpT>(
    method conv1d (line 371) | pub(crate) fn conv1d(
    method conv_transpose1d (line 402) | pub(crate) fn conv_transpose1d(
    method conv2d (line 433) | pub(crate) fn conv2d(
    method conv_transpose2d (line 464) | pub(crate) fn conv_transpose2d(
    method avg_pool2d (line 495) | pub(crate) fn avg_pool2d(
    method max_pool2d (line 517) | pub(crate) fn max_pool2d(
    method upsample_nearest1d (line 539) | pub(crate) fn upsample_nearest1d(&self, layout: &Layout, sz: usize) ->...
    method upsample_nearest2d (line 556) | pub(crate) fn upsample_nearest2d(&self, layout: &Layout, h: usize, w: ...
    method upsample_bilinear2d (line 573) | pub(crate) fn upsample_bilinear2d(
    method where_cond (line 601) | pub(crate) fn where_cond(
    method gather (line 634) | pub(crate) fn gather(
    method scatter_set (line 659) | pub(crate) fn scatter_set(
    method scatter_add (line 685) | pub(crate) fn scatter_add(
    method index_add (line 711) | pub(crate) fn index_add(
    method index_select (line 739) | pub(crate) fn index_select(
    method matmul (line 769) | pub(crate) fn matmul(
    method copy_strided_src (line 801) | pub(crate) fn copy_strided_src(
    method copy2d (line 823) | pub(crate) fn copy2d(

FILE: candle-core/src/streaming.rs
  type Dim (line 5) | pub trait Dim: crate::shape::Dim + Copy {}
  type StreamTensor (line 11) | pub struct StreamTensor(Option<Tensor>);
    method fmt (line 14) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    method from (line 23) | fn from(value: Option<Tensor>) -> Self {
    method from (line 29) | fn from(value: Tensor) -> Self {
    method from (line 35) | fn from(_value: ()) -> Self {
    method empty (line 41) | pub fn empty() -> Self {
    method from_tensor (line 45) | pub fn from_tensor(tensor: Tensor) -> Self {
    method shape (line 49) | pub fn shape(&self) -> Option<&Shape> {
    method cat2 (line 53) | pub fn cat2<D: Dim>(&self, rhs: &Self, dim: D) -> Result<Self> {
    method seq_len (line 65) | pub fn seq_len<D: Dim>(&self, dim: D) -> Result<usize> {
    method reset (line 72) | pub fn reset(&mut self) {
    method narrow (line 76) | pub fn narrow<D: Dim>(&self, dim: D, offset: usize, len: usize) -> Res...
    method split (line 94) | pub fn split<D: Dim>(&self, dim: D, lhs_len: usize) -> Result<(Self, S...
    method as_option (line 116) | pub fn as_option(&self) -> Option<&Tensor> {
    method apply (line 120) | pub fn apply<M: crate::Module>(&self, m: &M) -> Result<Self> {
  type StreamingModule (line 131) | pub trait StreamingModule {
    method step (line 133) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor>;
    method reset_state (line 134) | fn reset_state(&mut self);
    method reset_state (line 203) | fn reset_state(&mut self) {}
    method step (line 205) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  type BinOp (line 138) | pub enum BinOp {
  type StreamingBinOp (line 146) | pub struct StreamingBinOp {
    method new (line 154) | pub fn new(op: BinOp, dim: crate::D) -> Self {
    method reset_state (line 163) | pub fn reset_state(&mut self) {
    method forward (line 168) | pub fn forward(&self, lhs: &Tensor, rhs: &Tensor) -> Result<Tensor> {
    method step (line 177) | pub fn step(&mut self, lhs: &StreamTensor, rhs: &StreamTensor) -> Resu...
  type Map (line 200) | pub struct Map<T: crate::Module>(T);

FILE: candle-core/src/strided_index.rs
  type StridedIndex (line 6) | pub struct StridedIndex<'a> {
  function new (line 15) | pub(crate) fn new(dims: &'a [usize], stride: &'a [usize], start_offset: ...
  function from_layout (line 32) | pub(crate) fn from_layout(l: &'a Layout) -> Self {
  type Item (line 38) | type Item = usize;
  method next (line 41) | fn next(&mut self) -> Option<Self::Item> {
  method size_hint (line 73) | fn size_hint(&self) -> (usize, Option<usize>) {
  method len (line 79) | fn len(&self) -> usize {
  type StridedBlocks (line 85) | pub enum StridedBlocks<'a> {

FILE: candle-core/src/tensor.rs
  type TensorId (line 12) | pub struct TensorId(usize);
    method new (line 15) | fn new() -> Self {
  type Tensor_ (line 23) | pub struct Tensor_ {
  type Tensor (line 68) | pub struct Tensor(Arc<Tensor_>);
    method as_ref (line 46) | fn as_ref(&self) -> &Tensor {
    type Target (line 71) | type Target = Tensor_;
    method deref (line 73) | fn deref(&self) -> &Self::Target {
    method ones_impl (line 180) | pub(crate) fn ones_impl<S: Into<Shape>>(
    method ones (line 203) | pub fn ones<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -...
    method const_set (line 207) | pub fn const_set(&self, value: crate::scalar::Scalar) -> Result<()> {
    method zero_set (line 211) | pub fn zero_set(&self) -> Result<()> {
    method one_set (line 215) | pub fn one_set(&self) -> Result<()> {
    method ones_like (line 228) | pub fn ones_like(&self) -> Result<Self> {
    method zeros_impl (line 234) | pub(crate) fn zeros_impl<S: Into<Shape>>(
    method zeros (line 255) | pub fn zeros<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) ...
    method zeros_like (line 269) | pub fn zeros_like(&self) -> Result<Self> {
    method empty_impl (line 275) | pub(crate) unsafe fn empty_impl<S: Into<Shape>>(
    method empty (line 298) | pub unsafe fn empty<S: Into<Shape>>(shape: S, dtype: DType, device: &D...
    method empty_like (line 314) | pub unsafe fn empty_like(&self) -> Result<Self> {
    method rand_impl (line 318) | pub(crate) fn rand_impl<S: Into<Shape>, T: crate::FloatDType>(
    method rand_f64_impl (line 331) | pub(crate) fn rand_f64_impl<S: Into<Shape>>(
    method rand (line 346) | pub fn rand<S: Into<Shape>, T: crate::FloatDType>(
    method rand_like (line 355) | pub fn rand_like(&self, lo: f64, up: f64) -> Result<Self> {
    method randn_impl (line 359) | pub(crate) fn randn_impl<S: Into<Shape>, T: crate::FloatDType>(
    method randn_f64_impl (line 372) | pub(crate) fn randn_f64_impl<S: Into<Shape>>(
    method randn_like (line 386) | pub fn randn_like(&self, mean: f64, stdev: f64) -> Result<Self> {
    method randn (line 399) | pub fn randn<S: Into<Shape>, T: crate::FloatDType>(
    method new_impl (line 408) | pub(crate) fn new_impl<A: crate::device::NdArray>(
    method new (line 425) | pub fn new<A: crate::device::NdArray>(array: A, device: &Device) -> Re...
    method full (line 440) | pub fn full<D: crate::WithDType, S: Into<Shape>>(
    method from_iter (line 461) | pub fn from_iter<D: crate::WithDType>(
    method arange (line 479) | pub fn arange<D: crate::WithDType>(start: D, end: D, device: &Device) ...
    method arange_step (line 492) | pub fn arange_step<D: crate::WithDType>(
    method from_vec_impl (line 518) | pub(crate) fn from_vec_impl<S: ShapeWithOneHole, D: crate::WithDType>(
    method from_vec (line 543) | pub fn from_vec<S: ShapeWithOneHole, D: crate::WithDType>(
    method from_slice (line 564) | pub fn from_slice<S: ShapeWithOneHole, D: crate::WithDType>(
    method same_shape_binary_op (line 575) | pub(crate) fn same_shape_binary_op(&self, rhs: &Self, op: &'static str...
    method track_op (line 592) | pub fn track_op(&self) -> bool {
    method from_storage (line 601) | pub fn from_storage<S: Into<Shape>>(
    method round_to (line 655) | pub fn round_to(&self, decimals: i32) -> Result<Self> {
    method to_scalar (line 662) | pub fn to_scalar<S: crate::WithDType>(&self) -> Result<S> {
    method to_vec0 (line 683) | pub fn to_vec0<S: crate::WithDType>(&self) -> Result<S> {
    method repeat (line 688) | pub fn repeat<S: Into<Shape>>(&self, shape: S) -> Result<Tensor> {
    method meshgrid (line 742) | pub fn meshgrid<A: AsRef<Tensor>>(args: &[A], xy_indexing: bool) -> Re...
    method affine (line 784) | pub fn affine(&self, mul: f64, add: f64) -> Result<Self> {
    method elu (line 794) | pub fn elu(&self, alpha: f64) -> Result<Self> {
    method powf (line 804) | pub fn powf(&self, e: f64) -> Result<Self> {
    method check_dim (line 813) | pub(crate) fn check_dim(&self, dim: usize, op: &'static str) -> Result...
    method chunk (line 828) | pub fn chunk<D: Dim>(&self, chunks: usize, dim: D) -> Result<Vec<Self>> {
    method narrow (line 878) | pub fn narrow<D: Dim>(&self, dim: D, start: usize, len: usize) -> Resu...
    method squeeze_dims (line 917) | fn squeeze_dims(self, dims: &[usize]) -> Result<Self> {
    method reduce_impl (line 939) | fn reduce_impl<D: Dim>(&self, dim: D, keepdim: bool, op: ReduceOp) -> ...
    method sum_impl (line 958) | fn sum_impl<D: Dims>(&self, sum_dims: D, keepdim: bool) -> Result<Self> {
    method roll (line 989) | pub fn roll<D>(&self, shift: i32, dim: D) -> Result<Self>
    method sum_keepdim (line 1022) | pub fn sum_keepdim<D: Dims>(&self, sum_dims: D) -> Result<Self> {
    method sum (line 1029) | pub fn sum<D: Dims>(&self, sum_dims: D) -> Result<Self> {
    method mean_keepdim (line 1050) | pub fn mean_keepdim<D: Dims>(&self, mean_dims: D) -> Result<Self> {
    method mean (line 1060) | pub fn mean<D: Dims>(&self, mean_dims: D) -> Result<Self> {
    method var_keepdim (line 1068) | pub fn var_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
    method var (line 1076) | pub fn var<D: Dim>(&self, dim: D) -> Result<Self> {
    method max_keepdim (line 1083) | pub fn max_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
    method max (line 1088) | pub fn max<D: Dim>(&self, dim: D) -> Result<Self> {
    method min_keepdim (line 1094) | pub fn min_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
    method min (line 1099) | pub fn min<D: Dim>(&self, dim: D) -> Result<Self> {
    method argmax_keepdim (line 1103) | pub fn argmax_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
    method argmax (line 1108) | pub fn argmax<D: Dim>(&self, dim: D) -> Result<Self> {
    method argmin_keepdim (line 1112) | pub fn argmin_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
    method argmin (line 1117) | pub fn argmin<D: Dim>(&self, dim: D) -> Result<Self> {
    method cmp (line 1125) | pub fn cmp<T: TensorOrScalar>(&self, rhs: T, op: CmpOp) -> Result<Self> {
    method eq (line 1142) | pub fn eq<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
    method ne (line 1147) | pub fn ne<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
    method lt (line 1153) | pub fn lt<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
    method gt (line 1159) | pub fn gt<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
    method ge (line 1165) | pub fn ge<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
    method le (line 1171) | pub fn le<T: TensorOrScalar>(&self, rhs: T) -> Result<Self> {
    method clamp (line 1176) | pub fn clamp<T1: TensorOrScalar, T2: TensorOrScalar>(&self, min: T1, m...
    method interpolate1d (line 1184) | pub fn interpolate1d(&self, target_size: usize) -> Result<Self> {
    method upsample_nearest1d (line 1194) | pub fn upsample_nearest1d(&self, target_size: usize) -> Result<Self> {
    method interpolate2d (line 1203) | pub fn interpolate2d(&self, target_h: usize, target_w: usize) -> Resul...
    method upsample_nearest2d (line 1217) | pub fn upsample_nearest2d(&self, target_h: usize, target_w: usize) -> ...
    method upsample_bilinear2d (line 1244) | pub fn upsample_bilinear2d(
    method upsample_bilinear2d_with_scale (line 1292) | pub fn upsample_bilinear2d_with_scale(
    method avg_pool2d (line 1340) | pub fn avg_pool2d<T: crate::ToUsize2>(&self, sz: T) -> Result<Self> {
    method avg_pool2d_with_stride (line 1347) | pub fn avg_pool2d_with_stride<T: crate::ToUsize2>(
    method max_pool2d (line 1378) | pub fn max_pool2d<T: crate::ToUsize2>(&self, sz: T) -> Result<Self> {
    method max_pool2d_with_stride (line 1385) | pub fn max_pool2d_with_stride<T: crate::ToUsize2>(
    method dot (line 1425) | pub fn dot(&self, rhs: &Self) -> Result<Self> {
    method norm (line 1449) | pub fn norm(&self) -> Result<Self> {
    method mv (line 1471) | pub fn mv(&self, rhs: &Self) -> Result<Self> {
    method matmul (line 1495) | pub fn matmul(&self, rhs: &Self) -> Result<Self> {
    method broadcast_matmul (line 1545) | pub fn broadcast_matmul(&self, rhs: &Self) -> Result<Self> {
    method where_cond (line 1565) | pub fn where_cond(&self, on_true: &Self, on_false: &Self) -> Result<Se...
    method embedding (line 1598) | pub fn embedding(&self, ids: &Self) -> Result<Self> {
    method scatter_checks (line 1610) | fn scatter_checks(&self, indexes: &Self, source: &Self, dim: usize) ->...
    method scatter (line 1644) | pub fn scatter<D: Dim>(&self, indexes: &Self, source: &Self, dim: D) -...
    method scatter_set (line 1666) | pub fn scatter_set<D: Dim>(&self, indexes: &Self, source: &Self, dim: ...
    method scatter_add (line 1683) | pub fn scatter_add<D: Dim>(&self, indexes: &Self, source: &Self, dim: ...
    method scatter_add_set (line 1705) | pub fn scatter_add_set<D: Dim>(&self, indexes: &Self, source: &Self, d...
    method slice_scatter (line 1723) | pub fn slice_scatter<D: Dim>(&self, src: &Self, dim: D, start: usize) ...
    method slice_scatter0 (line 1736) | pub fn slice_scatter0(&self, src: &Self, start: usize) -> Result<Self> {
    method index_add (line 1792) | pub fn index_add<D: Dim>(&self, indexes: &Self, source: &Self, dim: D)...
    method gather (line 1853) | pub fn gather<D: Dim>(&self, indexes: &Self, dim: D) -> Result<Self> {
    method index_select (line 1892) | pub fn index_select<D: Dim>(&self, indexes: &Self, dim: D) -> Result<S...
    method strided_index (line 1917) | pub fn strided_index(&self) -> crate::StridedIndex<'_> {
    method strided_blocks (line 1925) | pub fn strided_blocks(&self) -> crate::StridedBlocks<'_> {
    method to_vec1 (line 1930) | pub fn to_vec1<S: crate::WithDType>(&self) -> Result<Vec<S>> {
    method to_vec2 (line 1955) | pub fn to_vec2<S: crate::WithDType>(&self) -> Result<Vec<Vec<S>>> {
    method to_vec3 (line 1986) | pub fn to_vec3<S: crate::WithDType>(&self) -> Result<Vec<Vec<Vec<S>>>> {
    method dtype (line 2027) | pub fn dtype(&self) -> DType {
    method device (line 2032) | pub fn device(&self) -> &Device {
    method shape (line 2037) | pub fn shape(&self) -> &Shape {
    method dims (line 2042) | pub fn dims(&self) -> &[usize] {
    method dim (line 2047) | pub fn dim<D: Dim>(&self, dim: D) -> Result<usize> {
    method layout (line 2054) | pub fn layout(&self) -> &Layout {
    method stride (line 2058) | pub fn stride(&self) -> &[usize] {
    method rank (line 2063) | pub fn rank(&self) -> usize {
    method elem_count (line 2068) | pub fn elem_count(&self) -> usize {
    method id (line 2073) | pub fn id(&self) -> TensorId {
    method is_variable (line 2079) | pub fn is_variable(&self) -> bool {
    method op (line 2083) | pub(crate) fn op(&self) -> &Option<Op> {
    method max_all (line 2097) | pub fn max_all(&self) -> Result<Tensor> {
    method min_all (line 2115) | pub fn min_all(&self) -> Result<Tensor> {
    method sum_all (line 2133) | pub fn sum_all(&self) -> Result<Tensor> {
    method mean_all (line 2138) | pub fn mean_all(&self) -> Result<Tensor> {
    method flatten_ (line 2142) | fn flatten_<D1: Dim, D2: Dim>(
    method flatten (line 2174) | pub fn flatten<D1: Dim, D2: Dim>(&self, start_dim: D1, end_dim: D2) ->...
    method flatten_to (line 2179) | pub fn flatten_to<D: Dim>(&self, end_dim: D) -> Result<Tensor> {
    method flatten_from (line 2185) | pub fn flatten_from<D: Dim>(&self, start_dim: D) -> Result<Tensor> {
    method flatten_all (line 2198) | pub fn flatten_all(&self) -> Result<Tensor> {
    method get (line 2213) | pub fn get(&self, i: usize) -> Result<Tensor> {
    method get_on_dim (line 2235) | pub fn get_on_dim<D: Dim>(&self, dim: D, index: usize) -> Result<Tenso...
    method t (line 2250) | pub fn t(&self) -> Result<Tensor> {
    method transpose (line 2265) | pub fn transpose<D1: Dim, D2: Dim>(&self, dim1: D1, dim2: D2) -> Resul...
    method permute (line 2295) | pub fn permute<D: Dims>(&self, dims: D) -> Result<Tensor> {
    method is_contiguous (line 2321) | pub fn is_contiguous(&self) -> bool {
    method is_fortran_contiguous (line 2326) | pub fn is_fortran_contiguous(&self) -> bool {
    method copy (line 2332) | pub fn copy(&self) -> Result<Tensor> {
    method detach (line 2350) | pub fn detach(&self) -> Tensor {
    method to_device (line 2368) | pub fn to_device(&self, device: &Device) -> Result<Tensor> {
    method broadcast_left (line 2411) | pub fn broadcast_left<S: Into<Shape>>(&self, left_shape: S) -> Result<...
    method broadcast_as (line 2425) | pub fn broadcast_as<S: Into<Shape>>(&self, shape: S) -> Result<Self> {
    method expand (line 2439) | pub fn expand<S: Into<Shape>>(&self, shape: S) -> Result<Self> {
    method to_dtype (line 2453) | pub fn to_dtype(&self, dtype: DType) -> Result<Self> {
    method contiguous (line 2466) | pub fn contiguous(&self) -> Result<Tensor> {
    method force_contiguous (line 2480) | pub fn force_contiguous(&self) -> Result<Tensor> {
    method make_var (line 2491) | pub(crate) fn make_var(&self) -> Result<Tensor> {
    method reshape (line 2523) | pub fn reshape<S: ShapeWithOneHole>(&self, s: S) -> Result<Tensor> {
    method squeeze (line 2566) | pub fn squeeze<D: Dim>(&self, dim: D) -> Result<Self> {
    method unsqueeze (line 2604) | pub fn unsqueeze<D: Dim>(&self, dim: D) -> Result<Self> {
    method stack (line 2642) | pub fn stack<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<S...
    method pad_with_zeros (line 2656) | pub fn pad_with_zeros<D: Dim>(&self, dim: D, left: usize, right: usize...
    method pad_with_same (line 2684) | pub fn pad_with_same<D: Dim>(&self, dim: D, left: usize, right: usize)...
    method apply (line 2723) | pub fn apply<M: crate::Module>(&self, m: &M) -> Result<Self> {
    method apply_t (line 2728) | pub fn apply_t<M: crate::ModuleT>(&self, m: &M, train: bool) -> Result...
    method storage (line 2732) | pub(crate) fn storage(&self) -> std::sync::RwLockReadGuard<'_, Storage> {
    method storage_mut (line 2736) | pub(crate) fn storage_mut(&self) -> std::sync::RwLockWriteGuard<'_, St...
    method storage_mut_and_layout (line 2742) | pub(crate) fn storage_mut_and_layout(
    method storage_and_layout (line 2750) | pub fn storage_and_layout(&self) -> (std::sync::RwLockReadGuard<'_, St...
    method same_storage (line 2755) | pub(crate) fn same_storage(&self, rhs: &Self) -> bool {
    method normalize_axis (line 2763) | pub fn normalize_axis(&self, axis: i64) -> Result<usize> {
    method tril2 (line 2779) | pub fn tril2(n: usize, dtype: DType, device: &Device) -> Result<Self> {
    method triu2 (line 2787) | pub fn triu2(n: usize, dtype: DType, device: &Device) -> Result<Self> {
    method eye (line 2795) | pub fn eye(n: usize, dtype: DType, device: &Device) -> Result<Self> {
    method cumsum (line 2806) | pub fn cumsum<D: Dim>(&self, dim: D) -> Result<Self> {
    method slice_assign (line 2826) | pub fn slice_assign<D: std::ops::RangeBounds<usize>>(
    method log_sum_exp (line 2881) | pub fn log_sum_exp<D: Dims>(&self, sum_dims: D) -> Result<Self> {
    method pow (line 2898) | pub fn pow(&self, rhs: &Tensor) -> Result<Self> {
    method broadcast_pow (line 2903) | pub fn broadcast_pow(&self, rhs: &Tensor) -> Result<Self> {
    method flip (line 2918) | pub fn flip(&self, dims: &[usize]) -> Result<Tensor> {
    method unfold (line 2931) | pub fn unfold<D: Dim>(&self, dim: D, size: usize, step: usize) -> Resu...
    method from (line 3113) | fn from((storage, shape): (Storage, S)) -> Self {
  function from_storage (line 159) | pub(crate) fn from_storage<S: Into<Shape>>(
  type Output (line 3047) | type Output = Result<Tensor>;
  function add (line 3049) | fn add(self, rhs: Tensor) -> Self::Output {
  type Output (line 3055) | type Output = Result<Tensor>;
  function add (line 3057) | fn add(self, rhs: &Tensor) -> Self::Output {
  type Output (line 3063) | type Output = Result<Tensor>;
  function mul (line 3065) | fn mul(self, rhs: Tensor) -> Self::Output {
  type Output (line 3071) | type Output = Result<Tensor>;
  function mul (line 3073) | fn mul(self, rhs: &Tensor) -> Self::Output {
  type Output (line 3079) | type Output = Result<Tensor>;
  function sub (line 3081) | fn sub(self, rhs: Tensor) -> Self::Output {
  type Output (line 3087) | type Output = Result<Tensor>;
  function sub (line 3089) | fn sub(self, rhs: &Tensor) -> Self::Output {
  type Output (line 3095) | type Output = Result<Tensor>;
  function div (line 3098) | fn div(self, rhs: Tensor) -> Self::Output {
  type Output (line 3104) | type Output = Result<Tensor>;
  function div (line 3107) | fn div(self, rhs: &Tensor) -> Self::Output {

FILE: candle-core/src/tensor_cat.rs
  method cat (line 21) | pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
  method cat0 (line 76) | fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
  method cat_contiguous (line 151) | fn cat_contiguous<A: AsRef<Tensor>>(args: &[A], dim: usize) -> Result<Se...
  method slice_set (line 246) | pub fn slice_set<D: Dim>(&self, src: &Self, dim: D, offset: usize) -> Re...

FILE: candle-core/src/test_utils.rs
  function assert_tensor_eq (line 27) | pub fn assert_tensor_eq(t1: &Tensor, t2: &Tensor) -> Result<()> {
  function to_vec0_round (line 36) | pub fn to_vec0_round(t: &Tensor, digits: i32) -> Result<f32> {
  function to_vec1_round (line 42) | pub fn to_vec1_round(t: &Tensor, digits: i32) -> Result<Vec<f32>> {
  function to_vec2_round (line 49) | pub fn to_vec2_round(t: &Tensor, digits: i32) -> Result<Vec<Vec<f32>>> {
  function to_vec3_round (line 59) | pub fn to_vec3_round(t: &Tensor, digits: i32) -> Result<Vec<Vec<Vec<f32>...

FILE: candle-core/src/utils.rs
  function get_num_threads (line 4) | pub fn get_num_threads() -> usize {
  function has_accelerate (line 15) | pub fn has_accelerate() -> bool {
  function has_mkl (line 19) | pub fn has_mkl() -> bool {
  function cuda_is_available (line 23) | pub fn cuda_is_available() -> bool {
  function metal_is_available (line 27) | pub fn metal_is_available() -> bool {
  function with_avx (line 31) | pub fn with_avx() -> bool {
  function with_neon (line 35) | pub fn with_neon() -> bool {
  function with_simd128 (line 39) | pub fn with_simd128() -> bool {
  function with_f16c (line 43) | pub fn with_f16c() -> bool {

FILE: candle-core/src/variable.rs
  type Var (line 10) | pub struct Var(Tensor);
    method fmt (line 13) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    type Target (line 19) | type Target = Tensor;
    method deref (line 21) | fn deref(&self) -> &Self::Target {
    method zeros (line 27) | pub fn zeros<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) ...
    method ones (line 32) | pub fn ones<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -...
    method from_tensor (line 38) | pub fn from_tensor(t: &Tensor) -> Result<Self> {
    method rand_f64 (line 47) | pub fn rand_f64<S: Into<Shape>>(
    method randn_f64 (line 58) | pub fn randn_f64<S: Into<Shape>>(
    method rand (line 69) | pub fn rand<S: Into<Shape>, T: crate::FloatDType>(
    method randn (line 79) | pub fn randn<S: Into<Shape>, T: crate::FloatDType>(
    method new (line 91) | pub fn new<A: crate::device::NdArray>(array: A, device: &Device) -> Re...
    method from_vec (line 97) | pub fn from_vec<S: Into<Shape>, D: crate::WithDType>(
    method from_slice (line 106) | pub fn from_slice<S: Into<Shape>, D: crate::WithDType>(
    method as_detached_tensor (line 115) | pub fn as_detached_tensor(&self) -> Tensor {
    method as_tensor (line 119) | pub fn as_tensor(&self) -> &Tensor {
    method into_inner (line 124) | pub fn into_inner(self) -> Tensor {
    method set (line 130) | pub fn set(&self, src: &Tensor) -> Result<()> {

FILE: candle-core/tests/bilinear_tests.rs
  function bilinear_pytorch_2x_upscale (line 14) | fn bilinear_pytorch_2x_upscale(dev: &Device) -> Result<()> {
  function bilinear_pytorch_downscale (line 50) | fn bilinear_pytorch_downscale(dev: &Device) -> Result<()> {
  function bilinear_pytorch_multi_channel (line 82) | fn bilinear_pytorch_multi_channel(dev: &Device) -> Result<()> {
  function bilinear_pytorch_align_corners_true (line 159) | fn bilinear_pytorch_align_corners_true(dev: &Device) -> Result<()> {
  function bilinear_pytorch_scale_factor (line 210) | fn bilinear_pytorch_scale_factor(dev: &Device) -> Result<()> {
  function bilinear_pytorch_non_square_exact (line 237) | fn bilinear_pytorch_non_square_exact(dev: &Device) -> Result<()> {
  function bilinear_pytorch_tiny_1x1_to_3x3 (line 275) | fn bilinear_pytorch_tiny_1x1_to_3x3(dev: &Device) -> Result<()> {
  function bilinear_pytorch_tiny_1x2_to_3x6 (line 300) | fn bilinear_pytorch_tiny_1x2_to_3x6(dev: &Device) -> Result<()> {
  function bilinear_pytorch_large_64x64_to_128x128 (line 334) | fn bilinear_pytorch_large_64x64_to_128x128(dev: &Device) -> Result<()> {
  function bilinear_output_dimensions (line 373) | fn bilinear_output_dimensions(dev: &Device) -> Result<()> {
  function bilinear_identity (line 413) | fn bilinear_identity(dev: &Device) -> Result<()> {
  function bilinear_align_corners_difference (line 423) | fn bilinear_align_corners_difference(dev: &Device) -> Result<()> {

FILE: candle-core/tests/conv_tests.rs
  function conv1d (line 25) | fn conv1d(dev: &Device) -> Result<()> {
  function conv1d_small (line 98) | fn conv1d_small(dev: &Device) -> Result<()> {
  function conv2d (line 140) | fn conv2d(dev: &Device) -> Result<()> {
  function conv2d_small (line 287) | fn conv2d_small(dev: &Device) -> Result<()> {
  function conv2d_smaller (line 336) | fn conv2d_smaller(dev: &Device) -> Result<()> {
  function conv2d_non_square (line 366) | fn conv2d_non_square(dev: &Device) -> Result<()> {
  function conv2d_grad (line 416) | fn conv2d_grad(dev: &Device) -> Result<()> {

FILE: candle-core/tests/custom_op_tests.rs
  function fwd (line 6) | fn fwd<T: num_traits::Float>(v: T, alpha: f64) -> T {
  type Elu (line 15) | struct Elu {
    method name (line 117) | fn name(&self) -> &'static str {
    method cpu_fwd (line 121) | fn cpu_fwd(&self, s: &mut CpuStorage, _l: &Layout) -> Result<()> {
  method name (line 20) | fn name(&self) -> &'static str {
  method cpu_fwd (line 24) | fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Sha...
  function custom_op1_no_backward (line 36) | fn custom_op1_no_backward() -> Result<()> {
  function bwd (line 49) | fn bwd<T: num_traits::Float>(v: T, alpha: f64) -> T {
  type EluBackward (line 58) | struct EluBackward {
  method name (line 63) | fn name(&self) -> &'static str {
  method cpu_fwd (line 67) | fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Sha...
  type EluWithBackward (line 78) | struct EluWithBackward(Elu);
    method new (line 81) | fn new(alpha: f64) -> Self {
  method name (line 87) | fn name(&self) -> &'static str {
  method cpu_fwd (line 91) | fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Sha...
  method bwd (line 95) | fn bwd(&self, arg: &Tensor, _res: &Tensor, grad_res: &Tensor) -> Result<...
  function custom_op1_with_backward (line 103) | fn custom_op1_with_backward() -> Result<()> {
  function inplace_op1 (line 136) | fn inplace_op1() -> Result<()> {
  function ug_op (line 151) | fn ug_op() -> Result<()> {

FILE: candle-core/tests/display_tests.rs
  function display_scalar (line 5) | fn display_scalar() -> Result<()> {
  function display_vector (line 22) | fn display_vector() -> Result<()> {
  function display_multi_dim (line 51) | fn display_multi_dim() -> Result<()> {

FILE: candle-core/tests/grad_tests.rs
  function simple_grad (line 5) | fn simple_grad(device: &Device) -> Result<()> {
  function sum_grad (line 19) | fn sum_grad(device: &Device) -> Result<()> {
  function matmul_grad (line 39) | fn matmul_grad(device: &Device) -> Result<()> {
  function grad_descent (line 68) | fn grad_descent(device: &Device) -> Result<()> {
  function unary_grad (line 82) | fn unary_grad(device: &Device) -> Result<()> {
  function binary_grad (line 466) | fn binary_grad(device: &Device) -> Result<()> {
  function test_flip_backprop (line 509) | fn test_flip_backprop() -> Result<()> {

FILE: candle-core/tests/indexing_tests.rs
  function integer_index (line 5) | fn integer_index() -> Result<()> {
  function range_index (line 21) | fn range_index() -> Result<()> {
  function index_3d (line 76) | fn index_3d() -> Result<()> {
  function slice_assign (line 96) | fn slice_assign() -> Result<()> {

FILE: candle-core/tests/layout_tests.rs
  function contiguous (line 4) | fn contiguous(device: &Device) -> Result<()> {
  function strided_blocks (line 55) | fn strided_blocks() -> Result<()> {

FILE: candle-core/tests/matmul_tests.rs
  function matmul (line 3) | fn matmul(device: &Device) -> Result<()> {
  function matmul_bf16 (line 52) | fn matmul_bf16(device: &Device) -> Result<()> {
  function broadcast_matmul (line 66) | fn broadcast_matmul(device: &Device) -> Result<()> {
  function tensor_dot (line 86) | fn tensor_dot() -> Result<()> {
  function tensor_mv (line 96) | fn tensor_mv() -> Result<()> {
  function squeeze_mm (line 106) | fn squeeze_mm(device: &Device) -> Result<()> {
  function mm_layout (line 117) | fn mm_layout(device: &Device) -> Result<()> {

FILE: candle-core/tests/pool_tests.rs
  function avg_pool2d (line 4) | fn avg_pool2d(dev: &Device) -> Result<()> {
  function max_pool2d (line 21) | fn max_pool2d(dev: &Device) -> Result<()> {
  function avg_pool2d_pytorch (line 45) | fn avg_pool2d_pytorch(dev: &Device) -> Result<()> {
  function upsample_nearest2d (line 85) | fn upsample_nearest2d(dev: &Device) -> Result<()> {

FILE: candle-core/tests/pth_tests.rs
  function test_pth (line 3) | fn test_pth() {
  function test_pth_with_key (line 9) | fn test_pth_with_key() {
  function test_pth_fortran_contiguous (line 17) | fn test_pth_fortran_contiguous() {

FILE: candle-core/tests/quantized_tests.rs
  constant GGML_TEST_SIZE (line 11) | const GGML_TEST_SIZE: usize = 32 * 128;
  constant GGML_MAX_QUANTIZATION_TOTAL_ERROR (line 13) | const GGML_MAX_QUANTIZATION_TOTAL_ERROR: f32 = 0.002;
  constant GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS (line 14) | const GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS: f32 = 0.0075;
  constant GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS (line 15) | const GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS: f32 = 0.0040;
  constant GGML_MAX_DOT_PRODUCT_ERROR (line 16) | const GGML_MAX_DOT_PRODUCT_ERROR: f32 = 0.02;
  function test_matmul (line 18) | fn test_matmul(
  function test_matmul_mm (line 57) | fn test_matmul_mm() -> Result<()> {
  function quantized_matmul (line 91) | fn quantized_matmul(device: &Device) -> Result<()> {
  function quantized_matmul_neg (line 151) | fn quantized_matmul_neg(device: &Device) -> Result<()> {
  function qmm_batch (line 223) | fn qmm_batch(dev: &Device) -> Result<()> {
  function quantize_q4_0 (line 264) | fn quantize_q4_0(device: &Device) -> Result<()> {
  function quantize_q4_1 (line 298) | fn quantize_q4_1(device: &Device) -> Result<()> {
  function quantize_q5_0 (line 331) | fn quantize_q5_0(device: &Device) -> Result<()> {
  function quantize_q5_1 (line 364) | fn quantize_q5_1(device: &Device) -> Result<()> {
  function get_test_vector2 (line 395) | fn get_test_vector2(bound: f32, size: usize, device: &Device) -> Result<...
  function round_vector (line 410) | fn round_vector(values: &[f32]) -> Vec<f32> {
  function compare_with_error (line 417) | fn compare_with_error(values: &[f32], expected: &[f32], tolerance: f32) {
  function create_ggml_like_vector (line 430) | fn create_ggml_like_vector(offset: f32) -> Vec<f32> {
  function calculate_rmse (line 437) | fn calculate_rmse(a: &[f32], b: &[f32]) -> f32 {
  function ggml_quantization_error_test (line 450) | fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_e...
  function imatrix_quantize_q6k (line 474) | fn imatrix_quantize_q6k() -> Result<()> {
  function imatrix_quantize_q5k (line 512) | fn imatrix_quantize_q5k() -> Result<()> {
  function imatrix_quantize_q4k (line 550) | fn imatrix_quantize_q4k() -> Result<()> {
  function imatrix_quantize_q3k (line 595) | fn imatrix_quantize_q3k() -> Result<()> {
  function imatrix_quantize_q2k (line 633) | fn imatrix_quantize_q2k() -> Result<()> {
  function quantize_q2k (line 670) | fn quantize_q2k(device: &Device) -> Result<()> {
  function quantize_q3k (line 718) | fn quantize_q3k(device: &Device) -> Result<()> {
  function quantize_q4k (line 765) | fn quantize_q4k(device: &Device) -> Result<()> {
  function quantize_q5k (line 812) | fn quantize_q5k(device: &Device) -> Result<()> {
  function quantize_q6k (line 859) | fn quantize_q6k(device: &Device) -> Result<()> {
  function quantize_q8k (line 906) | fn quantize_q8k(device: &Device) -> Result<()> {
  function vec_dot_reference (line 1015) | fn vec_dot_reference(a: &[f32], b: &[f32]) -> f32 {
  function ggml_reference_matmul_error (line 1020) | fn ggml_reference_matmul_error(dtype: GgmlDType) -> Result<f32> {
  function ggml_matmul_error_test (line 1045) | fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
  function ggml_matmul_error_test_ (line 1060) | fn ggml_matmul_error_test_<T: GgmlType>(a: &[f32], b: &[f32], err_m: f32...
  function quantized_mm (line 1113) | fn quantized_mm() -> Result<()> {
  function get_random_tensors (line 1127) | fn get_random_tensors(
  function quantized_matmul_q2k (line 1250) | fn quantized_matmul_q2k() -> Result<()> {
  function quantized_matmul_q3k (line 1276) | fn quantized_matmul_q3k() -> Result<()> {
  function quantized_matmul_q4k (line 1302) | fn quantized_matmul_q4k() -> Result<()> {
  function quantized_matmul_q5k (line 1328) | fn quantized_matmul_q5k() -> Result<()> {
  function quantized_matmul_q6k (line 1355) | fn quantized_matmul_q6k() -> Result<()> {
  function quantized_matmul_q8k (line 1380) | fn quantized_matmul_q8k() -> Result<()> {

FILE: candle-core/tests/serialization_tests.rs
  type TmpFile (line 3) | struct TmpFile(std::path::PathBuf);
    method create (line 6) | fn create(base: &str) -> TmpFile {
    method as_ref (line 18) | fn as_ref(&self) -> &std::path::Path {
  method drop (line 24) | fn dro

Download .json

Condensed preview — 1049 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (9,458K chars).

[
  {
    "path": ".cargo/config.toml",
    "chars": 244,
    "preview": "[build]\nrustflags = [\"-C\", \"target-cpu=native\"]\n\n[target.wasm32-unknown-unknown]\nrustflags = [\"-C\", \"target-feature=+sim"
  },
  {
    "path": ".github/dependabot.yml",
    "chars": 141,
    "preview": "version: 2\nupdates:\n  - package-ecosystem: \"cargo\"\n    directory: \"/\"\n    schedule:\n      interval: \"weekly\"\n    open-pu"
  },
  {
    "path": ".github/workflows/ci_cuda.yaml",
    "chars": 1108,
    "preview": "name: CI / cuda\n\non:\n  workflow_dispatch:\n  pull_request:\n\njobs:\n  test-cuda:\n    concurrency:\n      group: ${{ github.w"
  },
  {
    "path": ".github/workflows/maturin.yml",
    "chars": 3300,
    "preview": "name: PyO3-Wheels\n\non:\n  push:\n    branches:\n      - main\n    tags:\n      - '*'\n    paths:\n      - candle-pyo3/**\n  pull"
  },
  {
    "path": ".github/workflows/python.yml",
    "chars": 1593,
    "preview": "name: PyO3-CI\n\non:\n  workflow_dispatch:\n  push:\n    branches:\n      - main\n    paths:\n      - candle-pyo3/**\n  pull_requ"
  },
  {
    "path": ".github/workflows/rust-ci.yml",
    "chars": 2838,
    "preview": "on:\n  push:\n    branches:\n      - main\n  pull_request:\n\nname: Continuous integration\n\njobs:\n  check:\n    name: Check\n   "
  },
  {
    "path": ".github/workflows/trufflehog.yml",
    "chars": 268,
    "preview": "on:\n  push:\n\nname: Secret Leaks\n\njobs:\n  trufflehog:\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout code\n "
  },
  {
    "path": ".gitignore",
    "chars": 912,
    "preview": "# Generated by Cargo\n# will have compiled files and executables\ndebug/\ndata/\ndist/\ntarget/\n\n# Remove Cargo.lock from git"
  },
  {
    "path": ".pre-commit-config.yaml",
    "chars": 343,
    "preview": "repos:\n  - repo: https://github.com/Narsil/pre-commit-rust\n    rev: 2eed6366172ef2a5186e8785ec0e67243d7d73d0\n    hooks:\n"
  },
  {
    "path": "CHANGELOG.md",
    "chars": 4071,
    "preview": "# Changelog\nThis documents the main changes to the `candle` crate.\n\n## v0.3.1 - Unreleased\n\n### Added\n\n### Modified\n\n## "
  },
  {
    "path": "Cargo.toml",
    "chars": 3242,
    "preview": "[workspace]\nmembers = [\n    \"candle-core\",\n    \"candle-datasets\",\n    \"candle-examples\",\n    \"candle-nn\",\n    \"candle-py"
  },
  {
    "path": "LICENSE-APACHE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "LICENSE-MIT",
    "chars": 1023,
    "preview": "Permission is hereby granted, free of charge, to any\nperson obtaining a copy of this software and associated\ndocumentati"
  },
  {
    "path": "Makefile",
    "chars": 269,
    "preview": ".PHONY: clean-ptx clean test\n\nclean-ptx:\n\tfind target -name \"*.ptx\" -type f -delete\n\techo \"\" > candle-kernels/src/lib.rs"
  },
  {
    "path": "README.md",
    "chars": 23129,
    "preview": "# candle\n[![discord server](https://dcbadge.limes.pink/api/server/hugging-face-879548962464493619)](https://discord.gg/h"
  },
  {
    "path": "candle-book/.gitignore",
    "chars": 5,
    "preview": "book\n"
  },
  {
    "path": "candle-book/CONTRIBUTING.md",
    "chars": 474,
    "preview": "# Candle Book\n\nThe book uses [mdBook](https://github.com/rust-lang/mdBook) for building.\n\n## Installation\n\nTo install md"
  },
  {
    "path": "candle-book/Cargo.toml",
    "chars": 1448,
    "preview": "[package]\nname = \"candle-book\"\nversion.workspace = true\nedition.workspace = true\ndescription.workspace = true\nrepository"
  },
  {
    "path": "candle-book/book.toml",
    "chars": 115,
    "preview": "[book]\nauthors = [\"Nicolas Patry\"]\nlanguage = \"en\"\nmultilingual = false\nsrc = \"src\"\ntitle = \"Candle Documentation\"\n"
  },
  {
    "path": "candle-book/src/README.md",
    "chars": 149,
    "preview": "# Introduction\n\n{{#include ../../README.md:goals}}\n\n{{#include ../../README.md:features}}\n\nThis book will introduce step"
  },
  {
    "path": "candle-book/src/SUMMARY.md",
    "chars": 877,
    "preview": "# Summary\n\n[Introduction](README.md)\n\n# User Guide\n\n- [Installation](guide/installation.md)\n- [Tutorial - MNIST](guide/m"
  },
  {
    "path": "candle-book/src/advanced/mkl.md",
    "chars": 12,
    "preview": "# Using MKL\n"
  },
  {
    "path": "candle-book/src/apps/README.md",
    "chars": 16,
    "preview": "# Creating apps\n"
  },
  {
    "path": "candle-book/src/apps/desktop.md",
    "chars": 31,
    "preview": "# Creating a desktop Tauri app\n"
  },
  {
    "path": "candle-book/src/apps/rest.md",
    "chars": 32,
    "preview": "# Creating a REST api webserver\n"
  },
  {
    "path": "candle-book/src/apps/wasm.md",
    "chars": 22,
    "preview": "# Creating a WASM app\n"
  },
  {
    "path": "candle-book/src/chapter_1.md",
    "chars": 12,
    "preview": "# Chapter 1\n"
  },
  {
    "path": "candle-book/src/cuda/README.md",
    "chars": 22,
    "preview": "# Advanced Cuda usage\n"
  },
  {
    "path": "candle-book/src/cuda/porting.md",
    "chars": 26,
    "preview": "# Porting a custom kernel\n"
  },
  {
    "path": "candle-book/src/cuda/writing.md",
    "chars": 26,
    "preview": "# Writing a custom kernel\n"
  },
  {
    "path": "candle-book/src/error_manage.md",
    "chars": 4469,
    "preview": "# Error management\n\nYou might have seen in the code base a lot of `.unwrap()` or `?`.\nIf you're unfamiliar with Rust che"
  },
  {
    "path": "candle-book/src/guide/cheatsheet.md",
    "chars": 65,
    "preview": "# Pytorch cheatsheet\n\n{{#include ../../../README.md:cheatsheet}}\n"
  },
  {
    "path": "candle-book/src/guide/hello_world.md",
    "chars": 5176,
    "preview": "# Hello world!\n\nWe will now create the hello world of the ML world, building a model capable of solving MNIST dataset.\n\n"
  },
  {
    "path": "candle-book/src/guide/installation.md",
    "chars": 1424,
    "preview": "# Installation\n\n## 1. Create a new rust app or library\n\n```bash\ncargo new myapp\ncd myapp\n```\n\n## 2. Add the correct cand"
  },
  {
    "path": "candle-book/src/guide/mnist/intro.md",
    "chars": 600,
    "preview": "# Candle MNIST Tutorial\n\n## Introduction\n\nThis tutorial provides an introduction to Candle by implementing and training "
  },
  {
    "path": "candle-book/src/guide/mnist/modeling.md",
    "chars": 4387,
    "preview": "# Candle MNIST Tutorial\n\n## Modeling\n\nOpen `src/main.rs` in your project folder and insert the following code:\n\n```rust\n"
  },
  {
    "path": "candle-book/src/guide/mnist/saving_loading.md",
    "chars": 5413,
    "preview": "# Candle MNIST Tutorial\n\n## Saving and Loading Models\n\nAfter training a model, it is useful to save and subsequently loa"
  },
  {
    "path": "candle-book/src/guide/mnist/training.md",
    "chars": 4207,
    "preview": "# Candle MNIST Tutorial\n\n## Training Implementation\n\nFirst, let's create a utility function `make_linear` that accepts a"
  },
  {
    "path": "candle-book/src/inference/cuda/README.md",
    "chars": 22,
    "preview": "# Advanced Cuda usage\n"
  },
  {
    "path": "candle-book/src/inference/cuda/porting.md",
    "chars": 26,
    "preview": "# Porting a custom kernel\n"
  },
  {
    "path": "candle-book/src/inference/cuda/writing.md",
    "chars": 26,
    "preview": "# Writing a custom kernel\n"
  },
  {
    "path": "candle-book/src/inference/hub.md",
    "chars": 3214,
    "preview": "# Using the hub\n\nInstall the [`hf-hub`](https://github.com/huggingface/hf-hub) crate:\n\n```bash\ncargo add hf-hub\n```\n\nThe"
  },
  {
    "path": "candle-book/src/inference/inference.md",
    "chars": 305,
    "preview": "# Running a model\n\n\nIn order to run an existing model, you will need to download and use existing weights.\nMost models a"
  },
  {
    "path": "candle-book/src/lib.rs",
    "chars": 6787,
    "preview": "#[cfg(test)]\npub mod simplified;\n\n#[cfg(test)]\nmod tests {\n    use anyhow::Result;\n    use candle::{DType, Device, Tenso"
  },
  {
    "path": "candle-book/src/simplified.rs",
    "chars": 6589,
    "preview": "//! #A simplified example in Rust of training a neural network and then using it based on the Candle Framework by Huggin"
  },
  {
    "path": "candle-book/src/tracing.md",
    "chars": 3164,
    "preview": "# Tracing\n\nTracing is a powerful tool for identifying performance issues and bottlenecks in code.\n\n> Profiling on GPUs i"
  },
  {
    "path": "candle-book/src/training/finetuning.md",
    "chars": 14,
    "preview": "# Fine-tuning\n"
  },
  {
    "path": "candle-book/src/training/mnist.md",
    "chars": 336,
    "preview": "# MNIST\n\nSo we now have downloaded the MNIST parquet files, let's put them in a simple struct.\n\n```rust,ignore\n{{#includ"
  },
  {
    "path": "candle-book/src/training/serialization.md",
    "chars": 16,
    "preview": "# Serialization\n"
  },
  {
    "path": "candle-book/src/training/simplified.md",
    "chars": 1885,
    "preview": "# Simplified\n\n## How its works\n\nThis program implements a neural network to predict the winner of the second round of el"
  },
  {
    "path": "candle-book/src/training/training.md",
    "chars": 1111,
    "preview": "# Training\n\n\nTraining starts with data. We're going to use the huggingface hub and \nstart with the Hello world dataset o"
  },
  {
    "path": "candle-core/Cargo.toml",
    "chars": 1994,
    "preview": "[package]\nname = \"candle-core\"\nversion.workspace = true\nedition.workspace = true\ndescription.workspace = true\nrepository"
  },
  {
    "path": "candle-core/LICENSE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "candle-core/README.md",
    "chars": 42,
    "preview": "# candle\nMinimalist ML framework for Rust\n"
  },
  {
    "path": "candle-core/benches/bench_main.rs",
    "chars": 447,
    "preview": "mod benchmarks;\n\nuse criterion::criterion_main;\n\ncriterion_main!(\n    benchmarks::affine::benches,\n    benchmarks::binar"
  },
  {
    "path": "candle-core/benches/benchmarks/affine.rs",
    "chars": 1465,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/benches/benchmarks/binary.rs",
    "chars": 1647,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/benches/benchmarks/broadcast.rs",
    "chars": 1575,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/benches/benchmarks/conv_transpose2d.rs",
    "chars": 1783,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/benches/benchmarks/copy.rs",
    "chars": 1415,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{Device, Tensor, WithDType};\nuse criterion::{"
  },
  {
    "path": "candle-core/benches/benchmarks/matmul.rs",
    "chars": 2447,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/benches/benchmarks/mod.rs",
    "chars": 2232,
    "preview": "pub(crate) mod affine;\npub(crate) mod binary;\npub(crate) mod broadcast;\npub(crate) mod conv_transpose2d;\npub(crate) mod "
  },
  {
    "path": "candle-core/benches/benchmarks/qmatmul.rs",
    "chars": 2105,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{\n    quantized::{self, GgmlDType, QMatMul},\n"
  },
  {
    "path": "candle-core/benches/benchmarks/random.rs",
    "chars": 1852,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/benches/benchmarks/reduce.rs",
    "chars": 4355,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/benches/benchmarks/unary.rs",
    "chars": 2740,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/benches/benchmarks/where_cond.rs",
    "chars": 2026,
    "preview": "use crate::benchmarks::{BenchDevice, BenchDeviceHandler};\nuse candle_core::{DType, Device, Tensor};\nuse criterion::{crit"
  },
  {
    "path": "candle-core/examples/basics.rs",
    "chars": 569,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse an"
  },
  {
    "path": "candle-core/examples/cuda_basics.rs",
    "chars": 769,
    "preview": "#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\n#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\nuse an"
  },
  {
    "path": "candle-core/examples/cuda_sum_benchmark.rs",
    "chars": 1692,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse st"
  },
  {
    "path": "candle-core/examples/metal_basics.rs",
    "chars": 898,
    "preview": "#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\n#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\nuse an"
  },
  {
    "path": "candle-core/src/accelerate.rs",
    "chars": 12889,
    "preview": "#![allow(dead_code)]\nuse libc::{c_char, c_double, c_float, c_int, c_long, c_ulong};\n\nmod ffi {\n    use super::*;\n    ext"
  },
  {
    "path": "candle-core/src/backend.rs",
    "chars": 5124,
    "preview": "//! Traits to Define Backend Behavior\n//!\nuse crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};\nuse crate::{CpuStorage,"
  },
  {
    "path": "candle-core/src/backprop.rs",
    "chars": 39302,
    "preview": "//! Methods for backpropagation of gradients.\nuse crate::op::{BinaryOp, Op, ReduceOp, UnaryOp};\nuse crate::{Error, Resul"
  },
  {
    "path": "candle-core/src/conv.rs",
    "chars": 11619,
    "preview": "//! 1D and 2D Convolutions\n//!\nuse crate::{op::BackpropOp, op::Op, Error, Result, Tensor};\n\n#[derive(Debug, Clone, Parti"
  },
  {
    "path": "candle-core/src/convert.rs",
    "chars": 4765,
    "preview": "//! Implement conversion traits for tensors\nuse crate::{DType, Device, Error, Tensor, WithDType};\nuse half::{bf16, f16, "
  },
  {
    "path": "candle-core/src/cpu/avx.rs",
    "chars": 6121,
    "preview": "use super::{Cpu, CpuBF16, CpuF16};\n#[cfg(target_arch = \"x86\")]\nuse core::arch::x86::*;\n#[cfg(target_arch = \"x86_64\")]\nus"
  },
  {
    "path": "candle-core/src/cpu/erf.rs",
    "chars": 9756,
    "preview": "#![allow(clippy::excessive_precision)]\n// Code taken from https://github.com/statrs-dev/statrs\n//! Provides the [error]("
  },
  {
    "path": "candle-core/src/cpu/kernels.rs",
    "chars": 5713,
    "preview": "pub trait VecOps: num_traits::NumAssign + Copy {\n    fn min(self, rhs: Self) -> Self;\n    fn max(self, rhs: Self) -> Sel"
  },
  {
    "path": "candle-core/src/cpu/mod.rs",
    "chars": 7001,
    "preview": "//! Traits and methods for CPU-backed Tensors\n\npub mod erf;\npub mod kernels;\n\n#[allow(unused)]\ntrait Cpu<const ARR: usiz"
  },
  {
    "path": "candle-core/src/cpu/neon.rs",
    "chars": 1688,
    "preview": "use super::Cpu;\n#[cfg(target_arch = \"arm\")]\nuse core::arch::arm::*;\n\n#[cfg(target_arch = \"aarch64\")]\nuse core::arch::aar"
  },
  {
    "path": "candle-core/src/cpu/simd128.rs",
    "chars": 1541,
    "preview": "use super::Cpu;\nuse core::arch::wasm32::*;\n\npub struct CurrentCpu {}\n\nconst STEP: usize = 16;\nconst EPR: usize = 4;\ncons"
  },
  {
    "path": "candle-core/src/cpu_backend/conv2d.rs",
    "chars": 17253,
    "preview": "use std::borrow::Cow;\n\nuse rayon::iter::{IntoParallelIterator, ParallelIterator};\n\nuse crate::{\n    conv::ParamsConv2D,\n"
  },
  {
    "path": "candle-core/src/cpu_backend/mod.rs",
    "chars": 129149,
    "preview": "//! Implementation of Backend Fns for CPU\nuse crate::backend::{BackendDevice, BackendStorage};\nuse crate::op::{BinaryOpT"
  },
  {
    "path": "candle-core/src/cpu_backend/utils.rs",
    "chars": 16969,
    "preview": "/// Helper functions to write CPU kernels.\nuse crate::backend::BackendStorage;\nuse crate::{Error, Layout, Result, WithDT"
  },
  {
    "path": "candle-core/src/cuda_backend/cudnn.rs",
    "chars": 8121,
    "preview": "use crate::WithDType;\nuse cudarc;\nuse cudarc::cudnn::safe::{ConvForward, Cudnn};\nuse cudarc::driver::{CudaSlice, CudaVie"
  },
  {
    "path": "candle-core/src/cuda_backend/device.rs",
    "chars": 23961,
    "preview": "use crate::backend::{BackendDevice, BackendStorage};\nuse crate::{CpuStorage, CpuStorageRef, DType, Layout, Result, Shape"
  },
  {
    "path": "candle-core/src/cuda_backend/error.rs",
    "chars": 1682,
    "preview": "use crate::{DType, Layout};\n\n/// cudarc related errors\n#[derive(thiserror::Error, Debug)]\npub enum CudaError {\n    #[err"
  },
  {
    "path": "candle-core/src/cuda_backend/mod.rs",
    "chars": 102490,
    "preview": "//! Implementation of Backend traits for CUDA device\n//!\nuse crate::backend::{BackendDevice, BackendStorage};\nuse crate:"
  },
  {
    "path": "candle-core/src/cuda_backend/utils.rs",
    "chars": 7449,
    "preview": "/// Helper functions to plug cuda kernels in candle.\nuse crate::{Layout, Result, WithDType};\npub use cudarc;\nuse cudarc:"
  },
  {
    "path": "candle-core/src/custom_op.rs",
    "chars": 16561,
    "preview": "use crate::op::{BackpropOp, Op};\nuse crate::tensor::from_storage;\nuse crate::{CpuStorage, CudaStorage, Layout, MetalStor"
  },
  {
    "path": "candle-core/src/device.rs",
    "chars": 15213,
    "preview": "use crate::backend::BackendDevice;\nuse crate::cpu_backend::CpuDevice;\nuse crate::{CpuStorage, DType, Result, Shape, Stor"
  },
  {
    "path": "candle-core/src/display.rs",
    "chars": 17747,
    "preview": "//! Pretty printing of tensors\n//!\n//! This implementation should be in line with the [PyTorch version](https://github.c"
  },
  {
    "path": "candle-core/src/dtype.rs",
    "chars": 8170,
    "preview": "//! Types for elements that can be stored and manipulated using tensors.\n#![allow(clippy::redundant_closure_call)]\nuse c"
  },
  {
    "path": "candle-core/src/dummy_cuda_backend.rs",
    "chars": 8475,
    "preview": "//! Implementation of the Cuda backend when Cuda support has not been compiled in.\n//!\n#![allow(dead_code)]\nuse crate::o"
  },
  {
    "path": "candle-core/src/dummy_dtype.rs",
    "chars": 8036,
    "preview": "//! Dummy data types for experimental/future float formats\n//!\n//! These are placeholder types for experimental floating"
  },
  {
    "path": "candle-core/src/dummy_metal_backend.rs",
    "chars": 7147,
    "preview": "#![allow(dead_code)]\nuse crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};\nuse crate::{CpuStorage, DType, Error, Layout"
  },
  {
    "path": "candle-core/src/error.rs",
    "chars": 10898,
    "preview": "//! Candle-specific Error and Result\nuse std::{convert::Infallible, fmt::Display};\n\nuse crate::{DType, DeviceLocation, L"
  },
  {
    "path": "candle-core/src/indexer.rs",
    "chars": 7912,
    "preview": "use crate::{Error, Tensor};\nuse std::ops::{\n    Bound, Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, RangeTo"
  },
  {
    "path": "candle-core/src/layout.rs",
    "chars": 8632,
    "preview": "//! Tensor Layouts including contiguous or sparse strides\nuse crate::{Error, Result, Shape};\n\n#[derive(Debug, PartialEq,"
  },
  {
    "path": "candle-core/src/lib.rs",
    "chars": 5216,
    "preview": "//! ML framework for Rust\n//!\n//! ```rust\n//! use candle_core::{Tensor, DType, Device};\n//! # use candle_core::Error;\n//"
  },
  {
    "path": "candle-core/src/metal_backend/device.rs",
    "chars": 10385,
    "preview": "use crate::{DType, Result};\n\n#[cfg(feature = \"ug\")]\nuse candle_metal_kernels::metal::ComputePipeline;\nuse candle_metal_k"
  },
  {
    "path": "candle-core/src/metal_backend/mod.rs",
    "chars": 84528,
    "preview": "//! Implementation of Backend traits for Metal\n//!\nuse crate::backend::{BackendDevice, BackendStorage};\nuse crate::conv:"
  },
  {
    "path": "candle-core/src/mkl.rs",
    "chars": 11567,
    "preview": "#![allow(dead_code)]\nuse libc::{c_char, c_double, c_float, c_int};\n\nmod ffi {\n    use super::*;\n    extern \"C\" {\n       "
  },
  {
    "path": "candle-core/src/npy.rs",
    "chars": 18226,
    "preview": "//! Numpy support for tensors.\n//!\n//! The spec for the npy format can be found in\n//! [npy-format](https://docs.scipy.o"
  },
  {
    "path": "candle-core/src/op.rs",
    "chars": 27977,
    "preview": "//! Tensor Operation Enums and Traits\n//!\n#![allow(clippy::redundant_closure_call)]\nuse crate::Tensor;\nuse float8::F8E4M"
  },
  {
    "path": "candle-core/src/pickle.rs",
    "chars": 27360,
    "preview": "//! Just enough pickle support to be able to read PyTorch checkpoints.\n// This hardcodes objects that are required for t"
  },
  {
    "path": "candle-core/src/quantized/avx.rs",
    "chars": 28323,
    "preview": "use super::k_quants::{\n    BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K"
  },
  {
    "path": "candle-core/src/quantized/cuda.rs",
    "chars": 36078,
    "preview": "use super::{GgmlDType, QStorage};\nuse crate::quantized::k_quants::GgmlType;\nuse crate::{backend::BackendDevice, cuda_bac"
  },
  {
    "path": "candle-core/src/quantized/dummy_cuda.rs",
    "chars": 2374,
    "preview": "#![allow(unused)]\nuse super::GgmlDType;\nuse crate::{CudaDevice, CudaStorage, Error, Result};\n\npub struct QCudaStorage {\n"
  },
  {
    "path": "candle-core/src/quantized/dummy_metal.rs",
    "chars": 2161,
    "preview": "#![allow(unused)]\nuse super::GgmlDType;\nuse crate::{Error, MetalDevice, MetalStorage, Result};\n\npub struct QMetalStorage"
  },
  {
    "path": "candle-core/src/quantized/ggml_file.rs",
    "chars": 9229,
    "preview": "//! Support for the GGML file format.\n\nuse super::{k_quants, GgmlDType, QStorage};\nuse crate::{Device, Result};\nuse byte"
  },
  {
    "path": "candle-core/src/quantized/gguf_file.rs",
    "chars": 18416,
    "preview": "//! Support for the [GGUF file format](https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md).\n//!\n//! Spec: https"
  },
  {
    "path": "candle-core/src/quantized/imatrix_file.rs",
    "chars": 2580,
    "preview": "use std::collections::HashMap;\nuse std::fs::File;\nuse std::io::{Cursor, Read};\nuse std::path::Path;\n\nuse byteorder::{Lit"
  },
  {
    "path": "candle-core/src/quantized/k_quants.rs",
    "chars": 87195,
    "preview": "use super::utils::{\n    get_scale_min_k4, group_for_dequantization, group_for_quantization, make_q3_quants,\n    make_qkx"
  },
  {
    "path": "candle-core/src/quantized/metal.rs",
    "chars": 14774,
    "preview": "use super::{GgmlDType, QStorage};\nuse crate::backend::BackendStorage;\nuse crate::{DType, MetalDevice, MetalStorage, Resu"
  },
  {
    "path": "candle-core/src/quantized/mod.rs",
    "chars": 32348,
    "preview": "use crate::{\n    backend::BackendStorage, CpuStorage, DType, Device, Result, Shape, Storage, Tensor, D,\n};\nuse k_quants:"
  },
  {
    "path": "candle-core/src/quantized/neon.rs",
    "chars": 22872,
    "preview": "use super::k_quants::{\n    BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K"
  },
  {
    "path": "candle-core/src/quantized/simd128.rs",
    "chars": 17120,
    "preview": "use super::k_quants::{BlockQ2K, BlockQ4K, BlockQ4_0, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K};\nuse byteorder::{ByteOr"
  },
  {
    "path": "candle-core/src/quantized/tokenizer.rs",
    "chars": 11226,
    "preview": "use crate::quantized::gguf_file;\nuse crate::{Context, Error, Result};\nuse std::collections::HashSet;\nuse tokenizers::{\n "
  },
  {
    "path": "candle-core/src/quantized/utils.rs",
    "chars": 16370,
    "preview": "pub(super) fn nearest_int(v: f32) -> i32 {\n    v.round() as i32\n}\n\n/// Validates that the input and output are the right"
  },
  {
    "path": "candle-core/src/safetensors.rs",
    "chars": 25322,
    "preview": "//! Module to load `safetensor` files into CPU/GPU memory.\n//!\n//! There are multiple ways to load tensors from safetens"
  },
  {
    "path": "candle-core/src/scalar.rs",
    "chars": 3356,
    "preview": "//! TensorScalar Enum and Trait\n//!\nuse crate::{DType, Result, Tensor, WithDType};\nuse float8::F8E4M3 as f8e4m3;\nuse hal"
  },
  {
    "path": "candle-core/src/shape.rs",
    "chars": 19165,
    "preview": "//! The shape of a tensor is a tuple with the size of each of its dimensions.\n#![allow(clippy::redundant_closure_call)]\n"
  },
  {
    "path": "candle-core/src/sort.rs",
    "chars": 11046,
    "preview": "use crate::{Result, Tensor};\nuse rayon::prelude::*;\n\n#[derive(Debug, Clone, Copy)]\nstruct ArgSort {\n    asc: bool,\n    l"
  },
  {
    "path": "candle-core/src/storage.rs",
    "chars": 29392,
    "preview": "use crate::backend::BackendStorage;\nuse crate::op::{self, CmpOp, ReduceOp};\nuse crate::scalar::Scalar;\nuse crate::{CpuSt"
  },
  {
    "path": "candle-core/src/streaming.rs",
    "chars": 6138,
    "preview": "//! StreamTensror useful for streaming ops.\n//!\nuse crate::{Result, Shape, Tensor};\n\npub trait Dim: crate::shape::Dim + "
  },
  {
    "path": "candle-core/src/strided_index.rs",
    "chars": 2453,
    "preview": "use crate::Layout;\n\n/// An iterator over offset position for items of an N-dimensional arrays stored in a\n/// flat buffe"
  },
  {
    "path": "candle-core/src/tensor.rs",
    "chars": 116065,
    "preview": "//! Tensors are N-dimensional matrixes of elements using a single data type.\n#![allow(clippy::redundant_closure_call)]\nu"
  },
  {
    "path": "candle-core/src/tensor_cat.rs",
    "chars": 11055,
    "preview": "use crate::{shape::Dim, Context, Error, Result, Shape, Tensor};\n\nimpl Tensor {\n    /// Concatenates two or more tensors "
  },
  {
    "path": "candle-core/src/test_utils.rs",
    "chars": 2113,
    "preview": "use crate::{Result, Tensor};\n\n#[macro_export]\nmacro_rules! test_device {\n    // TODO: Switch to generating the two last "
  },
  {
    "path": "candle-core/src/utils.rs",
    "chars": 888,
    "preview": "//! Useful functions for checking features.\nuse std::str::FromStr;\n\npub fn get_num_threads() -> usize {\n    // Respond t"
  },
  {
    "path": "candle-core/src/variable.rs",
    "chars": 4731,
    "preview": "// Variables are wrappers around tensors that can be modified, they are typically used for holding\n// weights and being "
  },
  {
    "path": "candle-core/tests/bilinear_tests.rs",
    "chars": 17560,
    "preview": "use candle_core::{test_device, Device, IndexOp, Result, Tensor};\n\n// ==================================================="
  },
  {
    "path": "candle-core/tests/conv_tests.rs",
    "chars": 36233,
    "preview": "use anyhow::Result;\nuse candle_core::{test_device, test_utils, Device, IndexOp, Tensor};\n\n/* This test is based on the f"
  },
  {
    "path": "candle-core/tests/custom_op_tests.rs",
    "chars": 5395,
    "preview": "use candle_core::backend::BackendStorage;\nuse candle_core::cpu_backend;\nuse candle_core::test_utils::to_vec1_round;\nuse "
  },
  {
    "path": "candle-core/tests/display_tests.rs",
    "chars": 2794,
    "preview": "use anyhow::Result;\nuse candle_core::{DType, Device::Cpu, Tensor};\n\n#[test]\nfn display_scalar() -> Result<()> {\n    let "
  },
  {
    "path": "candle-core/tests/grad_tests.rs",
    "chars": 17898,
    "preview": "#![allow(clippy::approx_constant)]\nuse anyhow::{Context, Result};\nuse candle_core::{test_device, test_utils, DType, Devi"
  },
  {
    "path": "candle-core/tests/indexing_tests.rs",
    "chars": 3728,
    "preview": "use anyhow::Result;\nuse candle_core::{Device, IndexOp, Tensor};\n\n#[test]\nfn integer_index() -> Result<()> {\n    let dev "
  },
  {
    "path": "candle-core/tests/layout_tests.rs",
    "chars": 5185,
    "preview": "use candle::{test_device, Device, IndexOp, Result, Tensor};\nuse candle_core as candle;\n\nfn contiguous(device: &Device) -"
  },
  {
    "path": "candle-core/tests/matmul_tests.rs",
    "chars": 5382,
    "preview": "use candle_core::{test_device, DType, Device, IndexOp, Result, Tensor};\n\nfn matmul(device: &Device) -> Result<()> {\n    "
  },
  {
    "path": "candle-core/tests/npy.py",
    "chars": 193,
    "preview": "import numpy as np\nx = np.arange(10)\n\n# Write a npy file.\nnp.save(\"test.npy\", x)\n\n# Write multiple values to a npz file."
  },
  {
    "path": "candle-core/tests/pool_tests.rs",
    "chars": 3639,
    "preview": "use candle_core::{test_device, test_utils, Device, IndexOp, Result, Tensor};\n\n// https://github.com/huggingface/candle/i"
  },
  {
    "path": "candle-core/tests/pth.py",
    "chars": 1432,
    "preview": "import torch\nfrom collections import OrderedDict\n\n# Write a trivial tensor to a pt file\na= torch.tensor([[1,2,3,4], [5,6"
  },
  {
    "path": "candle-core/tests/pth_tests.rs",
    "chars": 919,
    "preview": "/// Regression test for pth files not loading on Windows.\n#[test]\nfn test_pth() {\n    let tensors = candle_core::pickle:"
  },
  {
    "path": "candle-core/tests/quantized_tests.rs",
    "chars": 49755,
    "preview": "use candle_core::{\n    bail,\n    quantized::{self, GgmlDType},\n    test_device,\n    test_utils::to_vec2_round,\n    DType"
  },
  {
    "path": "candle-core/tests/serialization_tests.rs",
    "chars": 1905,
    "preview": "use candle_core::{DType, Result, Tensor};\n\nstruct TmpFile(std::path::PathBuf);\n\nimpl TmpFile {\n    fn create(base: &str)"
  },
  {
    "path": "candle-core/tests/tensor_tests.rs",
    "chars": 66041,
    "preview": "use candle_core::{test_device, test_utils, DType, Device, IndexOp, Result, Tensor, D};\nuse float8::F8E4M3;\n\nfn zeros(dev"
  },
  {
    "path": "candle-datasets/Cargo.toml",
    "chars": 643,
    "preview": "[package]\nname = \"candle-datasets\"\nversion.workspace = true\nedition.workspace = true\ndescription.workspace = true\nreposi"
  },
  {
    "path": "candle-datasets/README.md",
    "chars": 18,
    "preview": "# candle-datasets\n"
  },
  {
    "path": "candle-datasets/src/batcher.rs",
    "chars": 5261,
    "preview": "use candle::{Result, Tensor};\n\npub struct Batcher<I> {\n    inner: I,\n    batch_size: usize,\n    return_last_incomplete_b"
  },
  {
    "path": "candle-datasets/src/hub.rs",
    "chars": 2842,
    "preview": "use hf_hub::{\n    api::sync::{Api, ApiRepo},\n    Repo, RepoType,\n};\nuse parquet::file::reader::SerializedFileReader;\nuse"
  },
  {
    "path": "candle-datasets/src/lib.rs",
    "chars": 124,
    "preview": "//! Datasets & Dataloaders for Candle\npub mod batcher;\npub mod hub;\npub mod nlp;\npub mod vision;\n\npub use batcher::Batch"
  },
  {
    "path": "candle-datasets/src/nlp/mod.rs",
    "chars": 21,
    "preview": "pub mod tinystories;\n"
  },
  {
    "path": "candle-datasets/src/nlp/tinystories.rs",
    "chars": 4064,
    "preview": "//! Helper functions for the tinystories dataset. This uses the pre-tokenized version as generated\n//! by the tools from"
  },
  {
    "path": "candle-datasets/src/vision/cifar.rs",
    "chars": 5017,
    "preview": "//! The CIFAR-10 dataset.\n//!\n//! The files can be downloaded from the following page:\n//! <https://www.cs.toronto.edu/~"
  },
  {
    "path": "candle-datasets/src/vision/fashion_mnist.rs",
    "chars": 483,
    "preview": "//! Zalando Fashion MNIST dataset.\n//! A slightly more difficult dataset that is drop-in compatible with MNIST.\n//!\n//! "
  },
  {
    "path": "candle-datasets/src/vision/mnist.rs",
    "chars": 4990,
    "preview": "//! The MNIST hand-written digit dataset.\n//!\n//! The files can be obtained from the following link:\n//! <http://yann.le"
  },
  {
    "path": "candle-datasets/src/vision/mod.rs",
    "chars": 239,
    "preview": "use candle::Tensor;\n\npub struct Dataset {\n    pub train_images: Tensor,\n    pub train_labels: Tensor,\n    pub test_image"
  },
  {
    "path": "candle-examples/Cargo.toml",
    "chars": 4221,
    "preview": "[package]\nname = \"candle-examples\"\nversion.workspace = true\nedition.workspace = true\ndescription.workspace = true\nreposi"
  },
  {
    "path": "candle-examples/README.md",
    "chars": 18,
    "preview": "# candle-examples\n"
  },
  {
    "path": "candle-examples/build.rs",
    "chars": 1787,
    "preview": "#![allow(unused)]\nmod buildtime_downloader;\nuse buildtime_downloader::download_model;\n\nstruct KernelDirectories {\n    ke"
  },
  {
    "path": "candle-examples/buildtime_downloader.rs",
    "chars": 1101,
    "preview": "use anyhow::Result;\nuse hf_hub::{api::sync::Api, Repo, RepoType};\n\npub fn download_model(model_and_revision: &str) -> Re"
  },
  {
    "path": "candle-examples/examples/based/README.md",
    "chars": 930,
    "preview": "# candle-based\n\nExperimental, not instruction-tuned small LLM from the Hazy Research group, combining local and linear a"
  },
  {
    "path": "candle-examples/examples/based/main.rs",
    "chars": 8073,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse an"
  },
  {
    "path": "candle-examples/examples/beit/README.md",
    "chars": 650,
    "preview": "# candle-beit\n\n[Beit](https://arxiv.org/abs/2106.08254) is a computer vision model.\nIn this example, it is used as an Im"
  },
  {
    "path": "candle-examples/examples/beit/main.rs",
    "chars": 2597,
    "preview": "//! BEiT: BERT Pre-Training of Image Transformers\n//! https://github.com/microsoft/unilm/tree/master/beit\n\n#[cfg(feature"
  },
  {
    "path": "candle-examples/examples/bert/README.md",
    "chars": 3363,
    "preview": "# candle-bert\n\nBert is a general large language model. In this example it can be used for two\ndifferent tasks:\n\n- Comput"
  },
  {
    "path": "candle-examples/examples/bert/main.rs",
    "chars": 8759,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\nuse can"
  },
  {
    "path": "candle-examples/examples/bert_single_file_binary/README.md",
    "chars": 4592,
    "preview": "# candle_bert_single_file_binary\n\nThis is an adapted version of the Candle Bert example to inline (embed) the model file"
  },
  {
    "path": "candle-examples/examples/bert_single_file_binary/main.rs",
    "chars": 7264,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\nuse can"
  },
  {
    "path": "candle-examples/examples/bigcode/README.md",
    "chars": 444,
    "preview": "# candle-starcoder: code generation model\n\n[StarCoder/BigCode](https://huggingface.co/bigcode/starcoderbase-1b) is a LLM"
  },
  {
    "path": "candle-examples/examples/bigcode/main.rs",
    "chars": 4613,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse an"
  },
  {
    "path": "candle-examples/examples/blip/README.md",
    "chars": 552,
    "preview": "# candle-blip\n\nThe\n[blip-image-captioning](https://huggingface.co/Salesforce/blip-image-captioning-base)\nmodel can gener"
  },
  {
    "path": "candle-examples/examples/blip/main.rs",
    "chars": 5303,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse an"
  },
  {
    "path": "candle-examples/examples/chatglm/README.md",
    "chars": 280,
    "preview": "# candle-chatglm\n\nUses `THUDM/chatglm3-6b` to generate chinese text. Will not generate text for english (usually).\n \n## "
  },
  {
    "path": "candle-examples/examples/chatglm/main.rs",
    "chars": 7373,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse an"
  },
  {
    "path": "candle-examples/examples/chinese_clip/README.md",
    "chars": 2233,
    "preview": "# candle-chinese-clip\n\nContrastive Language-Image Pre-Training (CLIP) is an architecture trained on\npairs of images with"
  },
  {
    "path": "candle-examples/examples/chinese_clip/main.rs",
    "chars": 6712,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse ca"
  },
  {
    "path": "candle-examples/examples/clip/README.md",
    "chars": 1950,
    "preview": "# candle-clip\n\nContrastive Language-Image Pre-Training (CLIP) is an architecture trained on\npairs of images with related"
  },
  {
    "path": "candle-examples/examples/clip/main.rs",
    "chars": 5327,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse an"
  },
  {
    "path": "candle-examples/examples/codegeex4-9b/README.org",
    "chars": 2881,
    "preview": "* candle-codegeex4_9b\nTHUDM/CodeGeeX4 is a versatile model for all AI software development scenarios, including code com"
  },
  {
    "path": "candle-examples/examples/codegeex4-9b/main.rs",
    "chars": 8150,
    "preview": "use candle::{DType, Device, Tensor};\nuse candle_nn::VarBuilder;\nuse candle_transformers::generation::LogitsProcessor;\nus"
  },
  {
    "path": "candle-examples/examples/colpali/README.md",
    "chars": 465,
    "preview": "# Colpali\n\n[HuggingFace Model Card](https://huggingface.co/vidore/colpali-v1.2-merged)\n\n```\nwget https://arxiv.org/pdf/1"
  },
  {
    "path": "candle-examples/examples/colpali/main.rs",
    "chars": 8020,
    "preview": "use anyhow::{Error as E, Result};\nuse candle::{DType, Device, Tensor};\nuse candle_nn::VarBuilder;\nuse candle_transformer"
  },
  {
    "path": "candle-examples/examples/convmixer/README.md",
    "chars": 646,
    "preview": "# candle-convmixer\n\nA lightweight CNN architecture that processes image patches similar to a vision transformer, with se"
  },
  {
    "path": "candle-examples/examples/convmixer/main.rs",
    "chars": 1644,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse cl"
  },
  {
    "path": "candle-examples/examples/convnext/README.md",
    "chars": 793,
    "preview": "# candle-convnext\n\n[A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) and\n[ConvNeXt V2: Co-designing and Scalin"
  },
  {
    "path": "candle-examples/examples/convnext/main.rs",
    "chars": 3847,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse cl"
  },
  {
    "path": "candle-examples/examples/csm/README.md",
    "chars": 486,
    "preview": "# Conversational Speech Model (CSM)\n\nCSM is a speech generation model from Sesame,\n[SesameAILabs/csm](https://github.com"
  },
  {
    "path": "candle-examples/examples/csm/main.rs",
    "chars": 7565,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse an"
  },
  {
    "path": "candle-examples/examples/custom-ops/README.md",
    "chars": 532,
    "preview": "# candle-custom-ops\n\n This example illustrates how to implement forward and backward passes for custom operations on the"
  },
  {
    "path": "candle-examples/examples/custom-ops/cuda_kernels.rs",
    "chars": 102,
    "preview": "pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!(\"OUT_DIR\"), \"/layernorm_kernels.ptx\"));\n"
  },
  {
    "path": "candle-examples/examples/custom-ops/kernels/layernorm_kernels.cu",
    "chars": 1274,
    "preview": "#include <stdint.h>\n#include \"reduction_utils.cuh\"\n\ntemplate <typename scalar_t>\n__device__ void\nrms_norm_kernel(scalar_"
  },
  {
    "path": "candle-examples/examples/custom-ops/kernels/reduction_utils.cuh",
    "chars": 1558,
    "preview": "/*\n * Adapted from\n * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/re"
  },
  {
    "path": "candle-examples/examples/custom-ops/main.rs",
    "chars": 3390,
    "preview": "// This example illustrates how to implement custom operations. These operations can provide their\n// own forward pass ("
  },
  {
    "path": "candle-examples/examples/debertav2/README.md",
    "chars": 11118,
    "preview": "## debertav2\n\nThis is a port of the DebertaV2/V3 model codebase for use in `candle`. It works with both locally fine-tun"
  },
  {
    "path": "candle-examples/examples/debertav2/main.rs",
    "chars": 13646,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse st"
  },
  {
    "path": "candle-examples/examples/deepseekv2/README.md",
    "chars": 799,
    "preview": "# DeepSeek V2\n\nDeepSeek V2 an MoE model featuring MLA (Multi-Latent Attention). There is a lite (16B) and a full (236B) "
  },
  {
    "path": "candle-examples/examples/deepseekv2/main.rs",
    "chars": 8651,
    "preview": "#[cfg(feature = \"mkl\")]\nextern crate intel_mkl_src;\n\n#[cfg(feature = \"accelerate\")]\nextern crate accelerate_src;\n\nuse an"
  }
]

// ... and 849 more files (download for full content)

About this extraction

This page contains the full source code of the huggingface/candle GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 1049 files (8.7 MB), approximately 2.3M tokens, and a symbol index with 10803 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo