Repository: xorbitsai/inference
Branch: main
Commit: ebc027138775
Files: 1635
Total size: 67.4 MB
Directory structure:
gitextract_u_nl6j7f/
├── .dockerignore
├── .gitattributes
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.yaml
│ │ └── feature_request.yaml
│ └── workflows/
│ ├── assign.yaml
│ ├── docker-cd.yaml
│ ├── issue.yaml
│ ├── pr_auto_run_gen_docs.yaml
│ ├── python.yaml
│ └── release.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_ja_JP.md
├── README_zh_CN.md
├── benchmark/
│ ├── README.md
│ ├── benchmark_embedding.py
│ ├── benchmark_latency.py
│ ├── benchmark_long.py
│ ├── benchmark_rerank.py
│ ├── benchmark_runner.py
│ ├── benchmark_serving.py
│ └── utils.py
├── doc/
│ ├── Makefile
│ ├── source/
│ │ ├── _static/
│ │ │ └── switcher.json
│ │ ├── conf.py
│ │ ├── development/
│ │ │ ├── contributing_codebase.rst
│ │ │ ├── contributing_environment.rst
│ │ │ ├── index.rst
│ │ │ └── xinference_internals.rst
│ │ ├── examples/
│ │ │ ├── ai_podcast.rst
│ │ │ ├── chatbot.rst
│ │ │ ├── gradio_chatinterface.rst
│ │ │ ├── index.rst
│ │ │ ├── langchain_streamlit_doc_chat.rst
│ │ │ └── pdf_chatbot.rst
│ │ ├── gen_docs.py
│ │ ├── getting_started/
│ │ │ ├── environments.rst
│ │ │ ├── index.rst
│ │ │ ├── installation.rst
│ │ │ ├── installation_npu.rst
│ │ │ ├── logging.rst
│ │ │ ├── release_notes.rst
│ │ │ ├── troubleshooting.rst
│ │ │ ├── using_docker_image.rst
│ │ │ ├── using_kubernetes.rst
│ │ │ └── using_xinference.rst
│ │ ├── index.rst
│ │ ├── locale/
│ │ │ └── zh_CN/
│ │ │ └── LC_MESSAGES/
│ │ │ ├── development/
│ │ │ │ ├── contributing_codebase.po
│ │ │ │ ├── contributing_environment.po
│ │ │ │ ├── index.po
│ │ │ │ └── xinference_internals.po
│ │ │ ├── examples/
│ │ │ │ ├── ai_podcast.po
│ │ │ │ ├── chatbot.po
│ │ │ │ ├── gradio_chatinterface.po
│ │ │ │ ├── index.po
│ │ │ │ ├── langchain_streamlit_doc_chat.po
│ │ │ │ └── pdf_chatbot.po
│ │ │ ├── getting_started/
│ │ │ │ ├── environments.po
│ │ │ │ ├── index.po
│ │ │ │ ├── installation.po
│ │ │ │ ├── installation_npu.po
│ │ │ │ ├── logging.po
│ │ │ │ ├── release_notes.po
│ │ │ │ ├── troubleshooting.po
│ │ │ │ ├── using_docker_image.po
│ │ │ │ ├── using_kubernetes.po
│ │ │ │ └── using_xinference.po
│ │ │ ├── getting_started.po
│ │ │ ├── index.po
│ │ │ ├── models/
│ │ │ │ ├── builtin/
│ │ │ │ │ ├── audio/
│ │ │ │ │ │ └── index.po
│ │ │ │ │ ├── embedding/
│ │ │ │ │ │ ├── bge-base-en-v1.5.po
│ │ │ │ │ │ ├── bge-base-en.po
│ │ │ │ │ │ ├── bge-base-zh-v1.5.po
│ │ │ │ │ │ ├── bge-base-zh.po
│ │ │ │ │ │ ├── bge-large-en-v1.5.po
│ │ │ │ │ │ ├── bge-large-en.po
│ │ │ │ │ │ ├── bge-large-zh-noinstruct.po
│ │ │ │ │ │ ├── bge-large-zh-v1.5.po
│ │ │ │ │ │ ├── bge-large-zh.po
│ │ │ │ │ │ ├── bge-small-en-v1.5.po
│ │ │ │ │ │ ├── bge-small-zh-v1.5.po
│ │ │ │ │ │ ├── bge-small-zh.po
│ │ │ │ │ │ ├── e5-large-v2.po
│ │ │ │ │ │ ├── gte-base.po
│ │ │ │ │ │ ├── gte-large.po
│ │ │ │ │ │ ├── index.po
│ │ │ │ │ │ ├── jina-embeddings-v2-base-en.po
│ │ │ │ │ │ ├── jina-embeddings-v2-small-en.po
│ │ │ │ │ │ └── multilingual-e5-large.po
│ │ │ │ │ ├── image/
│ │ │ │ │ │ ├── flux.1-dev.po
│ │ │ │ │ │ ├── flux.1-schnell.po
│ │ │ │ │ │ ├── index.po
│ │ │ │ │ │ ├── kolors.po
│ │ │ │ │ │ ├── sd-turbo.po
│ │ │ │ │ │ ├── sd3-medium.po
│ │ │ │ │ │ ├── sdxl-turbo.po
│ │ │ │ │ │ ├── stable-diffusion-2-inpainting.po
│ │ │ │ │ │ ├── stable-diffusion-inpainting.po
│ │ │ │ │ │ ├── stable-diffusion-v1.5.po
│ │ │ │ │ │ ├── stable-diffusion-xl-base-1.0.po
│ │ │ │ │ │ └── stable-diffusion-xl-inpainting.po
│ │ │ │ │ ├── index.po
│ │ │ │ │ ├── llm/
│ │ │ │ │ │ ├── baichuan-2-chat.po
│ │ │ │ │ │ ├── baichuan-2.po
│ │ │ │ │ │ ├── baichuan-chat.po
│ │ │ │ │ │ ├── baichuan.po
│ │ │ │ │ │ ├── chatglm.po
│ │ │ │ │ │ ├── chatglm2-32k.po
│ │ │ │ │ │ ├── chatglm2.po
│ │ │ │ │ │ ├── chatglm3-32k.po
│ │ │ │ │ │ ├── chatglm3.po
│ │ │ │ │ │ ├── code-llama-instruct.po
│ │ │ │ │ │ ├── code-llama-python.po
│ │ │ │ │ │ ├── code-llama.po
│ │ │ │ │ │ ├── deepseek-chat.po
│ │ │ │ │ │ ├── deepseek-coder-instruct.po
│ │ │ │ │ │ ├── falcon-instruct.po
│ │ │ │ │ │ ├── falcon.po
│ │ │ │ │ │ ├── glaive-coder.po
│ │ │ │ │ │ ├── gorilla-openfunctions-v1.po
│ │ │ │ │ │ ├── gpt-2.po
│ │ │ │ │ │ ├── index.po
│ │ │ │ │ │ ├── internlm-20b.po
│ │ │ │ │ │ ├── internlm-7b.po
│ │ │ │ │ │ ├── internlm-chat-20b.po
│ │ │ │ │ │ ├── internlm-chat-7b.po
│ │ │ │ │ │ ├── llama-2-chat.po
│ │ │ │ │ │ ├── llama-2.po
│ │ │ │ │ │ ├── mistral-instruct-v0.1.po
│ │ │ │ │ │ ├── mistral-instruct-v0.2.po
│ │ │ │ │ │ ├── mistral-v0.1.po
│ │ │ │ │ │ ├── mixtral-instruct-v0.1.po
│ │ │ │ │ │ ├── mixtral-v0.1.po
│ │ │ │ │ │ ├── openbuddy.po
│ │ │ │ │ │ ├── openhermes-2.5.po
│ │ │ │ │ │ ├── opt.po
│ │ │ │ │ │ ├── orca.po
│ │ │ │ │ │ ├── qwen-chat.po
│ │ │ │ │ │ ├── starchat-beta.po
│ │ │ │ │ │ ├── starcoder.po
│ │ │ │ │ │ ├── starcoderplus.po
│ │ │ │ │ │ ├── tiny-llama.po
│ │ │ │ │ │ ├── vicuna-v1.3.po
│ │ │ │ │ │ ├── vicuna-v1.5-16k.po
│ │ │ │ │ │ ├── vicuna-v1.5.po
│ │ │ │ │ │ ├── wizardcoder-python-v1.0.po
│ │ │ │ │ │ ├── wizardlm-v1.0.po
│ │ │ │ │ │ ├── wizardmath-v1.0.po
│ │ │ │ │ │ ├── xverse-chat.po
│ │ │ │ │ │ ├── xverse.po
│ │ │ │ │ │ ├── yi-200k.po
│ │ │ │ │ │ ├── yi-chat.po
│ │ │ │ │ │ ├── yi.po
│ │ │ │ │ │ ├── zephyr-7b-alpha.po
│ │ │ │ │ │ └── zephyr-7b-beta.po
│ │ │ │ │ ├── rerank/
│ │ │ │ │ │ ├── bge-reranker-base.po
│ │ │ │ │ │ ├── bge-reranker-large.po
│ │ │ │ │ │ └── index.po
│ │ │ │ │ └── video/
│ │ │ │ │ ├── cogvideox-2b.po
│ │ │ │ │ └── index.po
│ │ │ │ ├── custom.po
│ │ │ │ ├── index.po
│ │ │ │ ├── lora.po
│ │ │ │ ├── model_abilities/
│ │ │ │ │ ├── audio.po
│ │ │ │ │ ├── chat.po
│ │ │ │ │ ├── embed.po
│ │ │ │ │ ├── flexible.po
│ │ │ │ │ ├── image.po
│ │ │ │ │ ├── index.po
│ │ │ │ │ ├── multimodal.po
│ │ │ │ │ ├── rerank.po
│ │ │ │ │ ├── tools.po
│ │ │ │ │ └── video.po
│ │ │ │ ├── model_memory.po
│ │ │ │ ├── model_update.po
│ │ │ │ ├── source/
│ │ │ │ │ └── source.po
│ │ │ │ ├── sources/
│ │ │ │ │ └── sources.po
│ │ │ │ ├── virtualenv.po
│ │ │ │ ├── xinference_model_hub.po
│ │ │ │ └── xinference_models_hub.po
│ │ │ ├── reference/
│ │ │ │ └── index.po
│ │ │ ├── reference.po
│ │ │ └── user_guide/
│ │ │ ├── auth_system.po
│ │ │ ├── backends.po
│ │ │ ├── cache_management.po
│ │ │ ├── client_api.po
│ │ │ ├── continuous_batching.po
│ │ │ ├── distributed_inference.po
│ │ │ ├── index.po
│ │ │ ├── launch.po
│ │ │ └── vllm_enhancement.po
│ │ ├── models/
│ │ │ ├── builtin/
│ │ │ │ ├── audio/
│ │ │ │ │ ├── belle-distilwhisper-large-v2-zh.rst
│ │ │ │ │ ├── belle-whisper-large-v2-zh.rst
│ │ │ │ │ ├── belle-whisper-large-v3-zh.rst
│ │ │ │ │ ├── chattts.rst
│ │ │ │ │ ├── cosyvoice-300m-instruct.rst
│ │ │ │ │ ├── cosyvoice-300m-sft.rst
│ │ │ │ │ ├── cosyvoice-300m.rst
│ │ │ │ │ ├── cosyvoice2-0.5b.rst
│ │ │ │ │ ├── f5-tts-mlx.rst
│ │ │ │ │ ├── f5-tts.rst
│ │ │ │ │ ├── fishspeech-1.5.rst
│ │ │ │ │ ├── fun-asr-mlt-nano-2512.rst
│ │ │ │ │ ├── fun-asr-nano-2512.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── indextts2.rst
│ │ │ │ │ ├── kokoro-82m-mlx.rst
│ │ │ │ │ ├── kokoro-82m-v1.1-zh.rst
│ │ │ │ │ ├── kokoro-82m.rst
│ │ │ │ │ ├── megatts3.rst
│ │ │ │ │ ├── melotts-chinese.rst
│ │ │ │ │ ├── melotts-english-v2.rst
│ │ │ │ │ ├── melotts-english-v3.rst
│ │ │ │ │ ├── melotts-english.rst
│ │ │ │ │ ├── melotts-french.rst
│ │ │ │ │ ├── melotts-japanese.rst
│ │ │ │ │ ├── melotts-korean.rst
│ │ │ │ │ ├── melotts-spanish.rst
│ │ │ │ │ ├── paraformer-zh-hotword.rst
│ │ │ │ │ ├── paraformer-zh-long.rst
│ │ │ │ │ ├── paraformer-zh-spk.rst
│ │ │ │ │ ├── paraformer-zh.rst
│ │ │ │ │ ├── qwen3-asr-0.6b.rst
│ │ │ │ │ ├── qwen3-asr-1.7b.rst
│ │ │ │ │ ├── seaco-paraformer-zh.rst
│ │ │ │ │ ├── sensevoicesmall.rst
│ │ │ │ │ ├── whisper-base-mlx.rst
│ │ │ │ │ ├── whisper-base.en-mlx.rst
│ │ │ │ │ ├── whisper-base.en.rst
│ │ │ │ │ ├── whisper-base.rst
│ │ │ │ │ ├── whisper-large-v3-mlx.rst
│ │ │ │ │ ├── whisper-large-v3-turbo-mlx.rst
│ │ │ │ │ ├── whisper-large-v3-turbo.rst
│ │ │ │ │ ├── whisper-large-v3.rst
│ │ │ │ │ ├── whisper-medium-mlx.rst
│ │ │ │ │ ├── whisper-medium.en-mlx.rst
│ │ │ │ │ ├── whisper-medium.en.rst
│ │ │ │ │ ├── whisper-medium.rst
│ │ │ │ │ ├── whisper-small-mlx.rst
│ │ │ │ │ ├── whisper-small.en-mlx.rst
│ │ │ │ │ ├── whisper-small.en.rst
│ │ │ │ │ ├── whisper-small.rst
│ │ │ │ │ ├── whisper-tiny-mlx.rst
│ │ │ │ │ ├── whisper-tiny.en-mlx.rst
│ │ │ │ │ ├── whisper-tiny.en.rst
│ │ │ │ │ └── whisper-tiny.rst
│ │ │ │ ├── embedding/
│ │ │ │ │ ├── bce-embedding-base_v1.rst
│ │ │ │ │ ├── bge-base-en-v1.5.rst
│ │ │ │ │ ├── bge-base-en.rst
│ │ │ │ │ ├── bge-base-zh-v1.5.rst
│ │ │ │ │ ├── bge-base-zh.rst
│ │ │ │ │ ├── bge-large-en-v1.5.rst
│ │ │ │ │ ├── bge-large-en.rst
│ │ │ │ │ ├── bge-large-zh-noinstruct.rst
│ │ │ │ │ ├── bge-large-zh-v1.5.rst
│ │ │ │ │ ├── bge-large-zh.rst
│ │ │ │ │ ├── bge-m3.rst
│ │ │ │ │ ├── bge-small-en-v1.5.rst
│ │ │ │ │ ├── bge-small-zh-v1.5.rst
│ │ │ │ │ ├── bge-small-zh.rst
│ │ │ │ │ ├── e5-large-v2.rst
│ │ │ │ │ ├── gme-qwen2-vl-2b-instruct.rst
│ │ │ │ │ ├── gme-qwen2-vl-7b-instruct.rst
│ │ │ │ │ ├── gte-base.rst
│ │ │ │ │ ├── gte-large.rst
│ │ │ │ │ ├── gte-qwen2.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── jina-clip-v2.rst
│ │ │ │ │ ├── jina-embeddings-v2-base-en.rst
│ │ │ │ │ ├── jina-embeddings-v2-base-zh.rst
│ │ │ │ │ ├── jina-embeddings-v2-small-en.rst
│ │ │ │ │ ├── jina-embeddings-v3.rst
│ │ │ │ │ ├── jina-embeddings-v4.rst
│ │ │ │ │ ├── m3e-base.rst
│ │ │ │ │ ├── m3e-large.rst
│ │ │ │ │ ├── m3e-small.rst
│ │ │ │ │ ├── multilingual-e5-large.rst
│ │ │ │ │ ├── qwen3-embedding-0.6b.rst
│ │ │ │ │ ├── qwen3-embedding-4b.rst
│ │ │ │ │ ├── qwen3-embedding-8b.rst
│ │ │ │ │ ├── qwen3-vl-embedding-2b.rst
│ │ │ │ │ ├── qwen3-vl-embedding-8b.rst
│ │ │ │ │ ├── text2vec-base-chinese-paraphrase.rst
│ │ │ │ │ ├── text2vec-base-chinese-sentence.rst
│ │ │ │ │ ├── text2vec-base-chinese.rst
│ │ │ │ │ ├── text2vec-base-multilingual.rst
│ │ │ │ │ └── text2vec-large-chinese.rst
│ │ │ │ ├── image/
│ │ │ │ │ ├── cogview4.rst
│ │ │ │ │ ├── deepseek-ocr.rst
│ │ │ │ │ ├── flux.1-dev.rst
│ │ │ │ │ ├── flux.1-kontext-dev.rst
│ │ │ │ │ ├── flux.1-schnell.rst
│ │ │ │ │ ├── flux.2-dev.rst
│ │ │ │ │ ├── flux.2-klein-4b.rst
│ │ │ │ │ ├── flux.2-klein-9b.rst
│ │ │ │ │ ├── got-ocr2_0.rst
│ │ │ │ │ ├── hunyuandit-v1.2-distilled.rst
│ │ │ │ │ ├── hunyuandit-v1.2.rst
│ │ │ │ │ ├── hunyuanocr.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── kolors.rst
│ │ │ │ │ ├── mineru2.5-2509-1.2b.rst
│ │ │ │ │ ├── paddleocr-vl.rst
│ │ │ │ │ ├── qwen-image-2512.rst
│ │ │ │ │ ├── qwen-image-edit-2509.rst
│ │ │ │ │ ├── qwen-image-edit-2511.rst
│ │ │ │ │ ├── qwen-image-edit.rst
│ │ │ │ │ ├── qwen-image-layered.rst
│ │ │ │ │ ├── qwen-image.rst
│ │ │ │ │ ├── sd-turbo.rst
│ │ │ │ │ ├── sd3-medium.rst
│ │ │ │ │ ├── sd3.5-large-turbo.rst
│ │ │ │ │ ├── sd3.5-large.rst
│ │ │ │ │ ├── sd3.5-medium.rst
│ │ │ │ │ ├── sdxl-turbo.rst
│ │ │ │ │ ├── stable-diffusion-2-inpainting.rst
│ │ │ │ │ ├── stable-diffusion-inpainting.rst
│ │ │ │ │ ├── stable-diffusion-v1.5.rst
│ │ │ │ │ ├── stable-diffusion-xl-base-1.0.rst
│ │ │ │ │ ├── stable-diffusion-xl-inpainting.rst
│ │ │ │ │ ├── z-image-turbo.rst
│ │ │ │ │ └── z-image.rst
│ │ │ │ ├── index.rst
│ │ │ │ ├── llm/
│ │ │ │ │ ├── baichuan-2-chat.rst
│ │ │ │ │ ├── baichuan-2.rst
│ │ │ │ │ ├── baichuan-m2.rst
│ │ │ │ │ ├── code-llama-instruct.rst
│ │ │ │ │ ├── code-llama-python.rst
│ │ │ │ │ ├── code-llama.rst
│ │ │ │ │ ├── codegeex4.rst
│ │ │ │ │ ├── codeqwen1.5-chat.rst
│ │ │ │ │ ├── codeqwen1.5.rst
│ │ │ │ │ ├── codeshell-chat.rst
│ │ │ │ │ ├── codeshell.rst
│ │ │ │ │ ├── codestral-v0.1.rst
│ │ │ │ │ ├── cogagent.rst
│ │ │ │ │ ├── deepseek-chat.rst
│ │ │ │ │ ├── deepseek-coder-instruct.rst
│ │ │ │ │ ├── deepseek-coder.rst
│ │ │ │ │ ├── deepseek-prover-v2.rst
│ │ │ │ │ ├── deepseek-r1-0528-qwen3.rst
│ │ │ │ │ ├── deepseek-r1-0528.rst
│ │ │ │ │ ├── deepseek-r1-distill-llama.rst
│ │ │ │ │ ├── deepseek-r1-distill-qwen.rst
│ │ │ │ │ ├── deepseek-r1.rst
│ │ │ │ │ ├── deepseek-v2-chat-0628.rst
│ │ │ │ │ ├── deepseek-v2-chat.rst
│ │ │ │ │ ├── deepseek-v2.5.rst
│ │ │ │ │ ├── deepseek-v3-0324.rst
│ │ │ │ │ ├── deepseek-v3.1.rst
│ │ │ │ │ ├── deepseek-v3.2-exp.rst
│ │ │ │ │ ├── deepseek-v3.2.rst
│ │ │ │ │ ├── deepseek-v3.rst
│ │ │ │ │ ├── deepseek-vl2.rst
│ │ │ │ │ ├── deepseek.rst
│ │ │ │ │ ├── dianjin-r1.rst
│ │ │ │ │ ├── ernie4.5.rst
│ │ │ │ │ ├── fin-r1.rst
│ │ │ │ │ ├── gemma-3-1b-it.rst
│ │ │ │ │ ├── gemma-3-it.rst
│ │ │ │ │ ├── glm-4.1v-thinking.rst
│ │ │ │ │ ├── glm-4.5.rst
│ │ │ │ │ ├── glm-4.5v.rst
│ │ │ │ │ ├── glm-4.6.rst
│ │ │ │ │ ├── glm-4.7-flash.rst
│ │ │ │ │ ├── glm-4.7.rst
│ │ │ │ │ ├── glm-4v.rst
│ │ │ │ │ ├── glm-5.rst
│ │ │ │ │ ├── glm-edge-chat.rst
│ │ │ │ │ ├── glm4-0414.rst
│ │ │ │ │ ├── glm4-chat-1m.rst
│ │ │ │ │ ├── glm4-chat.rst
│ │ │ │ │ ├── gorilla-openfunctions-v2.rst
│ │ │ │ │ ├── gpt-2.rst
│ │ │ │ │ ├── gpt-oss.rst
│ │ │ │ │ ├── huatuogpt-o1-llama-3.1.rst
│ │ │ │ │ ├── huatuogpt-o1-qwen2.5.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── internlm3-instruct.rst
│ │ │ │ │ ├── internvl3.rst
│ │ │ │ │ ├── kat-v1.rst
│ │ │ │ │ ├── kimi-k2.5.rst
│ │ │ │ │ ├── llama-2-chat.rst
│ │ │ │ │ ├── llama-2.rst
│ │ │ │ │ ├── llama-3-instruct.rst
│ │ │ │ │ ├── llama-3.1-instruct.rst
│ │ │ │ │ ├── llama-3.1.rst
│ │ │ │ │ ├── llama-3.2-vision-instruct.rst
│ │ │ │ │ ├── llama-3.2-vision.rst
│ │ │ │ │ ├── llama-3.3-instruct.rst
│ │ │ │ │ ├── llama-3.rst
│ │ │ │ │ ├── marco-o1.rst
│ │ │ │ │ ├── mineru2.5-2509-1.2b.rst
│ │ │ │ │ ├── minicpm-2b-dpo-bf16.rst
│ │ │ │ │ ├── minicpm-2b-dpo-fp16.rst
│ │ │ │ │ ├── minicpm-2b-dpo-fp32.rst
│ │ │ │ │ ├── minicpm-2b-sft-bf16.rst
│ │ │ │ │ ├── minicpm-2b-sft-fp32.rst
│ │ │ │ │ ├── minicpm-v-2.6.rst
│ │ │ │ │ ├── minicpm-v-4.5.rst
│ │ │ │ │ ├── minicpm3-4b.rst
│ │ │ │ │ ├── minicpm4.rst
│ │ │ │ │ ├── minimax-m2.5.rst
│ │ │ │ │ ├── minimax-m2.rst
│ │ │ │ │ ├── mistral-instruct-v0.1.rst
│ │ │ │ │ ├── mistral-instruct-v0.2.rst
│ │ │ │ │ ├── mistral-instruct-v0.3.rst
│ │ │ │ │ ├── mistral-large-instruct.rst
│ │ │ │ │ ├── mistral-nemo-instruct.rst
│ │ │ │ │ ├── mistral-v0.1.rst
│ │ │ │ │ ├── mixtral-8x22b-instruct-v0.1.rst
│ │ │ │ │ ├── mixtral-instruct-v0.1.rst
│ │ │ │ │ ├── mixtral-v0.1.rst
│ │ │ │ │ ├── moonlight-16b-a3b-instruct.rst
│ │ │ │ │ ├── openhermes-2.5.rst
│ │ │ │ │ ├── opt.rst
│ │ │ │ │ ├── orion-chat.rst
│ │ │ │ │ ├── ovis2.rst
│ │ │ │ │ ├── phi-2.rst
│ │ │ │ │ ├── phi-3-mini-128k-instruct.rst
│ │ │ │ │ ├── phi-3-mini-4k-instruct.rst
│ │ │ │ │ ├── qvq-72b-preview.rst
│ │ │ │ │ ├── qwen-chat.rst
│ │ │ │ │ ├── qwen1.5-chat.rst
│ │ │ │ │ ├── qwen1.5-moe-chat.rst
│ │ │ │ │ ├── qwen2-audio-instruct.rst
│ │ │ │ │ ├── qwen2-instruct.rst
│ │ │ │ │ ├── qwen2-moe-instruct.rst
│ │ │ │ │ ├── qwen2-vl-instruct.rst
│ │ │ │ │ ├── qwen2.5-coder-instruct.rst
│ │ │ │ │ ├── qwen2.5-coder.rst
│ │ │ │ │ ├── qwen2.5-instruct-1m.rst
│ │ │ │ │ ├── qwen2.5-instruct.rst
│ │ │ │ │ ├── qwen2.5-omni.rst
│ │ │ │ │ ├── qwen2.5-vl-instruct.rst
│ │ │ │ │ ├── qwen2.5.rst
│ │ │ │ │ ├── qwen3-coder.rst
│ │ │ │ │ ├── qwen3-instruct.rst
│ │ │ │ │ ├── qwen3-next-instruct.rst
│ │ │ │ │ ├── qwen3-next-thinking.rst
│ │ │ │ │ ├── qwen3-omni-instruct.rst
│ │ │ │ │ ├── qwen3-omni-thinking.rst
│ │ │ │ │ ├── qwen3-thinking.rst
│ │ │ │ │ ├── qwen3-vl-instruct.rst
│ │ │ │ │ ├── qwen3-vl-thinking.rst
│ │ │ │ │ ├── qwen3.5.rst
│ │ │ │ │ ├── qwen3.rst
│ │ │ │ │ ├── qwenlong-l1.rst
│ │ │ │ │ ├── qwq-32b-preview.rst
│ │ │ │ │ ├── qwq-32b.rst
│ │ │ │ │ ├── seallm_v2.5.rst
│ │ │ │ │ ├── seallm_v2.rst
│ │ │ │ │ ├── seallms-v3.rst
│ │ │ │ │ ├── seed-oss.rst
│ │ │ │ │ ├── skywork-math.rst
│ │ │ │ │ ├── skywork-or1-preview.rst
│ │ │ │ │ ├── skywork-or1.rst
│ │ │ │ │ ├── skywork.rst
│ │ │ │ │ ├── telechat.rst
│ │ │ │ │ ├── tiny-llama.rst
│ │ │ │ │ ├── wizardcoder-python-v1.0.rst
│ │ │ │ │ ├── wizardmath-v1.0.rst
│ │ │ │ │ ├── xiyansql-qwencoder-2504.rst
│ │ │ │ │ ├── xverse-chat.rst
│ │ │ │ │ ├── xverse.rst
│ │ │ │ │ ├── yi-1.5-chat-16k.rst
│ │ │ │ │ ├── yi-1.5-chat.rst
│ │ │ │ │ ├── yi-1.5.rst
│ │ │ │ │ ├── yi-200k.rst
│ │ │ │ │ ├── yi-chat.rst
│ │ │ │ │ └── yi.rst
│ │ │ │ ├── rerank/
│ │ │ │ │ ├── bce-reranker-base_v1.rst
│ │ │ │ │ ├── bge-reranker-base.rst
│ │ │ │ │ ├── bge-reranker-large.rst
│ │ │ │ │ ├── bge-reranker-v2-gemma.rst
│ │ │ │ │ ├── bge-reranker-v2-m3.rst
│ │ │ │ │ ├── bge-reranker-v2-minicpm-layerwise.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── jina-reranker-v2.rst
│ │ │ │ │ ├── jina-reranker-v3.rst
│ │ │ │ │ ├── minicpm-reranker.rst
│ │ │ │ │ ├── qwen3-reranker-0.6b.rst
│ │ │ │ │ ├── qwen3-reranker-4b.rst
│ │ │ │ │ ├── qwen3-reranker-8b.rst
│ │ │ │ │ ├── qwen3-vl-reranker-2b.rst
│ │ │ │ │ └── qwen3-vl-reranker-8b.rst
│ │ │ │ └── video/
│ │ │ │ ├── cogvideox-2b.rst
│ │ │ │ ├── cogvideox-5b.rst
│ │ │ │ ├── hunyuanvideo.rst
│ │ │ │ ├── index.rst
│ │ │ │ ├── wan2.1-1.3b.rst
│ │ │ │ ├── wan2.1-14b.rst
│ │ │ │ ├── wan2.1-flf2v-14b-720p.rst
│ │ │ │ ├── wan2.1-i2v-14b-480p.rst
│ │ │ │ ├── wan2.1-i2v-14b-720p.rst
│ │ │ │ ├── wan2.2-a14b.rst
│ │ │ │ ├── wan2.2-i2v-a14b.rst
│ │ │ │ └── wan2.2-ti2v-5b.rst
│ │ │ ├── custom.rst
│ │ │ ├── index.rst
│ │ │ ├── lora.rst
│ │ │ ├── model_abilities/
│ │ │ │ ├── audio.rst
│ │ │ │ ├── chat.rst
│ │ │ │ ├── embed.rst
│ │ │ │ ├── flexible.rst
│ │ │ │ ├── image.rst
│ │ │ │ ├── index.rst
│ │ │ │ ├── multimodal.rst
│ │ │ │ ├── rerank.rst
│ │ │ │ ├── tools.rst
│ │ │ │ └── video.rst
│ │ │ ├── model_memory.rst
│ │ │ ├── model_update.rst
│ │ │ ├── sources/
│ │ │ │ └── sources.rst
│ │ │ ├── virtualenv.rst
│ │ │ └── xinference_models_hub.rst
│ │ ├── norm_zh.py
│ │ ├── reference/
│ │ │ └── index.rst
│ │ └── user_guide/
│ │ ├── auth_system.rst
│ │ ├── backends.rst
│ │ ├── client_api.rst
│ │ ├── continuous_batching.rst
│ │ ├── distributed_inference.rst
│ │ ├── index.rst
│ │ ├── launch.rst
│ │ ├── metrics.rst
│ │ └── vllm_enhancement.rst
│ └── templates/
│ ├── audio.rst.jinja
│ ├── audio_index.rst.jinja
│ ├── embedding.rst.jinja
│ ├── embedding_index.rst.jinja
│ ├── image.rst.jinja
│ ├── image_index.rst.jinja
│ ├── llm.rst.jinja
│ ├── llm_index.rst.jinja
│ ├── metrics.jinja
│ ├── rerank.rst.jinja
│ ├── rerank_index.rst.jinja
│ ├── video.rst.jinja
│ └── video_index.rst.jinja
├── examples/
│ ├── AI_podcast.py
│ ├── AI_podcast_ZH.py
│ ├── AI_translate.py
│ ├── Custom_StableDiffusion_ControlNet.ipynb
│ ├── FunctionCall.ipynb
│ ├── LangChain_QA.ipynb
│ ├── LangChain_Streamlit_Doc_Chat.py
│ ├── StableDiffusionControlNet.ipynb
│ ├── Xinference_Quick_Start.ipynb
│ ├── audio_to_text.ipynb
│ ├── chat.py
│ ├── chat_vl.ipynb
│ └── gradio_chatinterface.py
├── pyproject.toml
├── setup.cfg
├── setup.py
├── versioneer.py
└── xinference/
├── __init__.py
├── _compat.py
├── _version.py
├── api/
│ ├── __init__.py
│ ├── dependencies.py
│ ├── oauth2/
│ │ ├── __init__.py
│ │ ├── auth_service.py
│ │ ├── types.py
│ │ └── utils.py
│ ├── responses.py
│ ├── restful_api.py
│ ├── routers/
│ │ ├── __init__.py
│ │ ├── admin.py
│ │ ├── audio.py
│ │ ├── embeddings.py
│ │ ├── images.py
│ │ ├── llm.py
│ │ ├── models.py
│ │ ├── rerank.py
│ │ └── videos.py
│ ├── schemas/
│ │ ├── __init__.py
│ │ └── requests.py
│ ├── tests/
│ │ ├── __init__.py
│ │ ├── test_admin.py
│ │ └── test_utils.py
│ └── utils.py
├── client/
│ ├── __init__.py
│ ├── common.py
│ ├── handlers.py
│ ├── restful/
│ │ ├── __init__.py
│ │ ├── async_restful_client.py
│ │ └── restful_client.py
│ └── tests/
│ ├── __init__.py
│ ├── test_async_client.py
│ ├── test_async_client_with_auth.py
│ ├── test_client.py
│ └── test_client_with_auth.py
├── conftest.py
├── constants.py
├── core/
│ ├── __init__.py
│ ├── cache_tracker.py
│ ├── event.py
│ ├── launch_strategy.py
│ ├── metrics.py
│ ├── model.py
│ ├── otel.py
│ ├── progress_tracker.py
│ ├── resource.py
│ ├── status_guard.py
│ ├── supervisor.py
│ ├── tests/
│ │ ├── __init__.py
│ │ ├── test_continuous_batching.py
│ │ ├── test_launch_strategy.py
│ │ ├── test_metrics.py
│ │ ├── test_model.py
│ │ ├── test_progressor.py
│ │ ├── test_restful_api.py
│ │ ├── test_types.py
│ │ ├── test_utils.py
│ │ └── test_worker.py
│ ├── utils.py
│ ├── virtual_env_manager.py
│ └── worker.py
├── deploy/
│ ├── __init__.py
│ ├── cmdline.py
│ ├── docker/
│ │ ├── Dockerfile
│ │ ├── Dockerfile.cpu
│ │ ├── docker-compose-distributed.yml
│ │ ├── docker-compose.yml
│ │ ├── requirements/
│ │ │ ├── requirements-base.txt
│ │ │ ├── requirements-ml.txt
│ │ │ └── requirements-models.txt
│ │ └── requirements_cpu/
│ │ ├── requirements_cpu-base.txt
│ │ ├── requirements_cpu-ml.txt
│ │ └── requirements_cpu-models.txt
│ ├── local.py
│ ├── supervisor.py
│ ├── test/
│ │ ├── __init__.py
│ │ └── test_cmdline.py
│ ├── utils.py
│ └── worker.py
├── device_utils.py
├── fields.py
├── isolation.py
├── model/
│ ├── __init__.py
│ ├── audio/
│ │ ├── __init__.py
│ │ ├── chattts.py
│ │ ├── core.py
│ │ ├── cosyvoice.py
│ │ ├── custom.py
│ │ ├── f5tts.py
│ │ ├── f5tts_mlx.py
│ │ ├── fish_speech.py
│ │ ├── funasr.py
│ │ ├── indextts2.py
│ │ ├── kokoro.py
│ │ ├── kokoro_mlx.py
│ │ ├── kokoro_zh.py
│ │ ├── megatts.py
│ │ ├── melotts.py
│ │ ├── model_spec.json
│ │ ├── qwen3_asr.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── bbc_news.npy
│ │ │ ├── jfk.flac
│ │ │ ├── test_chattts.py
│ │ │ ├── test_cosyvoice.py
│ │ │ ├── test_f5tts.py
│ │ │ ├── test_f5tts_mlx.py
│ │ │ ├── test_fish_speech.py
│ │ │ ├── test_funasr.py
│ │ │ ├── test_kokoro.py
│ │ │ ├── test_megatts.py
│ │ │ ├── test_melotts.py
│ │ │ ├── test_whisper.py
│ │ │ └── test_whisper_mlx.py
│ │ ├── utils.py
│ │ ├── whisper.py
│ │ └── whisper_mlx.py
│ ├── batch.py
│ ├── cache_manager.py
│ ├── core.py
│ ├── custom.py
│ ├── embedding/
│ │ ├── __init__.py
│ │ ├── cache_manager.py
│ │ ├── core.py
│ │ ├── custom.py
│ │ ├── embed_family.py
│ │ ├── flag/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── tests/
│ │ │ ├── __init__.py
│ │ │ └── test_flag.py
│ │ ├── llama_cpp/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── tests/
│ │ │ ├── __init__.py
│ │ │ └── test_llama_cpp.py
│ │ ├── model_spec.json
│ │ ├── sentence_transformers/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── tests/
│ │ │ ├── __init__.py
│ │ │ └── test_sentence_transformers.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_embedding_models.py
│ │ │ ├── test_integrated_embedding.py
│ │ │ └── test_qwen3_vl_engine_params.py
│ │ └── vllm/
│ │ ├── __init__.py
│ │ ├── core.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ └── test_vllm_embedding.py
│ ├── flexible/
│ │ ├── __init__.py
│ │ ├── core.py
│ │ ├── custom.py
│ │ ├── launchers/
│ │ │ ├── __init__.py
│ │ │ ├── image_process_launcher.py
│ │ │ ├── modelscope_launcher.py
│ │ │ ├── transformers_launcher.py
│ │ │ └── yolo_launcher.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ └── test_flexible_models.py
│ │ └── utils.py
│ ├── image/
│ │ ├── __init__.py
│ │ ├── cache_manager.py
│ │ ├── core.py
│ │ ├── custom.py
│ │ ├── engine.py
│ │ ├── engine_family.py
│ │ ├── model_spec.json
│ │ ├── ocr/
│ │ │ ├── __init__.py
│ │ │ ├── deepseek_ocr.py
│ │ │ ├── got_ocr2.py
│ │ │ ├── hunyuan_ocr.py
│ │ │ ├── mlx.py
│ │ │ ├── ocr_family.py
│ │ │ ├── paddleocr_vl.py
│ │ │ └── vllm.py
│ │ ├── scheduler/
│ │ │ ├── __init__.py
│ │ │ └── flux.py
│ │ ├── sdapi.py
│ │ ├── stable_diffusion/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── mlx.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_got_ocr2.py
│ │ │ └── test_stable_diffusion.py
│ │ └── utils.py
│ ├── llm/
│ │ ├── __init__.py
│ │ ├── cache_manager.py
│ │ ├── config_parser.py
│ │ ├── core.py
│ │ ├── custom.py
│ │ ├── harmony.py
│ │ ├── llama_cpp/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_gguf.py
│ │ │ └── test_structured.py
│ │ ├── llm_family.json
│ │ ├── llm_family.py
│ │ ├── lmdeploy/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── tests/
│ │ │ └── __init__.py
│ │ ├── memory.py
│ │ ├── mlx/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ ├── distributed_models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── core.py
│ │ │ │ ├── deepseek_v3.py
│ │ │ │ ├── qwen2.py
│ │ │ │ ├── qwen3.py
│ │ │ │ └── qwen3_moe.py
│ │ │ └── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_distributed_model.py
│ │ │ └── test_mlx.py
│ │ ├── reasoning_parser.py
│ │ ├── sglang/
│ │ │ ├── __init__.py
│ │ │ └── core.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_harmony.py
│ │ │ ├── test_llm_family.py
│ │ │ ├── test_llm_model.py
│ │ │ ├── test_memory_estimate.py
│ │ │ ├── test_multimodal.py
│ │ │ ├── test_stream_options.py
│ │ │ └── test_utils.py
│ │ ├── tool_parsers/
│ │ │ ├── __init__.py
│ │ │ ├── abstract_tool_parser.py
│ │ │ ├── deepseek_r1_tool_parser.py
│ │ │ ├── deepseek_v3_1_tool_parser.py
│ │ │ ├── deepseek_v3_tool_parser.py
│ │ │ ├── glm4_tool_parser.py
│ │ │ ├── llama3_tool_parser.py
│ │ │ ├── minimax_tool_parser.py
│ │ │ ├── qwen_tool_parser.py
│ │ │ └── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_deepseek_r1_tool_parser.py
│ │ │ ├── test_deepseek_v3_1_tool_parser.py
│ │ │ ├── test_deepseek_v3_tool_parser.py
│ │ │ ├── test_glm4_tool_parser.py
│ │ │ ├── test_llama3_tool_parser.py
│ │ │ └── test_qwen_tool_parser.py
│ │ ├── transformers/
│ │ │ ├── __init__.py
│ │ │ ├── chatglm.py
│ │ │ ├── core.py
│ │ │ ├── deepseek_v2.py
│ │ │ ├── gemma3.py
│ │ │ ├── gpt_oss.py
│ │ │ ├── multimodal/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cogagent.py
│ │ │ │ ├── core.py
│ │ │ │ ├── deepseek_vl2.py
│ │ │ │ ├── gemma3.py
│ │ │ │ ├── glm4_1v.py
│ │ │ │ ├── glm4v.py
│ │ │ │ ├── intern_vl.py
│ │ │ │ ├── minicpmv26.py
│ │ │ │ ├── minicpmv45.py
│ │ │ │ ├── ovis2.py
│ │ │ │ ├── qwen-omni.py
│ │ │ │ ├── qwen2_audio.py
│ │ │ │ └── qwen2_vl.py
│ │ │ ├── opt.py
│ │ │ ├── tensorizer_utils.py
│ │ │ ├── tests/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_opt.py
│ │ │ │ └── test_tensorizer.py
│ │ │ └── utils.py
│ │ ├── utils.py
│ │ └── vllm/
│ │ ├── __init__.py
│ │ ├── core.py
│ │ ├── distributed_executor.py
│ │ ├── distributed_executor_v1.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_core_chat_model.py
│ │ │ └── test_distributed_executor.py
│ │ ├── utils.py
│ │ └── xavier/
│ │ ├── __init__.py
│ │ ├── allocator.py
│ │ ├── block.py
│ │ ├── block_manager.py
│ │ ├── block_tracker.py
│ │ ├── collective.py
│ │ ├── collective_manager.py
│ │ ├── engine.py
│ │ ├── executor.py
│ │ ├── scheduler.py
│ │ ├── test/
│ │ │ ├── __init__.py
│ │ │ └── test_xavier.py
│ │ ├── transfer.py
│ │ └── utils.py
│ ├── rerank/
│ │ ├── __init__.py
│ │ ├── cache_manager.py
│ │ ├── core.py
│ │ ├── custom.py
│ │ ├── llama_cpp/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── tests/
│ │ │ ├── __init__.py
│ │ │ └── test_llama_cpp.py
│ │ ├── model_spec.json
│ │ ├── rerank_family.py
│ │ ├── sentence_transformers/
│ │ │ ├── __init__.py
│ │ │ ├── core.py
│ │ │ └── tests/
│ │ │ ├── __init__.py
│ │ │ └── test_sentence_transformers.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_qwen3_vl_reranker_virtualenv.py
│ │ │ └── test_rerank.py
│ │ ├── utils.py
│ │ └── vllm/
│ │ ├── __init__.py
│ │ ├── core.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ └── test_vllm.py
│ ├── scheduler/
│ │ ├── __init__.py
│ │ ├── batch.py
│ │ ├── core.py
│ │ └── request.py
│ ├── tests/
│ │ ├── __init__.py
│ │ └── test_utils.py
│ ├── utils.py
│ └── video/
│ ├── __init__.py
│ ├── cache_manager.py
│ ├── core.py
│ ├── diffusers.py
│ ├── model_spec.json
│ └── tests/
│ ├── __init__.py
│ └── test_diffusers_video.py
├── thirdparty/
│ ├── __init__.py
│ ├── audiotools/
│ │ ├── __init__.py
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── audio_signal.py
│ │ │ ├── display.py
│ │ │ ├── dsp.py
│ │ │ ├── effects.py
│ │ │ ├── ffmpeg.py
│ │ │ ├── loudness.py
│ │ │ ├── playback.py
│ │ │ ├── templates/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── headers.html
│ │ │ │ ├── pandoc.css
│ │ │ │ └── widget.html
│ │ │ ├── util.py
│ │ │ └── whisper.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── datasets.py
│ │ │ ├── preprocess.py
│ │ │ └── transforms.py
│ │ ├── metrics/
│ │ │ ├── __init__.py
│ │ │ ├── distance.py
│ │ │ ├── quality.py
│ │ │ └── spectral.py
│ │ ├── ml/
│ │ │ ├── __init__.py
│ │ │ ├── accelerator.py
│ │ │ ├── decorators.py
│ │ │ ├── experiment.py
│ │ │ └── layers/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ └── spectral_gate.py
│ │ ├── post.py
│ │ └── preference.py
│ ├── cosyvoice/
│ │ ├── __init__.py
│ │ ├── bin/
│ │ │ ├── average_model.py
│ │ │ ├── export_jit.py
│ │ │ ├── export_onnx.py
│ │ │ ├── inference_deprecated.py
│ │ │ ├── spk2info.pt
│ │ │ └── train.py
│ │ ├── cli/
│ │ │ ├── __init__.py
│ │ │ ├── cosyvoice.py
│ │ │ ├── frontend.py
│ │ │ └── model.py
│ │ ├── dataset/
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── processor.py
│ │ ├── flow/
│ │ │ ├── decoder.py
│ │ │ ├── flow.py
│ │ │ ├── flow_matching.py
│ │ │ └── length_regulator.py
│ │ ├── hifigan/
│ │ │ ├── discriminator.py
│ │ │ ├── f0_predictor.py
│ │ │ ├── generator.py
│ │ │ └── hifigan.py
│ │ ├── llm/
│ │ │ └── llm.py
│ │ ├── tokenizer/
│ │ │ ├── assets/
│ │ │ │ └── multilingual_zh_ja_yue_char_del.tiktoken
│ │ │ └── tokenizer.py
│ │ ├── transformer/
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── attention.py
│ │ │ ├── convolution.py
│ │ │ ├── decoder.py
│ │ │ ├── decoder_layer.py
│ │ │ ├── embedding.py
│ │ │ ├── encoder.py
│ │ │ ├── encoder_layer.py
│ │ │ ├── label_smoothing_loss.py
│ │ │ ├── positionwise_feed_forward.py
│ │ │ ├── subsampling.py
│ │ │ └── upsample_encoder.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ ├── class_utils.py
│ │ │ ├── common.py
│ │ │ ├── executor.py
│ │ │ ├── file_utils.py
│ │ │ ├── frontend_utils.py
│ │ │ ├── losses.py
│ │ │ ├── mask.py
│ │ │ ├── scheduler.py
│ │ │ └── train_utils.py
│ │ └── vllm/
│ │ └── cosyvoice2.py
│ ├── deepseek_vl/
│ │ ├── __init__.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── clip_encoder.py
│ │ │ ├── image_processing_vlm.py
│ │ │ ├── modeling_vlm.py
│ │ │ ├── processing_vlm.py
│ │ │ ├── projector.py
│ │ │ ├── sam.py
│ │ │ └── siglip_vit.py
│ │ ├── serve/
│ │ │ ├── __init__.py
│ │ │ ├── app_deepseek.py
│ │ │ ├── app_modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gradio_utils.py
│ │ │ │ ├── overwrites.py
│ │ │ │ ├── presets.py
│ │ │ │ └── utils.py
│ │ │ ├── assets/
│ │ │ │ ├── Kelpy-Codos.js
│ │ │ │ ├── custom.css
│ │ │ │ └── custom.js
│ │ │ └── inference.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── conversation.py
│ │ └── io.py
│ ├── deepseek_vl2/
│ │ ├── __init__.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── configuration_deepseek.py
│ │ │ ├── conversation.py
│ │ │ ├── modeling_deepseek.py
│ │ │ ├── modeling_deepseek_vl_v2.py
│ │ │ ├── processing_deepseek_vl_v2.py
│ │ │ └── siglip_vit.py
│ │ ├── serve/
│ │ │ ├── __init__.py
│ │ │ ├── app_modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gradio_utils.py
│ │ │ │ ├── overwrites.py
│ │ │ │ ├── presets.py
│ │ │ │ └── utils.py
│ │ │ ├── assets/
│ │ │ │ ├── Kelpy-Codos.js
│ │ │ │ ├── custom.css
│ │ │ │ ├── custom.js
│ │ │ │ └── simsun.ttc
│ │ │ └── inference.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ └── io.py
│ ├── f5_tts/
│ │ ├── __init__.py
│ │ ├── api.py
│ │ ├── configs/
│ │ │ ├── E2TTS_Base_train.yaml
│ │ │ ├── E2TTS_Small_train.yaml
│ │ │ ├── F5TTS_Base_train.yaml
│ │ │ └── F5TTS_Small_train.yaml
│ │ ├── eval/
│ │ │ ├── README.md
│ │ │ ├── ecapa_tdnn.py
│ │ │ ├── eval_infer_batch.py
│ │ │ ├── eval_infer_batch.sh
│ │ │ ├── eval_librispeech_test_clean.py
│ │ │ ├── eval_seedtts_testset.py
│ │ │ └── utils_eval.py
│ │ ├── infer/
│ │ │ ├── README.md
│ │ │ ├── examples/
│ │ │ │ ├── basic/
│ │ │ │ │ └── basic.toml
│ │ │ │ ├── multi/
│ │ │ │ │ ├── country.flac
│ │ │ │ │ ├── main.flac
│ │ │ │ │ ├── story.toml
│ │ │ │ │ ├── story.txt
│ │ │ │ │ └── town.flac
│ │ │ │ └── vocab.txt
│ │ │ ├── infer_cli.py
│ │ │ ├── infer_gradio.py
│ │ │ ├── speech_edit.py
│ │ │ └── utils_infer.py
│ │ ├── model/
│ │ │ ├── __init__.py
│ │ │ ├── backbones/
│ │ │ │ ├── README.md
│ │ │ │ ├── dit.py
│ │ │ │ ├── mmdit.py
│ │ │ │ └── unett.py
│ │ │ ├── cfm.py
│ │ │ ├── dataset.py
│ │ │ ├── modules.py
│ │ │ ├── trainer.py
│ │ │ └── utils.py
│ │ ├── scripts/
│ │ │ ├── count_max_epoch.py
│ │ │ └── count_params_gflops.py
│ │ ├── socket_server.py
│ │ └── train/
│ │ ├── README.md
│ │ ├── datasets/
│ │ │ ├── prepare_csv_wavs.py
│ │ │ ├── prepare_emilia.py
│ │ │ ├── prepare_libritts.py
│ │ │ ├── prepare_ljspeech.py
│ │ │ └── prepare_wenetspeech4tts.py
│ │ ├── finetune_cli.py
│ │ ├── finetune_gradio.py
│ │ └── train.py
│ ├── fish_speech/
│ │ ├── __init__.py
│ │ ├── fish_speech/
│ │ │ ├── __init__.py
│ │ │ ├── callbacks/
│ │ │ │ ├── __init__.py
│ │ │ │ └── grad_norm.py
│ │ │ ├── configs/
│ │ │ │ ├── base.yaml
│ │ │ │ ├── firefly_gan_vq.yaml
│ │ │ │ ├── lora/
│ │ │ │ │ └── r_8_alpha_16.yaml
│ │ │ │ └── text2semantic_finetune.yaml
│ │ │ ├── conversation.py
│ │ │ ├── datasets/
│ │ │ │ ├── concat_repeat.py
│ │ │ │ ├── protos/
│ │ │ │ │ ├── text-data.proto
│ │ │ │ │ ├── text_data_pb2.py
│ │ │ │ │ └── text_data_stream.py
│ │ │ │ ├── semantic.py
│ │ │ │ └── vqgan.py
│ │ │ ├── i18n/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── core.py
│ │ │ │ ├── locale/
│ │ │ │ │ ├── en_US.json
│ │ │ │ │ ├── es_ES.json
│ │ │ │ │ ├── ja_JP.json
│ │ │ │ │ ├── ko_KR.json
│ │ │ │ │ ├── pt_BR.json
│ │ │ │ │ └── zh_CN.json
│ │ │ │ └── scan.py
│ │ │ ├── models/
│ │ │ │ ├── text2semantic/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── lit_module.py
│ │ │ │ │ ├── llama.py
│ │ │ │ │ └── lora.py
│ │ │ │ └── vqgan/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── firefly.py
│ │ │ │ │ └── fsq.py
│ │ │ │ └── utils.py
│ │ │ ├── scheduler.py
│ │ │ ├── text/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── chn_text_norm/
│ │ │ │ │ ├── .gitignore
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── basic_class.py
│ │ │ │ │ ├── basic_constant.py
│ │ │ │ │ ├── basic_util.py
│ │ │ │ │ ├── cardinal.py
│ │ │ │ │ ├── date.py
│ │ │ │ │ ├── digit.py
│ │ │ │ │ ├── fraction.py
│ │ │ │ │ ├── money.py
│ │ │ │ │ ├── percentage.py
│ │ │ │ │ ├── telephone.py
│ │ │ │ │ └── text.py
│ │ │ │ ├── clean.py
│ │ │ │ └── spliter.py
│ │ │ ├── tokenizer.py
│ │ │ ├── train.py
│ │ │ ├── utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── braceexpand.py
│ │ │ │ ├── context.py
│ │ │ │ ├── file.py
│ │ │ │ ├── instantiators.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── logging_utils.py
│ │ │ │ ├── rich_utils.py
│ │ │ │ ├── spectrogram.py
│ │ │ │ └── utils.py
│ │ │ └── webui/
│ │ │ ├── css/
│ │ │ │ └── style.css
│ │ │ ├── html/
│ │ │ │ └── footer.html
│ │ │ ├── js/
│ │ │ │ └── animate.js
│ │ │ ├── launch_utils.py
│ │ │ └── manage.py
│ │ └── tools/
│ │ ├── api_client.py
│ │ ├── api_server.py
│ │ ├── download_models.py
│ │ ├── e2e_webui.py
│ │ ├── extract_model.py
│ │ ├── file.py
│ │ ├── fish_e2e.py
│ │ ├── inference_engine/
│ │ │ ├── __init__.py
│ │ │ ├── reference_loader.py
│ │ │ ├── utils.py
│ │ │ └── vq_manager.py
│ │ ├── llama/
│ │ │ ├── build_dataset.py
│ │ │ ├── eval_in_context.py
│ │ │ ├── generate.py
│ │ │ ├── merge_lora.py
│ │ │ ├── quantize.py
│ │ │ └── rebuild_tokenizer.py
│ │ ├── run_webui.py
│ │ ├── schema.py
│ │ ├── sensevoice/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── auto_model.py
│ │ │ ├── fun_asr.py
│ │ │ └── vad_utils.py
│ │ ├── server/
│ │ │ ├── agent/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── generate.py
│ │ │ │ ├── generation_utils.py
│ │ │ │ └── pre_generation_utils.py
│ │ │ ├── api_utils.py
│ │ │ ├── exception_handler.py
│ │ │ ├── inference.py
│ │ │ ├── model_manager.py
│ │ │ ├── model_utils.py
│ │ │ └── views.py
│ │ ├── smart_pad.py
│ │ ├── vqgan/
│ │ │ ├── create_train_split.py
│ │ │ ├── extract_vq.py
│ │ │ └── inference.py
│ │ ├── webui/
│ │ │ ├── __init__.py
│ │ │ ├── inference.py
│ │ │ └── variables.py
│ │ └── whisper_asr.py
│ ├── indextts/
│ │ ├── BigVGAN/
│ │ │ ├── ECAPA_TDNN.py
│ │ │ ├── __init__.py
│ │ │ ├── activations.py
│ │ │ ├── alias_free_activation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cuda/
│ │ │ │ │ ├── .gitignore
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── activation1d.py
│ │ │ │ │ ├── anti_alias_activation.cpp
│ │ │ │ │ ├── anti_alias_activation_cuda.cu
│ │ │ │ │ ├── compat.h
│ │ │ │ │ ├── load.py
│ │ │ │ │ └── type_shim.h
│ │ │ │ └── torch/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── act.py
│ │ │ │ ├── filter.py
│ │ │ │ └── resample.py
│ │ │ ├── alias_free_torch/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── act.py
│ │ │ │ ├── filter.py
│ │ │ │ └── resample.py
│ │ │ ├── bigvgan.py
│ │ │ ├── models.py
│ │ │ ├── nnet/
│ │ │ │ ├── CNN.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── linear.py
│ │ │ │ └── normalization.py
│ │ │ └── utils.py
│ │ ├── __init__.py
│ │ ├── cli.py
│ │ ├── gpt/
│ │ │ ├── __init__.py
│ │ │ ├── conformer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention.py
│ │ │ │ ├── embedding.py
│ │ │ │ └── subsampling.py
│ │ │ ├── conformer_encoder.py
│ │ │ ├── model.py
│ │ │ ├── model_v2.py
│ │ │ ├── perceiver.py
│ │ │ ├── transformers_beam_search.py
│ │ │ ├── transformers_generation_utils.py
│ │ │ ├── transformers_gpt2.py
│ │ │ └── transformers_modeling_utils.py
│ │ ├── infer.py
│ │ ├── infer_v2.py
│ │ ├── s2mel/
│ │ │ ├── dac/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── model/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── dac.py
│ │ │ │ │ ├── discriminator.py
│ │ │ │ │ └── encodec.py
│ │ │ │ ├── nn/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── layers.py
│ │ │ │ │ ├── loss.py
│ │ │ │ │ └── quantize.py
│ │ │ │ └── utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── decode.py
│ │ │ │ └── encode.py
│ │ │ ├── hf_utils.py
│ │ │ ├── modules/
│ │ │ │ ├── alias_free_torch/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── act.py
│ │ │ │ │ ├── filter.py
│ │ │ │ │ └── resample.py
│ │ │ │ ├── audio.py
│ │ │ │ ├── bigvgan/
│ │ │ │ │ ├── activations.py
│ │ │ │ │ ├── alias_free_activation/
│ │ │ │ │ │ ├── cuda/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── activation1d.py
│ │ │ │ │ │ │ ├── anti_alias_activation.cpp
│ │ │ │ │ │ │ ├── anti_alias_activation_cuda.cu
│ │ │ │ │ │ │ ├── compat.h
│ │ │ │ │ │ │ ├── load.py
│ │ │ │ │ │ │ └── type_shim.h
│ │ │ │ │ │ └── torch/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── act.py
│ │ │ │ │ │ ├── filter.py
│ │ │ │ │ │ └── resample.py
│ │ │ │ │ ├── bigvgan.py
│ │ │ │ │ ├── config.json
│ │ │ │ │ ├── env.py
│ │ │ │ │ ├── meldataset.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── campplus/
│ │ │ │ │ ├── DTDNN.py
│ │ │ │ │ ├── classifier.py
│ │ │ │ │ └── layers.py
│ │ │ │ ├── commons.py
│ │ │ │ ├── diffusion_transformer.py
│ │ │ │ ├── encodec.py
│ │ │ │ ├── flow_matching.py
│ │ │ │ ├── gpt_fast/
│ │ │ │ │ ├── generate.py
│ │ │ │ │ ├── model.py
│ │ │ │ │ └── quantize.py
│ │ │ │ ├── hifigan/
│ │ │ │ │ ├── f0_predictor.py
│ │ │ │ │ └── generator.py
│ │ │ │ ├── layers.py
│ │ │ │ ├── length_regulator.py
│ │ │ │ ├── openvoice/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── api.py
│ │ │ │ │ ├── attentions.py
│ │ │ │ │ ├── checkpoints_v2/
│ │ │ │ │ │ └── converter/
│ │ │ │ │ │ └── config.json
│ │ │ │ │ ├── commons.py
│ │ │ │ │ ├── mel_processing.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ ├── modules.py
│ │ │ │ │ ├── openvoice_app.py
│ │ │ │ │ ├── se_extractor.py
│ │ │ │ │ ├── transforms.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── quantize.py
│ │ │ │ ├── rmvpe.py
│ │ │ │ ├── vocos/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── heads.py
│ │ │ │ │ ├── helpers.py
│ │ │ │ │ ├── loss.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ ├── modules.py
│ │ │ │ │ ├── pretrained.py
│ │ │ │ │ └── spectral_ops.py
│ │ │ │ └── wavenet.py
│ │ │ ├── optimizers.py
│ │ │ └── wav2vecbert_extract.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ ├── arch_util.py
│ │ │ ├── checkpoint.py
│ │ │ ├── common.py
│ │ │ ├── feature_extractors.py
│ │ │ ├── front.py
│ │ │ ├── maskgct/
│ │ │ │ └── models/
│ │ │ │ ├── codec/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── amphion_codec/
│ │ │ │ │ │ ├── codec.py
│ │ │ │ │ │ ├── quantize/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── factorized_vector_quantize.py
│ │ │ │ │ │ │ ├── lookup_free_quantize.py
│ │ │ │ │ │ │ ├── residual_vq.py
│ │ │ │ │ │ │ └── vector_quantize.py
│ │ │ │ │ │ └── vocos.py
│ │ │ │ │ ├── codec_dataset.py
│ │ │ │ │ ├── codec_inference.py
│ │ │ │ │ ├── codec_sampler.py
│ │ │ │ │ ├── codec_trainer.py
│ │ │ │ │ ├── facodec/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── alias_free_torch/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── act.py
│ │ │ │ │ │ │ ├── filter.py
│ │ │ │ │ │ │ └── resample.py
│ │ │ │ │ │ ├── facodec_dataset.py
│ │ │ │ │ │ ├── facodec_inference.py
│ │ │ │ │ │ ├── facodec_trainer.py
│ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ ├── JDC/
│ │ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ │ ├── bst.t7
│ │ │ │ │ │ │ │ └── model.py
│ │ │ │ │ │ │ ├── attentions.py
│ │ │ │ │ │ │ ├── commons.py
│ │ │ │ │ │ │ ├── gradient_reversal.py
│ │ │ │ │ │ │ ├── layers.py
│ │ │ │ │ │ │ ├── quantize.py
│ │ │ │ │ │ │ ├── style_encoder.py
│ │ │ │ │ │ │ └── wavenet.py
│ │ │ │ │ │ └── optimizer.py
│ │ │ │ │ ├── kmeans/
│ │ │ │ │ │ ├── repcodec_model.py
│ │ │ │ │ │ └── vocos.py
│ │ │ │ │ ├── melvqgan/
│ │ │ │ │ │ └── melspec.py
│ │ │ │ │ ├── ns3_codec/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── alias_free_torch/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── act.py
│ │ │ │ │ │ │ ├── filter.py
│ │ │ │ │ │ │ └── resample.py
│ │ │ │ │ │ ├── facodec.py
│ │ │ │ │ │ ├── gradient_reversal.py
│ │ │ │ │ │ ├── melspec.py
│ │ │ │ │ │ ├── quantize/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── fvq.py
│ │ │ │ │ │ │ └── rvq.py
│ │ │ │ │ │ └── transformer.py
│ │ │ │ │ ├── speechtokenizer/
│ │ │ │ │ │ ├── model.py
│ │ │ │ │ │ └── modules/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── conv.py
│ │ │ │ │ │ ├── lstm.py
│ │ │ │ │ │ ├── norm.py
│ │ │ │ │ │ ├── quantization/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── ac.py
│ │ │ │ │ │ │ ├── core_vq.py
│ │ │ │ │ │ │ ├── distrib.py
│ │ │ │ │ │ │ └── vq.py
│ │ │ │ │ │ └── seanet.py
│ │ │ │ │ └── vevo/
│ │ │ │ │ └── vevo_repcodec.py
│ │ │ │ └── tts/
│ │ │ │ └── maskgct/
│ │ │ │ ├── ckpt/
│ │ │ │ │ └── wav2vec2bert_stats.pt
│ │ │ │ ├── llama_nar.py
│ │ │ │ └── maskgct_s2a.py
│ │ │ ├── maskgct_utils.py
│ │ │ ├── text_utils.py
│ │ │ ├── typical_sampling.py
│ │ │ ├── utils.py
│ │ │ ├── webui_utils.py
│ │ │ └── xtransformers.py
│ │ └── vqvae/
│ │ ├── __init__.py
│ │ └── xtts_dvae.py
│ ├── internvl/
│ │ ├── __init__.py
│ │ └── conversation.py
│ ├── llava/
│ │ ├── __init__.py
│ │ ├── conversation.py
│ │ ├── mm_utils.py
│ │ └── model/
│ │ ├── __init__.py
│ │ ├── clip_encoder/
│ │ │ ├── __init__.py
│ │ │ ├── builder.py
│ │ │ └── clip_encoder.py
│ │ ├── constants.py
│ │ ├── llava_arch.py
│ │ ├── llava_llama.py
│ │ └── multimodal_projector/
│ │ ├── __init__.py
│ │ └── builder.py
│ ├── matcha/
│ │ ├── VERSION
│ │ ├── __init__.py
│ │ ├── app.py
│ │ ├── cli.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── components/
│ │ │ │ └── __init__.py
│ │ │ └── text_mel_datamodule.py
│ │ ├── hifigan/
│ │ │ ├── LICENSE
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── config.py
│ │ │ ├── denoiser.py
│ │ │ ├── env.py
│ │ │ ├── meldataset.py
│ │ │ ├── models.py
│ │ │ └── xutils.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── baselightningmodule.py
│ │ │ ├── components/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── decoder.py
│ │ │ │ ├── flow_matching.py
│ │ │ │ ├── text_encoder.py
│ │ │ │ └── transformer.py
│ │ │ └── matcha_tts.py
│ │ ├── onnx/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ └── infer.py
│ │ ├── text/
│ │ │ ├── __init__.py
│ │ │ ├── cleaners.py
│ │ │ ├── numbers.py
│ │ │ └── symbols.py
│ │ ├── train.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── audio.py
│ │ ├── generate_data_statistics.py
│ │ ├── get_durations_from_trained_model.py
│ │ ├── instantiators.py
│ │ ├── logging_utils.py
│ │ ├── model.py
│ │ ├── monotonic_align/
│ │ │ ├── __init__.py
│ │ │ ├── core.pyx
│ │ │ └── setup.py
│ │ ├── pylogger.py
│ │ ├── rich_utils.py
│ │ └── utils.py
│ ├── megatts3/
│ │ ├── __init__.py
│ │ └── tts/
│ │ ├── frontend_function.py
│ │ ├── gradio_api.py
│ │ ├── infer_cli.py
│ │ ├── modules/
│ │ │ ├── aligner/
│ │ │ │ └── whisper_small.py
│ │ │ ├── ar_dur/
│ │ │ │ ├── ar_dur_predictor.py
│ │ │ │ └── commons/
│ │ │ │ ├── layers.py
│ │ │ │ ├── nar_tts_modules.py
│ │ │ │ ├── rel_transformer.py
│ │ │ │ ├── rot_transformer.py
│ │ │ │ ├── seq_utils.py
│ │ │ │ └── transformer.py
│ │ │ ├── llm_dit/
│ │ │ │ ├── cfm.py
│ │ │ │ ├── dit.py
│ │ │ │ ├── time_embedding.py
│ │ │ │ └── transformer.py
│ │ │ └── wavvae/
│ │ │ ├── decoder/
│ │ │ │ ├── diag_gaussian.py
│ │ │ │ ├── hifigan_modules.py
│ │ │ │ ├── seanet_encoder.py
│ │ │ │ └── wavvae_v3.py
│ │ │ └── encoder/
│ │ │ └── common_modules/
│ │ │ ├── conv.py
│ │ │ ├── lstm.py
│ │ │ └── seanet.py
│ │ └── utils/
│ │ ├── audio_utils/
│ │ │ ├── align.py
│ │ │ ├── io.py
│ │ │ └── plot.py
│ │ ├── commons/
│ │ │ ├── ckpt_utils.py
│ │ │ └── hparams.py
│ │ └── text_utils/
│ │ ├── dict.json
│ │ ├── ph_tone_convert.py
│ │ ├── split_text.py
│ │ └── text_encoder.py
│ ├── melo/
│ │ ├── __init__.py
│ │ ├── api.py
│ │ ├── app.py
│ │ ├── attentions.py
│ │ ├── commons.py
│ │ ├── configs/
│ │ │ └── config.json
│ │ ├── data/
│ │ │ └── example/
│ │ │ └── metadata.list
│ │ ├── data_utils.py
│ │ ├── download_utils.py
│ │ ├── infer.py
│ │ ├── init_downloads.py
│ │ ├── losses.py
│ │ ├── main.py
│ │ ├── mel_processing.py
│ │ ├── models.py
│ │ ├── modules.py
│ │ ├── monotonic_align/
│ │ │ ├── __init__.py
│ │ │ └── core.py
│ │ ├── preprocess_text.py
│ │ ├── split_utils.py
│ │ ├── text/
│ │ │ ├── __init__.py
│ │ │ ├── chinese.py
│ │ │ ├── chinese_bert.py
│ │ │ ├── chinese_mix.py
│ │ │ ├── cleaner.py
│ │ │ ├── cleaner_multiling.py
│ │ │ ├── cmudict.rep
│ │ │ ├── cmudict_cache.pickle
│ │ │ ├── english.py
│ │ │ ├── english_bert.py
│ │ │ ├── english_utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── abbreviations.py
│ │ │ │ ├── number_norm.py
│ │ │ │ └── time_norm.py
│ │ │ ├── es_phonemizer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── cleaner.py
│ │ │ │ ├── es_symbols.json
│ │ │ │ ├── es_symbols.txt
│ │ │ │ ├── es_symbols_v2.json
│ │ │ │ ├── es_to_ipa.py
│ │ │ │ ├── example_ipa.txt
│ │ │ │ ├── gruut_wrapper.py
│ │ │ │ ├── punctuation.py
│ │ │ │ ├── spanish_symbols.txt
│ │ │ │ └── test.ipynb
│ │ │ ├── fr_phonemizer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── cleaner.py
│ │ │ │ ├── en_symbols.json
│ │ │ │ ├── example_ipa.txt
│ │ │ │ ├── fr_symbols.json
│ │ │ │ ├── fr_to_ipa.py
│ │ │ │ ├── french_abbreviations.py
│ │ │ │ ├── french_symbols.txt
│ │ │ │ ├── gruut_wrapper.py
│ │ │ │ └── punctuation.py
│ │ │ ├── french.py
│ │ │ ├── french_bert.py
│ │ │ ├── japanese.py
│ │ │ ├── japanese_bert.py
│ │ │ ├── ko_dictionary.py
│ │ │ ├── korean.py
│ │ │ ├── opencpop-strict.txt
│ │ │ ├── spanish.py
│ │ │ ├── spanish_bert.py
│ │ │ ├── symbols.py
│ │ │ └── tone_sandhi.py
│ │ ├── train.py
│ │ ├── train.sh
│ │ ├── transforms.py
│ │ └── utils.py
│ ├── mlx/
│ │ ├── __init__.py
│ │ └── flux/
│ │ ├── __init__.py
│ │ ├── autoencoder.py
│ │ ├── clip.py
│ │ ├── datasets.py
│ │ ├── flux.py
│ │ ├── layers.py
│ │ ├── lora.py
│ │ ├── model.py
│ │ ├── sampler.py
│ │ ├── t5.py
│ │ ├── tokenizers.py
│ │ ├── trainer.py
│ │ └── utils.py
│ └── whisper/
│ ├── __init__.py
│ ├── __main__.py
│ ├── assets/
│ │ ├── gpt2.tiktoken
│ │ ├── mel_filters.npz
│ │ └── multilingual.tiktoken
│ ├── audio.py
│ ├── decoding.py
│ ├── model.py
│ ├── normalizers/
│ │ ├── __init__.py
│ │ ├── basic.py
│ │ ├── english.json
│ │ └── english.py
│ ├── timing.py
│ ├── tokenizer.py
│ ├── transcribe.py
│ ├── triton_ops.py
│ ├── utils.py
│ └── version.py
├── types.py
├── ui/
│ ├── __init__.py
│ ├── gradio/
│ │ ├── __init__.py
│ │ ├── chat_interface.py
│ │ ├── media_interface.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ └── latex.py
│ └── web/
│ └── ui/
│ ├── .eslintignore
│ ├── .eslintrc.yml
│ ├── .gitignore
│ ├── .prettierignore
│ ├── .prettierrc.yml
│ ├── package.json
│ ├── public/
│ │ └── index.html
│ └── src/
│ ├── App.js
│ ├── components/
│ │ ├── MenuSide.js
│ │ ├── Title.js
│ │ ├── alertComponent.js
│ │ ├── apiContext.js
│ │ ├── authAlertDialog.js
│ │ ├── copyComponent.js
│ │ ├── deleteDialog.js
│ │ ├── errorMessageSnackBar.js
│ │ ├── fetchWrapper.js
│ │ ├── fetcher.js
│ │ ├── hotkeyFocusTextField.js
│ │ ├── successMessageSnackBar.js
│ │ ├── tableTitle.js
│ │ ├── themeButton.js
│ │ ├── themeContext.js
│ │ ├── titleTypography.js
│ │ ├── translateButton.js
│ │ ├── utils.js
│ │ └── versionLabel.js
│ ├── i18n.js
│ ├── index.css
│ ├── index.js
│ ├── locales/
│ │ ├── en.json
│ │ ├── ja.json
│ │ ├── ko.json
│ │ └── zh.json
│ ├── router/
│ │ └── index.js
│ ├── scenes/
│ │ ├── _layout/
│ │ │ └── index.js
│ │ ├── cluster_info/
│ │ │ ├── index.js
│ │ │ ├── nodeInfo.js
│ │ │ └── style.js
│ │ ├── launch_model/
│ │ │ ├── LaunchModel.js
│ │ │ ├── components/
│ │ │ │ ├── cachedListDialog.js
│ │ │ │ ├── commandBuilder.js
│ │ │ │ ├── dynamicFieldList.js
│ │ │ │ ├── editCustomModelDialog.js
│ │ │ │ ├── launchModelDrawer.js
│ │ │ │ ├── modelFormConfig.js
│ │ │ │ ├── pasteDialog.js
│ │ │ │ ├── progress.js
│ │ │ │ ├── selectField.js
│ │ │ │ └── virtualenvListDialog.js
│ │ │ ├── data/
│ │ │ │ └── data.js
│ │ │ ├── index.js
│ │ │ ├── launchCustom.js
│ │ │ ├── modelCard.js
│ │ │ └── styles/
│ │ │ └── modelCardStyle.css
│ │ ├── login/
│ │ │ ├── header.js
│ │ │ └── login.js
│ │ ├── register_model/
│ │ │ ├── components/
│ │ │ │ ├── addControlnet.js
│ │ │ │ ├── addModelSpecs.js
│ │ │ │ ├── addStop.js
│ │ │ │ └── addVirtualenv.js
│ │ │ ├── data/
│ │ │ │ └── languages.js
│ │ │ ├── index.js
│ │ │ ├── registerModel.js
│ │ │ └── styles/
│ │ │ └── registerModelStyle.css
│ │ └── running_models/
│ │ └── index.js
│ └── theme.js
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
# Paths excluded from the Docker build context: documentation, IDE and CI
# metadata, and local build artifacts.
doc/
.idea/
.github/
build/
xinference.egg-info/
# The web UI sources live under xinference/ui/web/ui; exclude its local build
# output and installed npm dependencies.
xinference/ui/web/ui/build/
xinference/ui/web/ui/node_modules/
================================================
FILE: .gitattributes
================================================
# Have git expand $Format:...$ placeholders in this file when the tree is
# exported with `git archive` (export-subst attribute).
xinference/_version.py export-subst
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.yaml
================================================
name: "Bug Report"
description: Submit a bug report to help us improve Xinference. You should provide as much useful information as possible rather than simply describing what happened. / 提交一个问题报告来帮助我们改进 Xinference。你必须提供有用的信息而不只是描述发生的现象,否则将不予处理。
body:
- type: textarea
id: system-info
attributes:
label: System Info / 系统信息
description: Your operating environment / 您的运行环境信息
placeholder: Includes Cuda version, transformers / xllamacpp / vllm version, Python version, operating system... / 包括Cuda版本,transformers / xllamacpp / vllm版本,Python版本,操作系统等。
validations:
required: true
- type: checkboxes
id: information-scripts-examples
attributes:
label: Running Xinference with Docker? / 是否使用 Docker 运行 Xinference?
description: 'How are you using Xinference? / 以何种方式使用 Xinference?'
options:
- label: docker / docker
- label: pip install / 通过 pip install 安装
- label: installation from source / 从源码安装
- type: textarea
id: start-way
attributes:
label: Version info / 版本信息
description: The version of Xinference you are running / Xinference 版本
validations:
required: true
- type: textarea
id: commandline
attributes:
label: The command used to start Xinference / 用以启动 xinference 的命令
description: |
Please provide the command used to start Xinference.
If it is a distributed scenario, the commands for starting the supervisor and worker need to be listed separately.
If it is a Docker scenario, please provide the complete command for starting Xinference through Docker.
If it is another method, please describe it specifically.
请提供启动 xinference 的命令。
如果是分布式场景,启动 supervisor 和 worker 的命令需要分别列出。
如果是docker场景,请提供通过 docker 启动 xinference 的完整命令。
如果是其他方式,请具体描述。
validations:
required: true
- type: textarea
id: reproduction
validations:
required: true
attributes:
label: Reproduction / 复现过程
description: |
Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit.
If you have code snippets, error messages, stack traces, please provide them here as well.
Please format your code correctly using code tags. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code.
请提供能重现您遇到的问题的代码示例,最好是最小复现单元。
如果您有代码片段、错误信息、堆栈跟踪、涉及的命令行操作等也请在此提供。
请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
请勿使用截图,因为截图难以阅读,而且(更重要的是)不允许他人复制粘贴您的代码。
placeholder: |
Steps to reproduce the behavior/复现Bug的步骤:
1.
2.
3.
- type: textarea
id: expected-behavior
validations:
required: true
attributes:
label: Expected behavior / 期待表现
description: "A clear and concise description of what you would expect to happen. / 简单描述您期望发生的事情。"
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.yaml
================================================
name: "Feature request"
description: Submit a request for a new Xinference feature / 提交一个新的 Xinference 的功能建议
labels: [ "feature" ]
body:
- type: textarea
id: feature-request
validations:
required: true
attributes:
label: Feature request / 功能建议
description: |
A brief description of the functional proposal.
对功能建议的简述。
- type: textarea
id: motivation
validations:
required: true
attributes:
label: Motivation / 动机
description: |
Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here.
您提出建议的动机。如果该动机与另一个 GitHub 问题有关,请在此处提供对应的链接。
- type: textarea
id: contribution
validations:
required: true
attributes:
label: Your contribution / 您的贡献
description: |
Your PR link or any other link you can help with.
您的PR链接或者其他您能提供帮助的链接。
================================================
FILE: .github/workflows/assign.yaml
================================================
# Self-assignment workflow: reacts to newly created issue comments and, when the
# comment body equals 'take', adds the commenter to the issue's assignees.
name: Assign
on:
issue_comment:
types: created
# Workflow-level token permissions; the job below declares its own set.
permissions:
contents: read
jobs:
issue_assign:
permissions:
issues: write
pull-requests: write
runs-on: ubuntu-22.04
steps:
# The script logs the issue number and commenter, then sends the commenter's
# login as JSON to the issue's /assignees REST endpoint using GITHUB_TOKEN.
- if: github.event.comment.body == 'take'
run: |
echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
================================================
FILE: .github/workflows/docker-cd.yaml
================================================
name: Xinference CD for DockerHub
on:
schedule:
- cron: '0 18 * * *'
push:
tags:
- '*'
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
timeout-minutes: 240
runs-on: self-hosted
strategy:
matrix:
python-version: [ "3.10" ]
steps:
- name: Check out code
uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: recursive
# Log in only from the upstream repository. The build/push and cleanup steps
# of this job are gated by the same condition; without it a run in a fork
# (which has no Docker Hub secrets) fails here instead of skipping cleanly.
- name: Log in to Docker Hub
  if: ${{ github.repository == 'xorbitsai/inference' }}
  uses: docker/login-action@v1
  with:
    username: ${{ secrets.DOCKERHUB_USERNAME }}
    password: ${{ secrets.DOCKERHUB_PASSWORD }}
# Builds and pushes the default and -cpu images from the two Dockerfiles under
# xinference/deploy/docker. For a tag push (GITHUB_REF under refs/tags/) the
# images are tagged with the git tag and additionally pushed as latest /
# latest-cpu; otherwise the currently checked-out branch is built and tagged
# nightly-<branch>. The chosen tags are exported through GITHUB_ENV.
- name: Build and push Docker image
shell: bash
if: ${{ github.repository == 'xorbitsai/inference' }}
env:
DOCKER_ORG: ${{ secrets.DOCKERHUB_USERNAME }}
PY_VERSION: ${{ matrix.python-version }}
run: |
if [[ "$GITHUB_REF" =~ ^"refs/tags/" ]]; then
export GIT_TAG=$(echo "$GITHUB_REF" | sed -e "s/refs\/tags\///g")
fi
docker system prune -f -a
if [[ -n "$GIT_TAG" ]]; then
BRANCHES="$GIT_TAG"
echo "Will handle tag $BRANCHES"
else
MAINBRANCH=$(git rev-parse --abbrev-ref HEAD)
BRANCHES="$MAINBRANCH"
fi
for branch in $BRANCHES; do
if [[ -n "$GIT_TAG" ]]; then
export IMAGE_TAG="$GIT_TAG"
else
git checkout $branch
export IMAGE_TAG="nightly-$branch"
fi
docker build -t "$DOCKER_ORG/xinference:${IMAGE_TAG}" --progress=plain -f xinference/deploy/docker/Dockerfile .
docker push "$DOCKER_ORG/xinference:${IMAGE_TAG}"
docker build -t "$DOCKER_ORG/xinference:${IMAGE_TAG}-cpu" --progress=plain -f xinference/deploy/docker/Dockerfile.cpu .
docker push "$DOCKER_ORG/xinference:${IMAGE_TAG}-cpu"
echo "XINFERENCE_IMAGE_TAG=${IMAGE_TAG}" >> $GITHUB_ENV
done
if [[ -n "$GIT_TAG" ]]; then
docker tag "$DOCKER_ORG/xinference:${GIT_TAG}" "$DOCKER_ORG/xinference:latest"
docker push "$DOCKER_ORG/xinference:latest"
docker tag "$DOCKER_ORG/xinference:${GIT_TAG}-cpu" "$DOCKER_ORG/xinference:latest-cpu"
docker push "$DOCKER_ORG/xinference:latest-cpu"
echo "XINFERENCE_GIT_TAG=${GIT_TAG}" >> $GITHUB_ENV
fi
- name: Clean docker image cache
shell: bash
if: ${{ github.repository == 'xorbitsai/inference' }}
run: |
docker system prune -f -a
================================================
FILE: .github/workflows/issue.yaml
================================================
# Marks inactive issues as stale and later closes them, using actions/stale.
# Runs on the cron schedule below and can also be started manually.
name: Close inactive issues
on:
schedule:
- cron: "0 19 * * *"
workflow_dispatch:
jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
# Issues receive the "stale" label after 14 days and are closed 10 days later.
# Both pull-request thresholds are -1.
# NOTE(review): per actions/stale, -1 should disable PR handling — confirm
# against the action's documentation.
- uses: actions/stale@v9
with:
days-before-issue-stale: 14
days-before-issue-close: 10
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 14 days with no activity."
close-issue-message: "This issue was closed because it has been inactive for 10 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
operations-per-run: 500
repo-token: ${{ secrets.GITHUB_TOKEN }}
================================================
FILE: .github/workflows/pr_auto_run_gen_docs.yaml
================================================
name: Auto run gen_docs.py and commit changes to PR
on:
pull_request_target:
types: [opened, synchronize]
permissions:
contents: write
pull-requests: write
jobs:
run-gen-docs-and-commit:
if: startsWith(github.event.pull_request.head.ref, 'chore/models-sync/')
runs-on: ubuntu-latest
steps:
- name: Checkout base repository (trusted scripts)
uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.base.ref }}
repository: ${{ github.repository }}
path: main
fetch-depth: 0
- name: Checkout PR head branch (working copy)
uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.head.ref }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
path: pr
fetch-depth: 0
- name: Decide whether to run gen_docs for latest commit
id: decide
working-directory: pr
run: |
set -e
MSG="$(git log -1 --pretty=%B || echo "")"
echo "Latest commit message: $MSG"
if echo "$MSG" | grep -Eiq '\[(skip ci|ci skip)\]'; then
echo "Skip token found in commit message; will not run."
echo "run=false" >> $GITHUB_OUTPUT
exit 0
fi
HEAD_SHA="$(git rev-parse HEAD)"
BASE_SHA="${{ github.event.pull_request.base.sha }}"
RANGE="$BASE_SHA...$HEAD_SHA"
echo "Diff range (full PR): $RANGE"
CHANGED_FILES="$(git diff --name-only "$RANGE" || true)"
echo "Changed files in PR range:"
echo "$CHANGED_FILES"
RUN="false"
for f in $CHANGED_FILES; do
case "$f" in
xinference/model/llm/llm_family.json|xinference/model/embedding/model_spec.json|xinference/model/rerank/model_spec.json|xinference/model/image/model_spec.json|xinference/model/audio/model_spec.json|xinference/model/video/model_spec.json)
RUN="true"; break;;
esac
done
echo "run=$RUN" >> $GITHUB_OUTPUT
- name: Set up Python
if: steps.decide.outputs.run == 'true'
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install gen_docs dependencies
if: steps.decide.outputs.run == 'true'
run: |
python -m pip install --upgrade pip
python -m pip install jinja2
python -m pip install "xinference[doc]"
# Runs the doc generator inside the PR working copy. Lookup order: the PR's
# doc/source/gen_docs.py, the base checkout's doc/source/gen_docs.py (copied
# into the PR tree), the PR's top-level gen_docs.py, then the base checkout's
# top-level gen_docs.py; if none exists the step only prints a message.
# SECURITY NOTE(review): this workflow is triggered by pull_request_target and
# requests contents/pull-requests write permissions, yet the first and third
# branches execute a script taken from the PR head checkout. Confirm that only
# trusted branches can reach this step, or restrict execution to the script
# from the base checkout.
- name: Run gen_docs.py if present
if: steps.decide.outputs.run == 'true'
working-directory: pr
run: |
echo "[Debug] CWD: $(pwd)"
echo "[Debug] List ../main:"
ls -la ../main || true
echo "[Debug] List ../main/doc/source:"
ls -la ../main/doc/source || true
# Use PR branch's gen_docs.py if it exists, otherwise use main branch's
if [ -f "doc/source/gen_docs.py" ]; then
echo "Using PR branch's doc/source/gen_docs.py"
echo "Running pr/doc/source/gen_docs.py from its directory"
(cd doc/source && python -u gen_docs.py)
elif [ -f "../main/doc/source/gen_docs.py" ]; then
echo "Copying main/doc/source/gen_docs.py into PR workspace"
mkdir -p doc/source
cp -f ../main/doc/source/gen_docs.py doc/source/gen_docs.py
echo "Running pr/doc/source/gen_docs.py from its directory"
(cd doc/source && python -u gen_docs.py)
elif [ -f "gen_docs.py" ]; then
echo "Using PR branch's gen_docs.py"
echo "Running pr/gen_docs.py"
python -u gen_docs.py
elif [ -f "../main/gen_docs.py" ]; then
echo "Copying main/gen_docs.py into PR workspace"
cp -f ../main/gen_docs.py gen_docs.py
echo "Running pr/gen_docs.py"
python -u gen_docs.py
else
echo "gen_docs.py not found in main repository, skipping."
fi
- name: Stage and commit changes back to PR branch
if: steps.decide.outputs.run == 'true'
working-directory: pr
run: |
echo "[Debug] Before staging:" && git status --porcelain
echo "[Debug] check-ignore for generated file:"
git check-ignore -v doc/source/_generated/auto_generated.txt || echo "Not ignored"
git add -A
git add -f doc/source/_generated || true
echo "[Debug] After staging:" && git status --porcelain
echo "[Debug] Staged diff:" && git diff --cached --name-status || true
if ! git diff --cached --quiet; then
git config user.name "github-actions[bot]"
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
git commit -m "chore(docs): auto-run gen_docs.py"
else
echo "No changes to commit."
fi
# Pushes the auto-generated commit back to the PR branch when the PR comes from
# this repository. The push stays best-effort (the step does not fail), but a
# failed push is surfaced as a workflow warning: `git push` exits 0 when there
# is nothing to push, so the fallback branch only runs on a real failure.
- name: Push back for same-repo PR
  env:
    BRANCH: ${{ github.event.pull_request.head.ref }}
  if: steps.decide.outputs.run == 'true' && github.event.pull_request.head.repo.full_name == github.repository
  working-directory: pr
  run: |
    echo "Pushing changes to same-repo PR..."
    git push origin "HEAD:${BRANCH}" || echo "::warning::git push to ${BRANCH} failed; generated docs were not pushed."
# Pushes the auto-generated commit to a fork's PR branch using the PUSH_TOKEN
# secret, when the fork allows maintainer edits. A missing token skips the push.
# The push stays best-effort (the step does not fail), but a failed push is
# surfaced as a workflow warning: `git push` exits 0 when there is nothing to
# push, so the fallback branch only runs on a real failure.
- name: Push back for fork PR using maintainer PAT
  if: steps.decide.outputs.run == 'true' && github.event.pull_request.head.repo.full_name != github.repository && github.event.pull_request.maintainer_can_modify
  env:
    PUSH_TOKEN: ${{ secrets.PUSH_TOKEN }}
    BRANCH: ${{ github.event.pull_request.head.ref }}
    HEAD_FULL_NAME: ${{ github.event.pull_request.head.repo.full_name }}
  working-directory: pr
  run: |
    if [ -z "$PUSH_TOKEN" ]; then
      echo "Missing secrets.PUSH_TOKEN; cannot push to fork. Skipping push."
      exit 0
    fi
    echo "Pushing changes to fork PR using maintainer PAT..."
    git remote set-url origin "https://x-access-token:${PUSH_TOKEN}@github.com/${HEAD_FULL_NAME}.git"
    git push origin "HEAD:${BRANCH}" || echo "::warning::git push to ${HEAD_FULL_NAME}:${BRANCH} failed; generated docs were not pushed."
# Informational step for fork PRs that do not allow maintainer edits: gen_docs
# ran, but the result cannot be pushed, so only a message is printed. The
# condition requires run == 'true', matching the two push steps, because the
# message reports a run that succeeded.
- name: Skip push for fork PR without maintainer edit permission
  if: steps.decide.outputs.run == 'true' && github.event.pull_request.head.repo.full_name != github.repository && !github.event.pull_request.maintainer_can_modify
  run: |
    echo "Fork PR does not allow edits by maintainers; run succeeded but skip pushing commits."
================================================
FILE: .github/workflows/python.yaml
================================================
name: Python CI
on:
push:
branches:
- '*'
pull_request:
types: ['opened', 'reopened', 'synchronize']
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
lint:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ "ubuntu-latest" ]
python-version: [ "3.10" ]
steps:
- name: Check out code
uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: recursive
- name: Set up Python environment
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install pre-commit
run: pip install pre-commit
- name: Run pre-commit
run: pre-commit run --all-files
- name: Set up Node.js
uses: actions/setup-node@v1
with:
node-version: 16
# ESLint and Prettier must be in `package.json`
- name: Install Node.js dependencies
run: cd xinference/ui/web/ui && npm ci
- name: ESLint Check
run: cd xinference/ui/web/ui && npx eslint .
- name: Prettier Check
run: cd xinference/ui/web/ui && ./node_modules/.bin/prettier --check .
build_test_job:
runs-on: ${{ matrix.os }}
needs: lint
env:
CONDA_ENV: test
SELF_HOST_PYTHON: /root/miniconda3/envs/inference_test/bin/python
SELF_HOST_CONDA: /root/miniconda3/condabin/conda
defaults:
run:
shell: bash -l {0}
# Test matrix: every OS x Python version for the "xinference" module, minus the
# excluded macOS/Windows combinations, plus dedicated GPU (self-hosted) and
# Metal (macOS) entries.
# Python versions are quoted everywhere so YAML keeps them as strings (an
# unquoted 3.10 would be read as the number 3.1) and the exclude entries use
# the same type as the matrix values they are meant to match.
strategy:
  fail-fast: false
  matrix:
    os: [ "ubuntu-latest", "macos-latest", "windows-latest" ]
    python-version: [ "3.10", "3.11", "3.12", "3.13" ]
    module: [ "xinference" ]
    exclude:
      - { os: macos-latest, python-version: "3.11" }
      - { os: macos-latest, python-version: "3.12" }
      - { os: windows-latest, python-version: "3.11" }
      - { os: windows-latest, python-version: "3.12" }
    include:
      - { os: self-hosted, module: gpu, python-version: "3.11" }
      - { os: macos-latest, module: metal, python-version: "3.10" }
steps:
- name: Check out code
uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: recursive
- name: Set up conda ${{ matrix.python-version }}
uses: conda-incubator/setup-miniconda@v3
if: ${{ matrix.module != 'gpu' }}
with:
python-version: ${{ matrix.python-version }}
activate-environment: ${{ env.CONDA_ENV }}
# Important for python == 3.12 and 3.13
- name: Update pip and setuptools
if: ${{ matrix.python-version == '3.12' || matrix.python-version == '3.13' }}
run: |
python -m pip install -U pip "setuptools<82"
# Install torch, torchvision and torchaudio for Python 3.13 from the default package index
- name: Install torch for Python 3.13
if: ${{ matrix.python-version == '3.13'}}
run: |
python -m pip install torch torchvision torchaudio
- name: Install numpy
if: |
(startsWith(matrix.os, 'macos') && (matrix.python-version == '3.13')) ||
(startsWith(matrix.os, 'windows'))
run: |
python -m pip install "numpy<2"
- name: Install dependencies
env:
MODULE: ${{ matrix.module }}
OS: ${{ matrix.os }}
if: ${{ matrix.module != 'gpu' }}
run: |
if [ "$OS" == "ubuntu-latest" ]; then
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
fi
pip install -e ".[dev]"
pip install "xllamacpp>=0.2.0"
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
pip install mlx-lm
pip install "mlx-vlm>=0.3.4"
pip install mlx-whisper
pip install f5-tts-mlx
pip install qwen-vl-utils!=0.0.9
pip install tomli
else
pip install "transformers<4.49"
pip install attrdict
pip install "timm>=0.9.16"
if [ "${{ matrix.python-version }}" != "3.13" ]; then
pip install torch torchvision
fi
pip install accelerate
pip install sentencepiece
pip install transformers_stream_generator
pip install bitsandbytes
pip install "sentence-transformers>=5.1.1"
pip install modelscope
pip install diffusers
pip install protobuf
pip install FlagEmbedding
pip install "tenacity>=8.2.0,<8.4.0"
pip install "jinja2==3.1.2"
pip install jj-pytorchvideo
pip install qwen-vl-utils!=0.0.9
pip install datamodel_code_generator
pip install jsonschema
fi
working-directory: .
- name: Clean up disk
if: |
(startsWith(matrix.os, 'ubuntu'))
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
df -h
# Windows runners only: points SSL_CERT_FILE, REQUESTS_CA_BUNDLE and
# CURL_CA_BUNDLE at the CA bundle shipped with certifi, for this step and for
# every later step of the job.
- name: Fix SSL on Windows
  if: startsWith(matrix.os, 'windows')
  shell: bash
  run: |
    echo "activate conda env"
    # Activation is best-effort; the step continues with whatever python is on PATH.
    source $CONDA/etc/profile.d/conda.sh || true
    conda activate $CONDA_ENV || true
    python -V
    which python
    echo "before: $SSL_CERT_FILE"
    python -m pip install --quiet certifi || true
    SSL_CERT_FILE=$(python -c "import certifi,os;print(os.path.normpath(certifi.where()))")
    export SSL_CERT_FILE
    export REQUESTS_CA_BUNDLE=$SSL_CERT_FILE
    export CURL_CA_BUNDLE=$SSL_CERT_FILE
    echo "after: $SSL_CERT_FILE"
    # `export` only lasts for this step's shell. Writing to GITHUB_ENV is what
    # makes the variables visible to later steps, so persist all three from the
    # single normalized path computed above.
    {
      echo "SSL_CERT_FILE=$SSL_CERT_FILE"
      echo "REQUESTS_CA_BUNDLE=$SSL_CERT_FILE"
      echo "CURL_CA_BUNDLE=$SSL_CERT_FILE"
    } >> "$GITHUB_ENV"
# Runs the test suite selected by the matrix "module":
#   gpu   - installs dependencies with the self-hosted interpreter, then runs
#           the listed test files one by one
#   metal - runs the MLX-specific test files
#   other - runs the whole `xinference` package minus the --ignore'd paths
- name: Test with pytest
  env:
    MODULE: ${{ matrix.module }}
    PYTORCH_MPS_HIGH_WATERMARK_RATIO: 1.0
    PYTORCH_MPS_LOW_WATERMARK_RATIO: 0.2
    XFORMERS_FORCE_DISABLE_TRITON: 1
    TORCH_DISABLE_FLASH_ATTENTION: 1
  run: |
    if [ "$MODULE" == "gpu" ]; then
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U -e ".[audio,dev]"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "openai>1"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U modelscope
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U gguf
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U uv
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U sse_starlette
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U xoscar
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "python-jose[cryptography]"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "passlib[bcrypt]"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "aioprometheus[starlette]"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "pynvml"
      ${{ env.SELF_HOST_PYTHON }} -m pip install "transformers==4.53.2"
      ${{ env.SELF_HOST_PYTHON }} -m pip install "funasr==1.2.7"
      # Specifiers containing "<" must be quoted; unquoted, the shell parses
      # "<1.1.0" as an input redirection from a file named "1.1.0".
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "nemo_text_processing<1.1.0"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "omegaconf~=2.3.0"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "WeTextProcessing<1.0.4"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U librosa
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U xxhash
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2.1"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U HyperPyYAML
      ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y matcha-tts
      # The environment marker is part of the requirement string; unquoted, the
      # ";" would end the pip command and run "sys_platform" as its own command.
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "onnxruntime-gpu==1.16.0; sys_platform == 'linux'"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U openai-whisper
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "torch==2.7.0" "torchaudio==2.7.0" "torchvision==0.22.0"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "loguru"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "natsort"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "loralib"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ormsgpack"
      ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y opencc
      ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y "faster_whisper"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U accelerate
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U verovio
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U cachetools
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U silero-vad
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U pydantic
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U diffusers
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U onnx
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U onnxconverter_common
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U torchdiffeq
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "x_transformers>=1.31.14"
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U pypinyin
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U tomli
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U vocos
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U jieba
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U soundfile
      ${{ env.SELF_HOST_PYTHON }} -m pip install tensorizer
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U sentence-transformers
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U FlagEmbedding
      ${{ env.SELF_HOST_PYTHON }} -m pip install -U "peft<=0.17.1"
      ${{ env.SELF_HOST_PYTHON }} -m pip install "xllamacpp>=0.2.0" --index-url https://xorbitsai.github.io/xllamacpp/whl/cu124 --extra-index-url https://pypi.org/simple
      # Each test file runs in its own pytest process; the "&&" chain stops at
      # the first failing file. (test_f5tts.py used to be listed twice.)
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        --disable-warnings \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/core/tests/test_continuous_batching.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/embedding/tests/test_qwen3_vl_engine_params.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/rerank/tests/test_qwen3_vl_reranker_virtualenv.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_stable_diffusion.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_got_ocr2.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_funasr.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_chattts.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_cosyvoice.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_f5tts.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_melotts.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_kokoro.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_fish_speech.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_megatts.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/embedding/tests/test_integrated_embedding.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/embedding/vllm/tests/test_vllm_embedding.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/transformers/tests/test_tensorizer.py && \
      ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/tests/test_llm_model.py
    elif [ "$MODULE" == "metal" ]; then
      pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/mlx/tests/test_mlx.py && \
      pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper_mlx.py && \
      pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_f5tts_mlx.py && \
      pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/mlx/tests/test_distributed_model.py
    else
      # Everything under xinference/ except the paths exercised by the gpu and
      # metal branches above and the other explicitly ignored suites.
      pytest --timeout=3000 \
        -W ignore::PendingDeprecationWarning \
        -vv \
        --cov-config=setup.cfg \
        --cov-report=xml \
        --cov=xinference \
        --ignore xinference/core/tests/test_continuous_batching.py \
        --ignore xinference/model/image/tests/test_stable_diffusion.py \
        --ignore xinference/model/image/tests/test_got_ocr2.py \
        --ignore xinference/model/audio/tests \
        --ignore xinference/model/embedding/tests/test_integrated_embedding.py \
        --ignore xinference/model/llm/transformers/tests/test_tensorizer.py \
        --ignore xinference/model/llm/tests/test_llm_model.py \
        --ignore xinference/model/llm/vllm \
        --ignore xinference/model/llm/sglang \
        --ignore xinference/client/tests/test_client.py \
        --ignore xinference/client/tests/test_async_client.py \
        --ignore xinference/model/llm/mlx \
        xinference
    fi
  working-directory: .
================================================
FILE: .github/workflows/release.yaml
================================================
# Release workflow: on every pushed tag, build the web UI, an sdist and a
# wheel, then upload the contents of dist/ to a package index.
name: Build and upload to PyPI
on:
push:
tags:
- '*'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build-publish:
name: Build and publish Python distribution to PyPI
runs-on: ubuntu-latest
steps:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- uses: actions/checkout@v3
- name: Install pypa/build
run: >-
python3 -m
pip install
build "setuptools<82"
--user
- name: Build web
run: >-
python setup.py build_web
- name: Build a binary wheel and a source tarball
run: >-
python3 -m
build
--sdist
--wheel
--outdir dist/
.
# In the xorbitsai/inference repository: publish to the action's default index (PyPI).
- uses: pypa/gh-action-pypi-publish@v1.5.0
if: github.repository == 'xorbitsai/inference'
with:
user: __token__
password: ${{ secrets.PYPI_PASSWORD }}
# In any other repository (for example a fork): publish to TestPyPI instead.
- uses: pypa/gh-action-pypi-publish@v1.5.0
if: github.repository != 'xorbitsai/inference'
with:
user: __token__
password: ${{ secrets.TEST_PYPI_PASSWORD }}
verbose: true
repository_url: https://test.pypi.org/legacy/
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
generated/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# IDEs
.idea
.vscode
*.iml
# VIM
*.sw*
# web stuff
node_modules/
static/
# Local docs (project notes, refactoring plans, etc.)
docs/
# doc
doc/source/savefig/
# local env
local_env
asv/results
.DS_Store
# Exclude markdown files except README files
*.md
!README.md
!README_*.md
================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit configuration.
# The top-level `files` pattern limits every hook to paths matching "xinference";
# each hook additionally excludes the vendored third-party sources.
files: xinference
repos:
# Code formatting.
- repo: https://github.com/psf/black
rev: 25.1.0
hooks:
- id: black
exclude: thirdparty
# Whitespace hygiene.
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: end-of-file-fixer
exclude: ^xinference/thirdparty
- id: trailing-whitespace
exclude: thirdparty
# Linting, configured from setup.cfg.
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
- id: flake8
args: [--config, setup.cfg]
exclude: thirdparty
# Import ordering, configured from setup.cfg.
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args: [--sp, setup.cfg]
exclude: thirdparty
# Static type checking.
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.15.0
hooks:
- id: mypy
additional_dependencies: ["tokenize-rt==3.2.0", "types-requests", "types-tabulate"]
args: [--ignore-missing-imports, --follow-imports, skip]
exclude: thirdparty
# Spell checking, configured from setup.cfg.
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
hooks:
- id: codespell
args: [ --config, setup.cfg]
exclude: thirdparty
================================================
FILE: .readthedocs.yaml
================================================
# Read the Docs build configuration.
version: 2
# Build the documentation with Sphinx, using the configuration in doc/source/conf.py
sphinx:
configuration: doc/source/conf.py
build:
os: ubuntu-20.04
tools:
python: "3.10"
# Install this package with pip, including its "doc" extra.
python:
install:
- method: pip
path: .
extra_requirements:
- doc
# Check out all git submodules, recursively.
submodules:
include: all
recursive: true
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
# Source-distribution manifest: Cython sources, JSON data files, build
# metadata, vendored third-party code and the built web UI are included;
# generated C/C++ files, .DS_Store and conftest.py are excluded.
global-include *.pyx
global-include *.pxd
global-include xinference/**/*.json
global-exclude *.c
global-exclude *.cpp
include setup.cfg
include pyproject.toml
global-exclude .DS_Store
include versioneer.py
include xinference/_version.py
global-exclude conftest.py
include xinference/locale/*.json
include xinference/model/llm/*.json
include xinference/model/embedding/*.json
# Ship the whole vendored third-party tree.
graft xinference/thirdparty
global-include xinference/ui/web/ui/build/**/*
================================================
FILE: README.md
================================================

# Xorbits Inference: Model Serving Made Easy 🤖
Xinference Enterprise ·
Self-hosting ·
Documentation
[](https://pypi.org/project/xinference/)
[](https://github.com/xorbitsai/inference/blob/main/LICENSE)
[](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main)
[](https://hub.docker.com/r/xprobe/xinference)
[](https://discord.gg/Xw9tszSkr5)
[](https://twitter.com/xorbitsio)
Xorbits Inference(Xinference) is a powerful and versatile library designed to serve language,
speech recognition, and multimodal models. With Xorbits Inference, you can effortlessly deploy
and serve your own or state-of-the-art built-in models using just a single command. Whether you are a
researcher, developer, or data scientist, Xorbits Inference empowers you to unleash the full
potential of cutting-edge AI models.
## 🔥 Hot Topics
### Framework Enhancements
- Agent-native Serving: Xinference integrates with [Xagent](https://github.com/xorbitsai/xagent) to enable dynamic planning, tool use, and autonomous multi-step reasoning — moving beyond static pipelines.
- Auto batch: Multiple concurrent requests are automatically batched, significantly improving throughput: [#4197](https://github.com/xorbitsai/inference/pull/4197)
- [Xllamacpp](https://github.com/xorbitsai/xllamacpp): a new llama.cpp Python binding maintained by the Xinference team; it supports continuous batching and is more production-ready: [#2997](https://github.com/xorbitsai/inference/pull/2997)
- Distributed inference: running models across workers: [#2877](https://github.com/xorbitsai/inference/pull/2877)
- VLLM enhancement: Shared KV cache across multiple replicas: [#2732](https://github.com/xorbitsai/inference/pull/2732)
### New Models
- Built-in support for [Qwen-3.5](https://github.com/QwenLM/Qwen3.5): [#4639](https://github.com/xorbitsai/inference/pull/4639)
- Built-in support for [GLM-5](https://github.com/zai-org/GLM-5): [#4638](https://github.com/xorbitsai/inference/pull/4638)
- Built-in support for [MiniMax-M2.5](https://github.com/MiniMax-AI/MiniMax-M2.5): [#4630](https://github.com/xorbitsai/inference/pull/4630)
- Built-in support for [Kimi-K2.5](https://github.com/MoonshotAI/Kimi-K2.5): [#4631](https://github.com/xorbitsai/inference/pull/4631)
- Built-in support for [FLUX.2-Klein](https://bfl.ai/models/flux-2-klein): [#4596](https://github.com/xorbitsai/inference/pull/4596)
- Built-in support for [Qwen3-ASR](https://github.com/QwenLM/Qwen3-ASR): [#4581](https://github.com/xorbitsai/inference/pull/4581)
- Built-in support for [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7): [#4565](https://github.com/xorbitsai/inference/pull/4565)
- Built-in support for [MinerU2.5-2509-1.2B](https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B): [#4569](https://github.com/xorbitsai/inference/pull/4569)
### Integrations
- [Xagent](https://github.com/xorbitsai/xagent): an enterprise agent platform for building and running AI agents with planning, memory, and tool use — not limited to rigid workflows.
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
- [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization.
- [RAGFlow](https://github.com/infiniflow/ragflow): an open-source RAG engine based on deep document understanding.
- [MaxKB](https://github.com/1Panel-dev/MaxKB): MaxKB = Max Knowledge Brain, it is a powerful and easy-to-use AI assistant that integrates Retrieval-Augmented Generation (RAG) pipelines, supports robust workflows, and provides advanced MCP tool-use capabilities.
## Key Features
🌟 **Model Serving Made Easy**: Simplify the process of serving large language, speech
recognition, and multimodal models. You can set up and deploy your models
for experimentation and production with a single command.
⚡️ **State-of-the-Art Models**: Experiment with cutting-edge built-in models using a single
command. Inference provides access to state-of-the-art open-source models!
🖥 **Heterogeneous Hardware Utilization**: Make the most of your hardware resources with
[ggml](https://github.com/ggerganov/ggml). Xorbits Inference intelligently utilizes heterogeneous
hardware, including GPUs and CPUs, to accelerate your model inference tasks.
⚙️ **Flexible API and Interfaces**: Offer multiple interfaces for interacting
with your models, supporting OpenAI compatible RESTful API (including Function Calling API), RPC, CLI
and WebUI for seamless model management and interaction.
🌐 **Distributed Deployment**: Excel in distributed deployment scenarios,
allowing the seamless distribution of model inference across multiple devices or machines.
🔌 **Built-in Integration with Third-Party Libraries**: Xorbits Inference seamlessly integrates
with popular third-party libraries including [LangChain](https://python.langchain.com/docs/integrations/providers/xinference), [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/XinferenceLocalDeployment.html#i-run-pip-install-xinference-all-in-a-terminal-window), [Dify](https://docs.dify.ai/advanced/model-configuration/xinference), and [Chatbox](https://chatboxai.app/).
## Why Xinference
| Feature | Xinference | FastChat | OpenLLM | RayLLM |
|------------------------------------------------|------------|----------|---------|--------|
| OpenAI-Compatible RESTful API | ✅ | ✅ | ✅ | ✅ |
| vLLM Integrations | ✅ | ✅ | ✅ | ✅ |
| More Inference Engines (GGML, TensorRT) | ✅ | ❌ | ✅ | ✅ |
| More Platforms (CPU, Metal) | ✅ | ✅ | ❌ | ❌ |
| Multi-node Cluster Deployment | ✅ | ❌ | ❌ | ✅ |
| Image Models (Text-to-Image) | ✅ | ✅ | ❌ | ❌ |
| Text Embedding Models | ✅ | ❌ | ❌ | ❌ |
| Multimodal Models | ✅ | ❌ | ❌ | ❌ |
| Audio Models | ✅ | ❌ | ❌ | ❌ |
| More OpenAI Functionalities (Function Calling) | ✅ | ❌ | ❌ | ❌ |
## Using Xinference
- **Self-hosting Xinference Community Edition**
Quickly get Xinference running in your environment with this [starter guide](#getting-started).
Use our [documentation](https://inference.readthedocs.io/) for further references and more in-depth instructions.
- **Xinference for enterprise / organizations**
We provide additional enterprise-centric features. [send us an email](mailto:business@xprobe.io?subject=[GitHub]Business%20License%20Inquiry) to discuss enterprise needs.
## Staying Ahead
Star Xinference on GitHub and be instantly notified of new releases.

## Getting Started
* [Docs](https://inference.readthedocs.io/en/latest/index.html)
* [Built-in Models](https://inference.readthedocs.io/en/latest/models/builtin/index.html)
* [Custom Models](https://inference.readthedocs.io/en/latest/models/custom.html)
* [Deployment Docs](https://inference.readthedocs.io/en/latest/getting_started/using_xinference.html)
* [Examples and Tutorials](https://inference.readthedocs.io/en/latest/examples/index.html)
### Jupyter Notebook
The lightest way to experience Xinference is to try our [Jupyter Notebook on Google Colab](https://colab.research.google.com/github/xorbitsai/inference/blob/main/examples/Xinference_Quick_Start.ipynb).
### Docker
Nvidia GPU users can start Xinference server using [Xinference Docker Image](https://inference.readthedocs.io/en/latest/getting_started/using_docker_image.html). Prior to executing the installation command, ensure that both [Docker](https://docs.docker.com/get-docker/) and [CUDA](https://developer.nvidia.com/cuda-downloads) are set up on your system.
```bash
docker run --name xinference -d -p 9997:9997 -e XINFERENCE_HOME=/data -v </on/your/host>:/data --gpus all xprobe/xinference:latest xinference-local -H 0.0.0.0
```
### K8s via helm
Ensure that you have GPU support in your Kubernetes cluster, then install as follows.
```
# add repo
helm repo add xinference https://xorbitsai.github.io/xinference-helm-charts
# update indexes and query xinference versions
helm repo update xinference
helm search repo xinference/xinference --devel --versions
# install xinference
helm install xinference xinference/xinference -n xinference --version 0.0.1-v<xinference_release_version>
```
For more customized installation methods on K8s, please refer to the [documentation](https://inference.readthedocs.io/en/latest/getting_started/using_kubernetes.html).
### Quick Start
Install Xinference by using pip as follows. (For more options, see [Installation page](https://inference.readthedocs.io/en/latest/getting_started/installation.html).)
```bash
pip install "xinference[all]"
```
To start a local instance of Xinference, run the following command:
```bash
$ xinference-local
```
Once Xinference is running, there are multiple ways you can try it: via the web UI, via cURL,
via the command line, or via Xinference's Python client. Check out our [docs](https://inference.readthedocs.io/en/latest/getting_started/using_xinference.html#run-xinference-locally) for the guide.

## Getting involved
| Platform | Purpose |
|-------------------------------------------------------------------------------------------------|---------------------------------------------|
| [Github Issues](https://github.com/xorbitsai/inference/issues) | Reporting bugs and filing feature requests. |
| [Discord](https://discord.gg/Xw9tszSkr5) | Collaborating with other Xinference users. |
| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. |
## Citation
If this work is helpful, please kindly cite as:
```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```
## Contributors
## Star History
[](https://star-history.com/#xorbitsai/inference&Date)
================================================
FILE: README_ja_JP.md
================================================

# Xorbits Inference: モデルサービングを簡単に 🤖
Xinference Enterprise(企業版) ·
セルフホスティング ·
ドキュメント
[](https://pypi.org/project/xinference/)
[](https://github.com/xorbitsai/inference/blob/main/LICENSE)
[](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main)
[](https://hub.docker.com/r/xprobe/xinference)
[](https://discord.gg/Xw9tszSkr5)
[](https://twitter.com/xorbitsio)
Xorbits Inference(Xinference) は、言語、音声認識、マルチモーダルモデルのために
設計された強力で汎用性の高いライブラリです。 Xorbits Inference を使えば、たった 1 つのコマンドで、
あなたや最先端のビルトインモデルを簡単にデプロイし、提供することができます。 Xorbits Inference は、
研究者、開発者、データサイエンティストを問わず、最先端の AI モデルの可能性を最大限に引き出すことができます。
## 主な特徴
🌟 **モデルサービングを簡単に**: 大規模な言語、音声認識、マルチモーダルモデルの提供プロセスを簡素化します。
1つのコマンドで、実験用と本番用のモデルをセットアップしてデプロイできます。
⚡️ **最先端モデル**: コマンド1つで最先端のビルトインモデルを実験。
Inference は、最先端のオープンソースモデルへのアクセスを提供します!
🖥 **異機種ハードウェアの利用**: [ggml](https://github.com/ggerganov/ggml) でハードウェアリソースを最大限に活用しましょう。
Xorbits Inference は、GPU や CPU を含む異種ハードウェアをインテリジェントに利用し、モデル推論タスクを高速化します。
⚙️ **柔軟な API とインターフェース**: OpenAI互換のRESTful API(Function Callingを含む)、RPC、コマンドライン、Web UIなど、
多様なインターフェースを提供し、モデルの管理と相互作用を容易にします。
🌐 **分散デプロイメント**: 分散デプロイメントのシナリオに優れており、複数のデバイスやマシンにモデルの推論をシームレスに分散させることができます。
🔌 **サードパーティライブラリとの組み込み統合**: Xorbits Inference は、[LangChain](https://python.langchain.com/docs/integrations/providers/xinference)
や [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/XinferenceLocalDeployment.html#i-run-pip-install-xinference-all-in-a-terminal-window) のような人気のあるサードパーティライブラリと
シームレスに統合されています。
## なぜ Xinference を選ぶのか
| 機能 | Xinference | FastChat | OpenLLM | RayLLM |
|------|------------|----------|---------|--------|
| OpenAI 互換の RESTful API | ✅ | ✅ | ✅ | ✅ |
| vLLM 統合 | ✅ | ✅ | ✅ | ✅ |
| その他の推論エンジン(GGML、TensorRT) | ✅ | ❌ | ✅ | ✅ |
| その他のプラットフォーム(CPU、Metal) | ✅ | ✅ | ❌ | ❌ |
| マルチノードクラスター展開 | ✅ | ❌ | ❌ | ✅ |
| 画像モデル(テキストから画像へ) | ✅ | ✅ | ❌ | ❌ |
| テキスト埋め込みモデル | ✅ | ❌ | ❌ | ❌ |
| マルチモーダルモデル | ✅ | ❌ | ❌ | ❌ |
| より多くのOpenAI機能(関数呼び出し) | ✅ | ❌ | ❌ | ❌ |
## 入門ガイド
**始める前に、GitHubで私たちにスターを付けてください。そうすると、新しいリリースの通知を即座に受け取ることができます!**
* [ドキュメント](https://inference.readthedocs.io/en/latest/index.html)
* [組み込みモデル](https://inference.readthedocs.io/en/latest/models/builtin/index.html)
* [カスタムモデル](https://inference.readthedocs.io/en/latest/models/custom.html)
* [デプロイメントドキュメント](https://inference.readthedocs.io/en/latest/getting_started/using_xinference.html)
* [例とチュートリアル](https://inference.readthedocs.io/en/latest/examples/index.html)
### Jupyter Notebook
Xinferenceを体験する最軽量な方法は、私たちの[Google Colab上のJupyterノートブック](https://colab.research.google.com/github/xorbitsai/inference/blob/main/examples/Xinference_Quick_Start.ipynb)を試すことです。
### Docker
Nvidia GPUユーザーは、[Xinference Dockerイメージ](https://inference.readthedocs.io/en/latest/getting_started/using_docker_image.html)を使用してXinferenceサーバーを開始することができます。インストールコマンドを実行する前に、システムに[Docker](https://docs.docker.com/get-docker/)と[CUDA](https://developer.nvidia.com/cuda-downloads)が設定されていることを確認してください。
### クイックスタート
以下のようにpipを使用してXinferenceをインストールします。(他のオプションについては、[インストールページ](https://inference.readthedocs.io/en/latest/getting_started/installation.html)を参照してください。)
```bash
pip install "xinference[all]"
```
ローカルインスタンスのXinferenceを開始するには、次のコマンドを実行します:
```bash
$ xinference-local
```
Xinferenceが実行されると、Web UI、cURL、コマンドライン、またはXinferenceのPythonクライアントを介して試すことができます。詳細は[ドキュメント](https://inference.readthedocs.io/en/latest/getting_started/using_xinference.html#run-xinference-locally)をご覧ください。

## 関与する
| プラットフォーム | 目的 |
|-------------------------------------------------------------------------------------------------|-----------------------|
| [Github イシュー](https://github.com/xorbitsai/inference/issues) | バグ報告と機能リクエストの提出。 |
| [Discord](https://discord.gg/Xw9tszSkr5) | 他のXinferenceユーザーとの協力。 |
| [Twitter](https://twitter.com/xorbitsio) | 新機能に関する最新情報の入手。 |
## 引用
この仕事が役立つ場合は、以下のように引用してください:
```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```
## 寄稿者
================================================
FILE: README_zh_CN.md
================================================

# Xorbits Inference:模型推理, 轻而易举 🤖
Xinference 企业版 ·
自托管 ·
文档
[](https://pypi.org/project/xinference/)
[](https://github.com/xorbitsai/inference/blob/main/LICENSE)
[](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main)
[](https://hub.docker.com/r/xprobe/xinference)
[](https://xinference.cn/images/WeCom.jpg)
[](https://www.zhihu.com/org/xorbits)
Xorbits Inference(Xinference)是一个性能强大且功能全面的分布式推理框架。可用于大语言模型(LLM),语音识别模型,多模态模型等各种模型的推理。通过 Xorbits Inference,你可以轻松地一键部署你自己的模型或内置的前沿开源模型。无论你是研究者,开发者,或是数据科学家,都可以通过 Xorbits Inference 与最前沿的 AI 模型,发掘更多可能。
## 🔥 近期热点
### 框架增强
- Agent 原生服务能力:Xinference 与 [Xagent](https://github.com/xorbitsai/xagent) 深度集成,支持动态规划、工具调用与多步自主推理,突破传统静态流程的限制。
- 自动 Batch: 多个并发请求会被自动合批处理,大幅提升吞吐量: [#4197](https://github.com/xorbitsai/inference/pull/4197)
- 支持寒武纪芯片:[#3693](https://github.com/xorbitsai/inference/pull/3693)
- [Xllamacpp](https://github.com/xorbitsai/xllamacpp): 全新llama.cpp Python binding,由 Xinference 团队维护,支持持续并行且更生产可用: [#2997](https://github.com/xorbitsai/inference/pull/2997)
- 分布式推理:在多个 worker 上运行大尺寸模型:[#2877](https://github.com/xorbitsai/inference/pull/2877)
- VLLM 引擎增强: 跨副本共享KV Cache: [#2732](https://github.com/xorbitsai/inference/pull/2732)
### 新模型
- 内置 [Qwen-3.5](https://github.com/QwenLM/Qwen3.5): [#4639](https://github.com/xorbitsai/inference/pull/4639)
- 内置 [GLM-5](https://github.com/zai-org/GLM-5): [#4638](https://github.com/xorbitsai/inference/pull/4638)
- 内置 [MiniMax-M2.5](https://github.com/MiniMax-AI/MiniMax-M2.5): [#4630](https://github.com/xorbitsai/inference/pull/4630)
- 内置 [Kimi-K2.5](https://github.com/MoonshotAI/Kimi-K2.5): [#4631](https://github.com/xorbitsai/inference/pull/4631)
- 内置 [FLUX.2-Klein](https://bfl.ai/models/flux-2-klein): [#4596](https://github.com/xorbitsai/inference/pull/4596)
- 内置 [Qwen3-ASR](https://github.com/QwenLM/Qwen3-ASR): [#4581](https://github.com/xorbitsai/inference/pull/4581)
- 内置 [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7): [#4565](https://github.com/xorbitsai/inference/pull/4565)
- 内置 [MinerU2.5-2509-1.2B](https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B): [#4569](https://github.com/xorbitsai/inference/pull/4569)
### 集成
- [Xagent](https://github.com/xorbitsai/xagent):企业级 Agent 平台,用于构建和运行具备规划、记忆与工具调用能力的智能体,不再受限于僵化的工作流。
- [FastGPT](https://doc.fastai.site/docs/development/custom-models/xinference/):一个基于 LLM 大模型的开源 AI 知识库构建平台。提供了开箱即用的数据处理、模型调用、RAG 检索、可视化 AI 工作流编排等能力,帮助您轻松实现复杂的问答场景。
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): 一个涵盖了大型语言模型开发、部署、维护和优化的 LLMOps 平台。
- [RAGFlow](https://github.com/infiniflow/ragflow): 是一款基于深度文档理解构建的开源 RAG 引擎。
- [MaxKB](https://github.com/1Panel-dev/MaxKB): MaxKB = Max Knowledge Base,是一款基于大语言模型和 RAG 的开源知识库问答系统,广泛应用于智能客服、企业内部知识库、学术研究与教育等场景。
## 主要功能
🌟 **模型推理,轻而易举**:大语言模型,语音识别模型,多模态模型的部署流程被大大简化。一个命令即可完成模型的部署工作。
⚡️ **前沿模型,应有尽有**:框架内置众多中英文的前沿大语言模型,包括 baichuan,chatglm2 等,一键即可体验!内置模型列表还在快速更新中!
🖥 **异构硬件,快如闪电**:通过 [ggml](https://github.com/ggerganov/ggml),同时使用你的 GPU 与 CPU 进行推理,降低延迟,提高吞吐!
⚙️ **接口调用,灵活多样**:提供多种使用模型的接口,包括 OpenAI 兼容的 RESTful API(包括 Function Calling),RPC,命令行,web UI 等等。方便模型的管理与交互。
🌐 **集群计算,分布协同**: 支持分布式部署,通过内置的资源调度器,让不同大小的模型按需调度到不同机器,充分使用集群资源。
🔌 **开放生态,无缝对接**: 与流行的三方库无缝对接,包括 [LangChain](https://python.langchain.com/docs/integrations/providers/xinference),[LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/XinferenceLocalDeployment.html#i-run-pip-install-xinference-all-in-a-terminal-window),[Dify](https://docs.dify.ai/advanced/model-configuration/xinference),以及 [Chatbox](https://chatboxai.app/)。
## 为什么选择 Xinference
| 功能特点 | Xinference | FastChat | OpenLLM | RayLLM |
|-------------------------|------------|----------|---------|--------|
| 兼容 OpenAI 的 RESTful API | ✅ | ✅ | ✅ | ✅ |
| vLLM 集成 | ✅ | ✅ | ✅ | ✅ |
| 更多推理引擎(GGML、TensorRT) | ✅ | ❌ | ✅ | ✅ |
| 更多平台支持(CPU、Metal) | ✅ | ✅ | ❌ | ❌ |
| 分布式集群部署 | ✅ | ❌ | ❌ | ✅ |
| 图像模型(文生图) | ✅ | ✅ | ❌ | ❌ |
| 文本嵌入模型 | ✅ | ❌ | ❌ | ❌ |
| 多模态模型 | ✅ | ❌ | ❌ | ❌ |
| 语音识别模型 | ✅ | ❌ | ❌ | ❌ |
| 更多 OpenAI 功能 (函数调用) | ✅ | ❌ | ❌ | ❌ |
## 使用 Xinference
- **自托管 Xinference 社区版**
使用 [入门指南](#getting-started) 快速在你自己的环境中运行 Xinference。
参考 [文档](https://inference.readthedocs.io/zh-cn) 以获得参考和更多说明。
- **面向企业/组织的 Xinference 版本**
我们提供额外的面向企业的功能。 [通过企业微信联系](https://xinference.cn/images/WeCom.jpg)
或 [提交表单](https://w8v6grm432.feishu.cn/share/base/form/shrcn9u1EBXQxmGMqILEjguuGoh) 讨论企业需求。
## 保持领先
在 GitHub 上给 Xinference Star,并立即收到新版本的通知。

## 入门指南
* [文档](https://inference.readthedocs.io/zh-cn/latest/index.html)
* [内置模型](https://inference.readthedocs.io/zh-cn/latest/models/builtin/index.html)
* [自定义模型](https://inference.readthedocs.io/zh-cn/latest/models/custom.html)
* [部署文档](https://inference.readthedocs.io/zh-cn/latest/getting_started/using_xinference.html)
* [示例和教程](https://inference.readthedocs.io/zh-cn/latest/examples/index.html)
### Jupyter Notebook
体验 Xinference 最轻量级的方式是使用我们 [Google Colab 上的 Jupyter Notebook](https://colab.research.google.com/github/xorbitsai/inference/blob/main/examples/Xinference_Quick_Start.ipynb)。
### Docker
Nvidia GPU 用户可以使用[Xinference Docker 镜像](https://inference.readthedocs.io/zh-cn/latest/getting_started/using_docker_image.html) 启动 Xinference 服务器。在执行安装命令之前,确保你的系统中已经安装了 [Docker](https://docs.docker.com/get-docker/) 和 [CUDA](https://developer.nvidia.com/cuda-downloads)。
### Kubernetes
确保你的 Kubernetes 集群开启了 GPU 支持,然后通过 `helm` 进行如下方式的安装。
```
# 新增xinference仓库
helm repo add xinference https://xorbitsai.github.io/xinference-helm-charts
# 更新仓库,查询可安装的版本
helm repo update xinference
helm search repo xinference/xinference --devel --versions
# 在K8s中安装xinference
helm install xinference xinference/xinference -n xinference --version 0.0.1-v<xinference_release_version>
```
更多定制化安装方式,请参考[文档](https://inference.readthedocs.io/en/latest/getting_started/using_kubernetes.html)。
### 快速开始
使用 pip 安装 Xinference,操作如下。(更多选项,请参阅[安装页面](https://inference.readthedocs.io/zh-cn/latest/getting_started/installation.html)。)
```bash
pip install "xinference[all]"
```
要启动一个本地的 Xinference 实例,请运行以下命令:
```bash
$ xinference-local
```
一旦 Xinference 运行起来,你可以通过多种方式尝试它:通过网络界面、通过 cURL、通过命令行或通过 Xinference 的 Python 客户端。更多指南,请查看我们的[文档](https://inference.readthedocs.io/zh-cn/latest/getting_started/using_xinference.html#run-xinference-locally)。

## 参与其中
| 平台 | 目的 |
|-------------------------------------------------------------------------------------------------|----------------------|
| [Github 问题](https://github.com/xorbitsai/inference/issues) | 报告错误和提交功能请求。 |
| [Discord](https://discord.gg/Xw9tszSkr5) | 与其他 Xinference 用户合作。 |
| [Twitter](https://twitter.com/xorbitsio) | 及时了解新功能。 |
| [微信社群](https://xinference.cn/images/WeCom.jpg) | 与其他 Xinference 用户交流。 |
| [知乎](https://zhihu.com/org/xorbits) | 了解团队最新的进展。 |
## 引用
如果您觉得此项目有帮助,请以如下格式引用我们:
```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```
## 合作
* [琶洲实验室 | 黄埔](https://www.pazhoulab-huangpu.com/#/)
## 贡献者
## Star 历史
[](https://star-history.com/#xorbitsai/inference&Date)
================================================
FILE: benchmark/README.md
================================================
# Benchmarking Xinference
## Downloading the ShareGPT dataset
You can download the dataset by running:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
## Benchmarking latency
This tool samples prompts from the dataset and runs the benchmark by sending the requests one at a time (serially).
```bash
python benchmark_latency.py --dataset /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
--tokenizer /path/to/tokenizer \
--num-prompts 100 \
--model-uid ${model_uid}
```
## Benchmarking serving
This tool will sample prompts from dataset, and run benchmark with parallel requests.
```bash
python benchmark_serving.py --dataset /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
--tokenizer /path/to/tokenizer \
--model-uid ${model_uid} \
--num-prompt 100 --concurrency 50
```
## Benchmarking long context serving
This tool will generate long prompts to sort random numbers, according to specified context length.
```
python benchmark/benchmark_long.py --context-length ${context_length} --tokenizer /path/to/tokenizer \
--model-uid ${model_uid} \
--num-prompts 32 -c 16
```
## Common Options for Benchmarking Tools
- `--stream`. Enables streaming responses, so that output is received incrementally instead of waiting for the complete response.
- `--print-error`. Prints detailed error messages for any errors encountered during execution, which helps with troubleshooting.
These options are available for use in all benchmarking tools provided in this suite, enhancing flexibility and providing essential debugging information.
================================================
FILE: benchmark/benchmark_embedding.py
================================================
# Copyright 2022-2025 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import logging
import random
import time
import aiohttp
from typing import List, Dict, Optional
from datasets import load_dataset
import numpy as np
from benchmark_runner import ConcurrentBenchmarkRunner
# Configure the root logger at INFO level so the benchmark's progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class EmbeddingBenchmarkRunner(ConcurrentBenchmarkRunner):
    """Concurrent load generator for an embeddings endpoint.

    Every worker repeatedly posts the ``"sentence"`` field of a dataset row as
    the embedding ``input`` and, for each HTTP 200 response, appends the
    wall-clock latency (in seconds) to ``self.outputs``.
    """

    def __init__(
        self,
        api_url: str,
        model_uid: str,
        input_requests: List[Dict],
        stream: bool,
        concurrency: int,
        api_key: Optional[str] = None,
        print_error: bool = False,
    ):
        """Forward all arguments unchanged to ``ConcurrentBenchmarkRunner``.

        Args:
            api_url: Full URL the requests are posted to.
            model_uid: Value sent as the ``"model"`` field of each payload.
            input_requests: Dataset rows; each must contain a ``"sentence"`` key.
            stream: Passed through to the parent runner.
            concurrency: Number of worker tasks started by ``_run``.
            api_key: Optional bearer token added as an ``Authorization`` header.
            print_error: Passed through to the parent runner.
        """
        super().__init__(
            api_url,
            model_uid,
            input_requests,
            stream,
            concurrency,
            api_key,
            print_error,
        )

    async def _run(self):
        """Start ``self.concurrency`` workers and return when the first one finishes."""
        tasks = []
        for i in range(self.concurrency):
            tasks.append(asyncio.create_task(self.worker(i)))
        # FIRST_COMPLETED: stop waiting as soon as any worker observes that no
        # requests are left, rather than waiting for every worker to drain.
        await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)

    async def worker(self, i: int):
        """Send requests in a loop until the shared ``self.left`` counter reaches zero.

        Args:
            i: Worker index; used as the seed for this worker's starting offset
                so that different workers begin at different dataset rows.
        """
        # NOTE(review): ``self.left`` is not assigned in this class; it is
        # presumably initialised by ConcurrentBenchmarkRunner - confirm.
        r = random.Random(i)
        index = r.randint(0, len(self.input_requests) - 1)
        while self.left > 0:
            request = self.input_requests[index]
            # Advance cyclically through the dataset.
            index += 1
            index = index % len(self.input_requests)
            await self.send_request(request)
            # The counter is decremented only after the request has completed.
            self.left -= 1
            # print longer space to overwrite the previous when left decrease
            print("\rdone_request, left %d " % (self.left), end="")
        # The last one
        print("")

    async def send_request(self, request, warming_up: bool = False):
        """Post one embedding request and record its latency on success.

        Args:
            request: Dataset row; its ``"sentence"`` value is sent as ``input``.
            warming_up: When true, the latency is measured but not recorded.
        """
        # Named ``sentence`` rather than ``input`` to avoid shadowing the builtin.
        sentence = request["sentence"]
        request_start_time = time.time()

        pload = {
            "model": self.model_uid,
            "input": sentence,
        }

        headers = {"User-Agent": "Benchmark Client"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        timeout = aiohttp.ClientTimeout(total=3 * 3600)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(
                self.api_url, headers=headers, json=pload
            ) as response:
                # The body is read before the end timestamp is taken, so the
                # latency covers receiving the full response.
                resp = await response.json()
                if response.status == 200:
                    request_end_time = time.time()
                    request_latency = request_end_time - request_start_time
                    if not warming_up:
                        self.outputs.append(request_latency)
                else:
                    # This runner issues embedding requests, not chat completions.
                    logger.error(f"Failed to create embedding: {resp}")
def main(args: argparse.Namespace):
    """Run the embedding stress test described by ``args`` and print a summary.

    Loads the ``test`` split of ``args.dataset`` / ``args.subset``, optionally
    truncates it to ``args.num_query`` rows, runs the benchmark against
    ``http://{host}:{port}/v1/embeddings`` and prints total time and throughput.

    Args:
        args: Parsed command-line options. ``args.num_query`` is overwritten
            with the number of rows actually used.
    """
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
    model_uid = args.model_uid

    logger.info("Preparing for benchmark.")
    dataset = load_dataset(args.dataset, args.subset)
    input_requests = dataset["test"].to_list()
    if args.num_query > 0:
        input_requests = input_requests[: args.num_query]
    # Always record the number of rows actually selected: when --num-query
    # exceeds the size of the split, the slice is shorter than requested and
    # using the requested value would overstate the reported throughput.
    # NOTE(review): assumes the runner issues one request per selected row -
    # confirm against ConcurrentBenchmarkRunner.
    args.num_query = len(input_requests)

    logger.info("Benchmark starts.")

    benchmark = EmbeddingBenchmarkRunner(
        api_url,
        model_uid,
        input_requests,
        args.stream,
        concurrency=args.concurrency,
        api_key=args.api_key,
        print_error=args.print_error,
    )
    asyncio.run(benchmark.run())

    # TODO: Print the results of request_latency in detail.
    # benchmark.print_stats() needs to be overridden
    print(f"Total time: {benchmark.benchmark_time:.2f} s")
    print(f"Throughput: {args.num_query / benchmark.benchmark_time:.2f} requests/s")
if __name__ == "__main__":
    # Command-line entry point: declare the options as data, register them in
    # order, then hand the parsed namespace to main().
    parser = argparse.ArgumentParser(description="Stress test the embedding model.")

    cli_options = [
        # Server location.
        (("--host",), dict(type=str, default="localhost")),
        (("--port",), dict(type=int, default=9997)),
        # Dataset selection.
        (
            ("--dataset",),
            dict(type=str, default="clue", help="Name to the dataset."),
        ),
        (
            ("--subset",),
            dict(type=str, default="tnews", help="Subset to the dataset."),
        ),
        # Load shape.
        (
            ("--concurrency", "-c"),
            dict(
                type=int,
                default=256,
                help="Set the concurrency of request to send",
            ),
        ),
        (
            ("--num-query", "-q"),
            dict(
                type=int,
                default=-1,
                help="Set the query dataset count, default is all",
            ),
        ),
        (
            ("--trust-remote-code",),
            dict(action="store_true", help="Trust remote code from huggingface."),
        ),
        # Target model and reproducibility.
        (
            ("--model-uid",),
            dict(type=str, required=True, help="Xinference model UID."),
        ),
        (("--seed",), dict(type=int, default=0)),
        (
            ("--stream",),
            dict(action="store_true", help="Enable streaming responses."),
        ),
        (
            ("--api-key",),
            dict(type=str, default=None, help="Authorization api key"),
        ),
        (
            ("--print-error",),
            dict(
                action="store_true",
                help="Print detailed error messages if any errors encountered.",
            ),
        ),
    ]
    for flags, options in cli_options:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()
    main(args)
================================================
FILE: benchmark/benchmark_latency.py
================================================
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import logging
import random
import numpy as np
from utils import get_tokenizer, sample_requests
from benchmark_runner import BenchmarkRunner
# Configure the root logger at INFO level so the benchmark's progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class LatencyBenchmarkRunner(BenchmarkRunner):
    """Benchmark runner that issues its requests strictly one after another."""

    async def _run(self):
        """Send every entry of ``self.input_requests`` in order, awaiting each.

        A single-line progress indicator is rewritten in place after each
        request, followed by a final newline.
        """
        total_requests = len(self.input_requests)
        for i in range(total_requests):
            # Each request is awaited before the next one is sent.
            await self.send_request(self.input_requests[i])
            remaining = total_requests - (i + 1)
            print(
                f"\rProcessed {i + 1}/{total_requests} requests, {remaining} remaining.",
                end="",
            )
        # Terminate the in-place progress line.
        print("")
def main(args: argparse.Namespace):
    """Run the sequential chat-completion latency benchmark described by ``args``.

    Seeds the ``random`` and NumPy generators, samples ``args.num_prompts``
    requests from ``args.dataset`` using the given tokenizer, sends them to
    ``http://{host}:{port}/v1/chat/completions`` and prints the statistics.
    """
    print(args)

    # Seed both random sources with the same value.
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(args.seed)

    logger.info("Preparing for benchmark.")
    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

    logger.info("Benchmark starts.")
    benchmark = LatencyBenchmarkRunner(
        f"http://{args.host}:{args.port}/v1/chat/completions",
        args.model_uid,
        input_requests,
        args.stream,
        args.api_key,
        args.print_error,
    )
    asyncio.run(benchmark.run())
    benchmark.print_stats()
if __name__ == "__main__":
    # Command-line entry point for the latency benchmark.
    parser = argparse.ArgumentParser(
        description="Benchmark the latency of processing a single batch of requests."
    )
    # Server location.
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=9997)
    # Prompt source and tokenizer.
    parser.add_argument(
        "--dataset", type=str, required=True, help="Path to the dataset."
    )
    parser.add_argument(
        "--tokenizer", type=str, required=True, help="Name or path of the tokenizer."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=100, help="Number of prompts to process."
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from huggingface.",
    )
    # Required, as in the embedding and rerank benchmarks: without it every
    # request would be sent with "model": None, so fail at parse time instead.
    parser.add_argument(
        "--model-uid", type=str, required=True, help="Xinference model UID."
    )
    parser.add_argument(
        "--stream", action="store_true", help="Enable streaming responses."
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="Authorization api key",
    )
    parser.add_argument(
        "--print-error",
        action="store_true",
        help="Print detailed error messages if any errors encountered.",
    )
    args = parser.parse_args()
    main(args)
================================================
FILE: benchmark/benchmark_long.py
================================================
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import logging
import random
import numpy as np
from utils import generate_sorting_prompts, get_tokenizer
from benchmark_runner import ConcurrentBenchmarkRunner
# Configure the root logger at INFO level so the benchmark's progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class LongBenchmarkRunner(ConcurrentBenchmarkRunner):
    """Concurrent runner used by the long-context benchmark."""

    async def _run(self):
        """Launch one task per worker and return once the first task completes."""
        workers = [
            asyncio.create_task(self.worker(worker_id))
            for worker_id in range(self.concurrency)
        ]
        await asyncio.wait(workers, return_when=asyncio.FIRST_COMPLETED)

    async def worker(self, i: int):
        """Cycle through the prompts, sending requests while ``self.left`` is positive.

        Args:
            i: Worker index; seeds the choice of this worker's first prompt.
        """
        # Per-worker RNG so the starting position depends only on the index.
        cursor = random.Random(i).randint(0, len(self.input_requests) - 1)
        while self.left > 0:
            request = self.input_requests[cursor]
            # Wrap around to the first prompt after the last one.
            cursor = (cursor + 1) % len(self.input_requests)
            await self.send_request(request)
            self.left -= 1
            # Rewrite the progress line in place; the trailing space clears
            # leftover characters when the number gets shorter.
            print("\rdone_request, left %d " % (self.left), end="")
        # Finish the progress line.
        print("")
def main(args: argparse.Namespace):
    """Run the long-context chat-completion benchmark described by ``args``.

    Caps ``args.concurrency`` at ``args.num_prompts``, seeds the ``random`` and
    NumPy generators, generates sorting prompts with the given tokenizer, runs
    them against ``http://{host}:{port}/v1/chat/completions`` and prints the
    statistics.
    """
    # Never use more workers than the requested number of prompts.
    if args.num_prompts < args.concurrency:
        print("Fix concurrency with num_prompts %d" % (args.num_prompts))
        args.concurrency = args.num_prompts
    print(args)

    for seed_rng in (random.seed, np.random.seed):
        seed_rng(args.seed)

    logger.info("Preparing for benchmark.")
    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)

    # XXX: generate_sorting_prompts() currently only generates prompts of 1/2 to
    # 2/3 of context_length, because tokenizers vary by model; consider
    # improving this in the future.
    # NOTE(review): the first argument is args.concurrency, not
    # args.num_prompts - confirm that generating only that many prompts is
    # intended.
    # NOTE(review): this value is a float (true division); presumably a
    # prompt-length bound - confirm against utils.generate_sorting_prompts.
    length_arg = args.context_length / 2 - 20
    input_requests = generate_sorting_prompts(
        args.concurrency, args.context_length, length_arg, tokenizer
    )

    logger.info("Benchmark starts.")
    benchmark = LongBenchmarkRunner(
        f"http://{args.host}:{args.port}/v1/chat/completions",
        args.model_uid,
        input_requests,
        args.stream,
        concurrency=args.concurrency,
        api_key=args.api_key,
        print_error=args.print_error,
    )
    asyncio.run(benchmark.run())
    benchmark.print_stats()
if __name__ == "__main__":
    # Command-line entry point for the long-context benchmark.
    parser = argparse.ArgumentParser(
        description="Benchmark the online serving throughput with long context."
    )
    # Server location.
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=9997)
    # Prompt generation.
    parser.add_argument(
        "--tokenizer", type=str, required=True, help="Name or path of the tokenizer."
    )
    parser.add_argument(
        "--context-length", type=int, default=32768, help="model context_length."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=16, help="Number of prompts to process."
    )
    parser.add_argument(
        "--concurrency",
        "-c",
        type=int,
        default=16,
        help="Set the concurrency of request to send",
    )
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from huggingface.",
    )
    # Required, as in the embedding and rerank benchmarks: without it every
    # request would be sent with "model": None, so fail at parse time instead.
    parser.add_argument(
        "--model-uid", type=str, required=True, help="Xinference model UID."
    )
    parser.add_argument(
        "--api-key", type=str, default=None, help="Authorization api key",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--stream", action="store_true", help="Enable streaming responses."
    )
    parser.add_argument(
        "--print-error",
        action="store_true",
        help="Print detailed error messages if any errors encountered.",
    )
    args = parser.parse_args()
    main(args)
================================================
FILE: benchmark/benchmark_rerank.py
================================================
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import logging
import random
import time
import aiohttp
from typing import List, Dict, Optional
from datasets import load_dataset
import numpy as np
from benchmark_runner import ConcurrentBenchmarkRunner
# Configure the root logger at INFO level so the benchmark's progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class RerankBenchmarkRunner(ConcurrentBenchmarkRunner):
    """Concurrent load generator for a rerank endpoint.

    Every worker repeatedly posts a dataset row's ``"query"`` together with its
    ``"positive"`` documents and, for each HTTP 200 response, appends the
    wall-clock latency (in seconds) to ``self.outputs``.
    """

    def __init__(
        self,
        api_url: str,
        model_uid: str,
        input_requests: List[Dict],
        stream: bool,
        top_n: int,
        concurrency: int,
        api_key: Optional[str] = None,
        print_error: bool = False,
    ):
        """Initialise the parent runner and remember ``top_n``.

        Args:
            api_url: Full URL the requests are posted to.
            model_uid: Value sent as the ``"model"`` field of each payload.
            input_requests: Dataset rows; each must contain ``"query"`` and
                ``"positive"`` keys.
            stream: Passed through to the parent runner.
            top_n: Value sent as the ``"top_n"`` field of each payload.
            concurrency: Number of worker tasks started by ``_run``.
            api_key: Optional bearer token added as an ``Authorization`` header.
            print_error: Passed through to the parent runner.
        """
        super().__init__(
            api_url,
            model_uid,
            input_requests,
            stream,
            concurrency,
            api_key,
            print_error,
        )
        self.top_n = top_n

    async def _run(self):
        """Start ``self.concurrency`` workers and return when the first one finishes."""
        tasks = []
        for i in range(self.concurrency):
            tasks.append(asyncio.create_task(self.worker(i)))
        # FIRST_COMPLETED: stop waiting as soon as any worker observes that no
        # requests are left, rather than waiting for every worker to drain.
        await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)

    async def worker(self, i: int):
        """Send requests in a loop until the shared ``self.left`` counter reaches zero.

        Args:
            i: Worker index; used as the seed for this worker's starting offset
                so that different workers begin at different dataset rows.
        """
        # NOTE(review): ``self.left`` is not assigned in this class; it is
        # presumably initialised by ConcurrentBenchmarkRunner - confirm.
        r = random.Random(i)
        index = r.randint(0, len(self.input_requests) - 1)
        while self.left > 0:
            request = self.input_requests[index]
            # Advance cyclically through the dataset.
            index += 1
            index = index % len(self.input_requests)
            await self.send_request(request)
            # The counter is decremented only after the request has completed.
            self.left -= 1
            # print longer space to overwrite the previous when left decrease
            print("\rdone_request, left %d " % (self.left), end="")
        # The last one
        print("")

    async def send_request(self, request, warming_up: bool = False):
        """Post one rerank request and record its latency on success.

        Args:
            request: Dataset row supplying ``"query"`` and ``"positive"``.
            warming_up: When true, the latency is measured but not recorded.
        """
        prompt, documents = request["query"], request["positive"]
        request_start_time = time.time()

        pload = {
            "model": self.model_uid,
            "top_n": self.top_n,
            "query": prompt,
            "documents": documents,
        }

        headers = {"User-Agent": "Benchmark Client"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        timeout = aiohttp.ClientTimeout(total=3 * 3600)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(
                self.api_url, headers=headers, json=pload
            ) as response:
                # The body is read before the end timestamp is taken, so the
                # latency covers receiving the full response.
                resp = await response.json()
                if response.status == 200:
                    request_end_time = time.time()
                    request_latency = request_end_time - request_start_time
                    if not warming_up:
                        self.outputs.append(request_latency)
                else:
                    # This runner issues rerank requests, not chat completions.
                    logger.error(f"Failed to rerank documents: {resp}")
def main(args: argparse.Namespace):
    """Run the rerank stress test described by ``args`` and print a summary.

    Loads the ``test`` split of ``args.dataset`` without its ``negative``
    column, optionally truncates it to ``args.num_query`` rows, runs the
    benchmark against ``http://{host}:{port}/v1/rerank`` and prints total time
    and throughput.

    Args:
        args: Parsed command-line options. ``args.num_query`` is overwritten
            with the number of rows actually used.
    """
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    api_url = f"http://{args.host}:{args.port}/v1/rerank"
    model_uid = args.model_uid

    logger.info("Preparing for benchmark.")
    dataset = load_dataset(args.dataset)
    input_requests = dataset["test"].remove_columns("negative").to_list()
    if args.num_query > 0:
        input_requests = input_requests[: args.num_query]
    # Always record the number of rows actually selected: when --num-query
    # exceeds the size of the split, the slice is shorter than requested and
    # using the requested value would overstate the reported throughput.
    # NOTE(review): assumes the runner issues one request per selected row -
    # confirm against ConcurrentBenchmarkRunner.
    args.num_query = len(input_requests)

    logger.info("Benchmark starts.")

    benchmark = RerankBenchmarkRunner(
        api_url,
        model_uid,
        input_requests,
        args.stream,
        top_n=args.top_n,
        concurrency=args.concurrency,
        api_key=args.api_key,
        print_error=args.print_error,
    )
    asyncio.run(benchmark.run())

    # TODO: Print the results of request_latency in detail.
    # benchmark.print_stats() needs to be overridden
    print(f"Total time: {benchmark.benchmark_time:.2f} s")
    print(f"Throughput: {args.num_query / benchmark.benchmark_time:.2f} requests/s")
if __name__ == "__main__":
    # Command-line entry point: declare the options as data, register them in
    # order, then hand the parsed namespace to main().
    parser = argparse.ArgumentParser(description="Stress test the rerank model.")

    cli_options = [
        # Server location.
        (("--host",), dict(type=str, default="localhost")),
        (("--port",), dict(type=int, default=9997)),
        # Dataset selection.
        (
            ("--dataset",),
            dict(
                type=str,
                default="mteb/scidocs-reranking",
                help="Path to the dataset.",
            ),
        ),
        # Load shape.
        (
            ("--concurrency", "-c"),
            dict(
                type=int,
                default=16,
                help="Set the concurrency of request to send",
            ),
        ),
        (
            ("--top-n", "-n"),
            dict(type=int, default=5, help="Set the top n to the rerank"),
        ),
        (
            ("--num-query", "-q"),
            dict(
                type=int,
                default=-1,
                help="Set the query dataset count, default is all",
            ),
        ),
        (
            ("--trust-remote-code",),
            dict(action="store_true", help="Trust remote code from huggingface."),
        ),
        # Target model and reproducibility.
        (
            ("--model-uid",),
            dict(type=str, required=True, help="Xinference model UID."),
        ),
        (("--seed",), dict(type=int, default=0)),
        (
            ("--stream",),
            dict(action="store_true", help="Enable streaming responses."),
        ),
        (
            ("--api-key",),
            dict(type=str, default=None, help="Authorization api key"),
        ),
        (
            ("--print-error",),
            dict(
                action="store_true",
                help="Print detailed error messages if any errors encountered.",
            ),
        ),
    ]
    for flags, options in cli_options:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()
    main(args)
================================================
FILE: benchmark/benchmark_runner.py
================================================
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import aiohttp
import json
import sys
import traceback
import warnings
import logging
from dataclasses import dataclass, field
import time
from typing import List, Optional, Tuple
import numpy as np
# Configure the root logger at INFO level so the benchmark's progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Shared aiohttp client timeout with a total budget of 3 hours (3 * 3600 seconds).
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=3 * 3600)
def remove_prefix(text: str, prefix: str) -> str:
    """Drop a leading ``prefix`` from ``text`` (if present) and trim whitespace.

    The result is always stripped of surrounding whitespace, whether or not
    the prefix was found.
    """
    remainder = text[len(prefix) :] if text.startswith(prefix) else text
    return remainder.strip()
@dataclass
class RequestOutput:
    """Outcome and timing measurements recorded for a single benchmark request."""

    # Whether the request finished and its result was recorded successfully.
    success: bool = False
    # Prompt length taken from the request tuple.
    prompt_len: int = 0
    # Completion token count read from the response's "usage" section.
    completion_tokens: int = 0
    # Seconds elapsed between sending the request and the end of the response.
    latency: float = 0.0
    # Seconds until the first streamed chunk arrived (streaming mode).
    ttft: float = 0.0
    # Inter-token latencies: gaps, in seconds, between consecutive streamed chunks.
    itl: List[float] = field(default_factory=list)
    # Error details for a failed request, when any were captured.
    error: str = ""
class BenchmarkRunner:
    """Send chat-completion requests to an endpoint and report timing statistics.

    Subclasses implement ``_run`` to decide how the requests are scheduled.
    This base class provides the warm-up phase, the single-request sender and
    the summary printer.
    """

    def __init__(
        self,
        api_url: str,
        model_uid: str,
        input_requests: List[Tuple[str, int, int]],
        stream: bool,
        api_key: Optional[str] = None,
        print_error: bool = False,
    ):
        """Store the benchmark configuration.

        Args:
            api_url: URL every request is POSTed to.
            model_uid: Value sent as the ``model`` field of each payload.
            input_requests: ``(prompt, prompt_len, output_len)`` tuples.
            stream: Whether streaming responses are requested.
            api_key: Optional key sent as a ``Bearer`` authorization header.
            print_error: Whether ``print_stats`` prints per-request errors.
        """
        self.api_url = api_url
        self.model_uid = model_uid
        self.input_requests = input_requests
        self.outputs: List[RequestOutput] = []
        # Wall-clock duration of ``_run`` in seconds; set by ``run``.
        self.benchmark_time = None
        self.stream = stream
        self.api_key = api_key
        self.print_error = print_error

    async def run(self):
        """Warm up, then execute ``_run`` and record how long it took."""
        await self.warm_up()
        start_time = time.time()
        await self._run()
        end_time = time.time()
        self.benchmark_time = end_time - start_time

    async def warm_up(self, num_requests: int = 5):
        """Send up to ``num_requests`` requests whose results are discarded."""
        logger.info("Warming up...")
        for i in range(min(num_requests, len(self.input_requests))):
            request = self.input_requests[i]
            await self.send_request(request, warming_up=True)
        logger.info("Warm-up completed.")

    async def _run(self):
        """Scheduling hook; the base implementation does nothing."""
        pass

    def _build_payload(self, prompt: str, output_len: int) -> dict:
        """Build the JSON body for one chat-completion request."""
        pload = {
            "model": self.model_uid,
            "n": 1,
            "temperature": 0.6,
            "top_p": 0.9,
            "max_tokens": output_len,
            "stream": bool(self.stream),
            "messages": [{"role": "user", "content": prompt}],
        }
        if self.stream:
            # Ask the server to attach token usage to the stream so the
            # completion token count can be read back.
            pload["stream_options"] = {"include_usage": True}
        return pload

    async def _consume_stream(self, response, output: "RequestOutput", st: float):
        """Read a streamed response and fill ``output`` with its timings.

        Each non-empty line is expected to look like ``data: <json>`` or
        ``data: [DONE]``. Raises ``ValueError`` when no chunk carried usage
        statistics; the caller records that as a failed request.
        """
        latency = None
        usage = None
        most_recent_timestamp = st
        async for chunk_bytes in response.content:
            chunk_bytes = chunk_bytes.strip()
            if not chunk_bytes:
                continue
            chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data:")
            if chunk == "[DONE]":
                latency = time.perf_counter() - st
                continue
            timestamp = time.perf_counter()
            data = json.loads(chunk)
            if output.ttft == 0.0:
                # First chunk: time to first token.
                output.ttft = timestamp - st
            else:
                # Decoding phase: gap since the previous chunk.
                output.itl.append(timestamp - most_recent_timestamp)
            most_recent_timestamp = timestamp
            # Remember usage from whichever chunk carries it instead of
            # assuming it is on the very last chunk before "[DONE]".
            if isinstance(data, dict) and data.get("usage"):
                usage = data["usage"]
        if usage is None:
            raise ValueError(
                "The stream ended without a chunk carrying usage statistics."
            )
        # A stream that closes without "[DONE]" is timed up to its end
        # rather than failing on an unset latency.
        output.latency = latency if latency is not None else time.perf_counter() - st
        output.completion_tokens = usage["completion_tokens"]
        output.success = True

    async def send_request(self, request: tuple, warming_up: bool = False):
        """POST one request and record its outcome.

        Args:
            request: ``(prompt, prompt_len, output_len)`` tuple.
            warming_up: When true the result is not appended to ``self.outputs``.

        Failures never propagate: exceptions and non-200 responses are stored
        in the output's ``error`` field with ``success`` left false.
        """
        prompt, prompt_len, output_len = request
        pload = self._build_payload(prompt, output_len)

        headers = {"User-Agent": "Benchmark Client"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
            output = RequestOutput(prompt_len=prompt_len)
            st = time.perf_counter()
            try:
                async with session.post(
                    self.api_url, headers=headers, json=pload
                ) as response:
                    if response.status != 200:
                        # Keep the server's explanation so --print-error has
                        # something to show for rejected requests.
                        body = await response.text()
                        output.error = f"HTTP {response.status}: {body}"
                    elif self.stream:
                        await self._consume_stream(response, output, st)
                    else:
                        resp = await response.json()
                        output.latency = time.perf_counter() - st
                        output.completion_tokens = resp["usage"]["completion_tokens"]
                        output.success = True
            except Exception:
                output.success = False
                exc_info = sys.exc_info()
                output.error = "".join(traceback.format_exception(*exc_info))

        if not warming_up:
            self.outputs.append(output)

    @staticmethod
    def _print_latency_section(title: str, label: str, samples: list):
        """Print mean/median/std/P99 of ``samples`` (seconds) in milliseconds."""
        print("{s:{c}^{n}}".format(s=title, n=50, c="-"))
        statistics = (
            ("Mean", np.mean),
            ("Median", np.median),
            ("Std", np.std),
            ("P99", lambda values: np.percentile(values, 99)),
        )
        for stat_name, compute in statistics:
            value = compute(samples) * 1000 if samples else 0
            print("{:<40} {:<10.4f}".format(f"{stat_name} {label} (ms):", value))

    def print_stats(self):
        """Print a summary table of the recorded outputs to stdout.

        Only successful requests contribute to token totals, throughput and
        latency figures. A zero or unset benchmark duration yields zero
        throughput instead of an exception.
        """
        total_time = self.benchmark_time or 0.0

        succeeded = [output for output in self.outputs if output.success]
        completed = len(succeeded)
        total_input = sum(output.prompt_len for output in succeeded)
        total_output = sum(output.completion_tokens for output in succeeded)

        if completed == 0:
            warnings.warn(
                "All requests failed. This is likely due to a misconfiguration "
                "on the benchmark arguments.",
                stacklevel=2,
            )

        def per_second(amount):
            return amount / total_time if total_time > 0 else 0

        # Header shared by both modes. Request throughput counts successful
        # requests only, in both streaming and non-streaming mode.
        print("{s:{c}^{n}}".format(s=" Benchmark Result ", n=50, c="="))
        print("{:<40} {:<10}".format("Successful requests:", completed))
        print("{:<40} {:<10.2f}".format("Benchmark duration (s):", total_time))
        print("{:<40} {:<10}".format("Total input tokens:", total_input))
        print("{:<40} {:<10}".format("Total generated tokens:", total_output))
        print(
            "{:<40} {:<10.2f}".format(
                "Request throughput (req/s):", per_second(completed)
            )
        )
        print(
            "{:<40} {:<10.2f}".format(
                "Input token throughput (tok/s):", per_second(total_input)
            )
        )
        print(
            "{:<40} {:<10.2f}".format(
                "Output token throughput (tok/s):", per_second(total_output)
            )
        )

        if self.stream:
            ttfts = [output.ttft for output in succeeded]
            itls = [gap for output in succeeded for gap in output.itl]
            # Time per output token excludes the first token, so it needs at
            # least two completion tokens.
            tpots = [
                (output.latency - output.ttft) / (output.completion_tokens - 1)
                for output in succeeded
                if output.completion_tokens > 1
            ]
            self._print_latency_section("Time to First Token", "TTFT", ttfts)
            self._print_latency_section(
                "Time per Output Token (excl. 1st token)", "TPOT", tpots
            )
            self._print_latency_section("Inter-token Latency", "ITL", itls)
            print("=" * 50)
        else:
            latencies = [output.latency for output in succeeded]
            per_token_latencies = [
                output.latency / (output.prompt_len + output.completion_tokens)
                for output in succeeded
                if output.prompt_len + output.completion_tokens > 0
            ]
            per_output_token_latencies = [
                output.latency / output.completion_tokens
                for output in succeeded
                if output.completion_tokens > 0
            ]
            mean_latency = np.mean(latencies) if latencies else 0
            mean_per_token_latency = (
                np.mean(per_token_latencies) if per_token_latencies else 0
            )
            mean_per_output_token_latency = (
                np.mean(per_output_token_latencies)
                if per_output_token_latencies
                else 0
            )
            print("{s:{c}^{n}}".format(s="Latency Statistics", n=50, c="-"))
            print("{:<40} {:<10.4f}".format("Mean latency (s):", mean_latency))
            print(
                "{:<40} {:<10.4f}".format(
                    "Mean latency per token (s):", mean_per_token_latency
                )
            )
            print(
                "{:<40} {:<10.4f}".format(
                    "Mean latency per output token (s):",
                    mean_per_output_token_latency,
                )
            )
            print("=" * 50)
            # NOTE(review): assumed to belong to the non-streaming branch;
            # confirm against the intended layout. This line counts every
            # recorded request, successful or not.
            print(f"Total time: {total_time:.2f} s")
            print(f"Throughput: {per_second(len(self.outputs)):.2f} requests/s")

        if completed < len(self.input_requests):
            if self.print_error:
                logger.info("Errors encountered during benchmark:")
                for output in self.outputs:
                    if not output.success:
                        print(f"Error for prompt with length {output.prompt_len}: {output.error}")
            else:
                logger.info(
                    "Errors were encountered during the benchmark. Run with --print-error to see detailed error messages."
                )
class ConcurrentBenchmarkRunner(BenchmarkRunner):
    """Benchmark runner that also tracks a concurrency level and a pending count."""

    def __init__(
        self,
        api_url: str,
        model_uid: str,
        input_requests: List[Tuple[str, int, int]],
        stream: bool,
        concurrency: int,
        api_key: Optional[str] = None,
        print_error: bool = False,
    ):
        """Forward the shared settings to the base class and record concurrency.

        ``left`` starts at the number of input requests.
        """
        super().__init__(
            api_url,
            model_uid,
            input_requests,
            stream,
            api_key=api_key,
            print_error=print_error,
        )
        self.left = len(input_requests)
        self.concurrency = concurrency

    async def worker(self):
        """Worker hook; intentionally a no-op in this class."""
================================================
FILE: benchmark/benchmark_serving.py
================================================
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import logging
import random
from typing import List, Tuple, Optional
import numpy as np
from utils import sample_requests, get_tokenizer
from benchmark_runner import ConcurrentBenchmarkRunner
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ServingBenchmarkRunner(ConcurrentBenchmarkRunner):
    """Runner whose workers pull requests from a shared queue.

    ``concurrency`` workers run at once. When ``request_rate`` is finite,
    each worker pauses for an exponentially distributed interval after every
    request it sends.
    """

    def __init__(
        self,
        api_url: str,
        model_uid: str,
        input_requests: List[Tuple[str, int, int]],
        stream: bool,
        concurrency: int,
        request_rate: float,
        api_key: Optional[str] = None,
        print_error: bool = False,
    ):
        super().__init__(
            api_url,
            model_uid,
            input_requests,
            stream,
            concurrency,
            api_key,
            print_error,
        )
        self.request_rate = request_rate
        self.queue = None  # delay the creation of the queue

    async def _run(self):
        """Start the workers and return once the first one finishes.

        A worker only leaves its loop when no requests are left, so the first
        completion means the benchmark is over. The remaining workers are
        cancelled explicitly instead of being left blocked on the queue, and
        an exception raised inside a finished worker is re-raised rather than
        silently ending the benchmark early.
        """
        tasks = [
            asyncio.create_task(self.worker()) for _ in range(self.concurrency)
        ]
        if not tasks:
            # asyncio.wait() rejects an empty collection.
            return
        done, pending = await asyncio.wait(
            tasks, return_when=asyncio.FIRST_COMPLETED
        )
        for task in pending:
            task.cancel()
        await asyncio.gather(*pending, return_exceptions=True)
        for task in done:
            task.result()

    async def warm_up(self, num_requests: int = 5):
        """Fill the queue with every request (first call only), then warm up."""
        if self.queue is None:
            self.queue = asyncio.Queue(len(self.input_requests))
            logger.info(f"Enqueuing {len(self.input_requests)} requests.")
            for req in self.input_requests:
                await self.queue.put(req)
        await super().warm_up(num_requests)

    async def worker(self):
        """Take requests from the queue and send them until none are left.

        When all requests are done most workers are still waiting on the
        queue, but at least one worker exits.
        """
        while self.left > 0:
            request = await self.queue.get()
            await self.send_request(request)
            self.left -= 1
            print("\rdone_request, left %d " % (self.left), end="")
            if self.request_rate != float("inf"):
                # If the request rate is infinity, then we don't need to wait.
                # Sample the request interval from the exponential distribution.
                interval = np.random.exponential(1.0 / self.request_rate)
                # The next request will be sent after the interval.
                await asyncio.sleep(interval)
        print("")
def main(args: argparse.Namespace):
    """Prepare the sampled requests, run the serving benchmark and print stats."""
    # Never start more workers than there are prompts to send.
    if args.num_prompts < args.concurrency:
        print("Fix concurrency with num_prompts %d" % (args.num_prompts))
        args.concurrency = args.num_prompts
    print(args)

    # Seed both generators so sampling and request intervals are repeatable.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(args.seed)

    endpoint = f"http://{args.host}:{args.port}/v1/chat/completions"

    logger.info("Preparing for benchmark.")
    input_requests = sample_requests(
        args.dataset,
        args.num_prompts,
        get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code),
        prompt_len_limit=args.prompt_len_limit,
    )

    logger.info("Benchmark starts.")
    benchmark = ServingBenchmarkRunner(
        endpoint,
        args.model_uid,
        input_requests,
        args.stream,
        concurrency=args.concurrency,
        request_rate=args.request_rate,
        api_key=args.api_key,
        print_error=args.print_error,
    )
    asyncio.run(benchmark.run())
    benchmark.print_stats()
if __name__ == "__main__":
    # Command-line entry point for the serving benchmark.
    parser = argparse.ArgumentParser(
        description="Benchmark the online serving throughput."
    )
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=9997)
    parser.add_argument(
        "--dataset", type=str, required=True, help="Path to the dataset."
    )
    parser.add_argument(
        "--tokenizer", type=str, required=True, help="Name or path of the tokenizer."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=100, help="Number of prompts to process."
    )
    parser.add_argument(
        "--prompt-len-limit", type=int, default=1024, help="Prompt length limitation."
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="Authorization api key",
    )
    parser.add_argument(
        "--concurrency",
        "-c",
        type=int,
        default=100,
        help="Set the concurrency of request to send",
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use Poisson process to synthesize "
        "the request arrival times.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from huggingface.",
    )
    # The model UID is sent with every request, so fail fast when it is
    # missing instead of letting every request be sent without a model.
    parser.add_argument(
        "--model-uid", type=str, required=True, help="Xinference model UID."
    )
    parser.add_argument(
        "--stream", action="store_true", help="Enable streaming responses."
    )
    parser.add_argument(
        "--print-error",
        action="store_true",
        help="Print detailed error messages if any errors encountered."
    )
    args = parser.parse_args()
    # The worker divides by the request rate, so reject non-positive values
    # up front with a readable message.
    if args.request_rate <= 0:
        parser.error("--request-rate must be a positive number")
    main(args)
================================================
FILE: benchmark/utils.py
================================================
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import random
from typing import TYPE_CHECKING, List, Tuple
from transformers import AutoTokenizer, PreTrainedTokenizerFast
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from transformers import PreTrainedTokenizerBase
# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"
def get_tokenizer(
    tokenizer_name: str,
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
) -> "PreTrainedTokenizerBase":
    """Load a Huggingface tokenizer by name or path.

    Args:
        tokenizer_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
        *args: Extra positional arguments forwarded to ``from_pretrained``.
        tokenizer_mode: ``"slow"`` forces ``use_fast=False``.
        trust_remote_code: Forwarded to ``from_pretrained``.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Raises:
        ValueError: If ``use_fast=True`` is combined with slow mode, or when
            loading fails with a ``ValueError`` that is not about remote code.
        RuntimeError: When loading fails with a ``TypeError``, or with a
            ``ValueError`` that suggests enabling ``trust_remote_code``.
    """
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    fast_requested = kwargs.get("use_fast", True)
    is_llama_name = "llama" in tokenizer_name.lower()
    if is_llama_name and fast_requested and tokenizer_name != _FAST_LLAMA_TOKENIZER:
        logger.info(
            "For some LLaMA-based models, initializing the fast tokenizer may "
            "take a long time. To eliminate the initialization time, consider "
            f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
            "tokenizer."
        )

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name, *args, trust_remote_code=trust_remote_code, **kwargs
        )
    except TypeError as e:
        # The LLaMA tokenizer causes a protobuf error in some environments.
        raise RuntimeError(
            "Failed to load the tokenizer. If you are using a LLaMA-based "
            f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original "
            "tokenizer."
        ) from e
    except ValueError as e:
        # Only errors about a missing or remote tokenizer class get the
        # --trust-remote-code hint; everything else propagates unchanged.
        message = str(e)
        needs_remote_code = (
            "does not exist or is not currently imported." in message
            or "requires you to execute the tokenizer file" in message
        )
        if trust_remote_code or not needs_remote_code:
            raise
        raise RuntimeError(
            "Failed to load the tokenizer. If the tokenizer is a custom "
            "tokenizer not yet available in the HuggingFace transformers "
            "library, consider setting `trust_remote_code=True` in LLM "
            "or using the `--trust-remote-code` flag in the CLI."
        ) from e

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead."
        )
    return tokenizer
def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: "PreTrainedTokenizerBase",
    prompt_len_limit: int = 1024,
) -> List[Tuple[str, int, int]]:
    """Sample ``num_requests`` prompts from a conversation dataset.

    The dataset is a JSON list of objects, each with a ``conversations`` list
    whose items carry a ``value`` string. The first turn is the prompt and the
    second turn is the reference completion.

    Args:
        dataset_path: Path to the JSON dataset file (read as UTF-8).
        num_requests: Number of requests to sample without replacement.
        tokenizer: Callable that returns an object with ``input_ids`` for a
            list of strings.
        prompt_len_limit: Maximum prompt length in tokens; prompt plus
            completion may not exceed twice this value.

    Returns:
        A list of ``(prompt, prompt_len, output_len)`` tuples.

    Raises:
        ValueError: If fewer than ``num_requests`` conversations survive
            filtering.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)

    # Keep the first two turns of conversations that have at least two turns.
    pairs = [
        (turns[0]["value"], turns[1]["value"])
        for turns in (data["conversations"] for data in dataset)
        if len(turns) >= 2
    ]

    prompts = [prompt for prompt, _ in pairs]
    completions = [completion for _, completion in pairs]
    # Skip tokenization entirely when nothing is left to tokenize.
    prompt_token_ids = tokenizer(prompts).input_ids if pairs else []
    completion_token_ids = tokenizer(completions).input_ids if pairs else []

    filtered_dataset: List[Tuple[str, int, int]] = []
    for prompt, prompt_ids, completion_ids in zip(
        prompts, prompt_token_ids, completion_token_ids
    ):
        prompt_len = len(prompt_ids)
        output_len = len(completion_ids)
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            # This is because TGI causes errors when the input or output length
            # is too short.
            continue
        if (
            prompt_len > prompt_len_limit
            or prompt_len + output_len > prompt_len_limit * 2
        ):
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))

    if num_requests > len(filtered_dataset):
        raise ValueError(
            f"Requested {num_requests} prompts but only {len(filtered_dataset)} "
            "usable conversations remain after filtering; lower the number of "
            "prompts or raise the prompt length limit."
        )

    # Sample the requests.
    return random.sample(filtered_dataset, num_requests)
def generate_sorting_prompts(
    num_prompts: int,
    context_length: int,
    prompt_len_limit: int,
    tokenizer: "PreTrainedTokenizerBase",
) -> List[Tuple[str, int, int]]:
    """Build "Sort the numbers:" prompts made of random integers in [0, 99].

    Numbers are added to a prompt while the running character budget (each
    number's length plus one) stays within ``prompt_len_limit``.

    Returns:
        ``(prompt, prompt_len, context_length - prompt_len)`` tuples, where
        ``prompt_len`` is the token count reported by ``tokenizer``.
    """
    prompts = []
    for _ in range(num_prompts):
        numbers = []
        used = 0
        while True:
            candidate = str(random.randint(0, 99))
            cost = len(candidate) + 1
            if used + cost > prompt_len_limit:
                break
            numbers.append(candidate)
            used += cost
        prompts.append("Sort the numbers:" + ",".join(numbers))

    token_ids = tokenizer(prompts).input_ids
    dataset = []
    for position, text in enumerate(prompts):
        token_count = len(token_ids[position])
        dataset.append((text, token_count, context_length - token_count))
    return dataset
================================================
FILE: doc/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SPHINXINTL    ?= sphinx-intl
SOURCEDIR     = source
BUILDDIR      = build

# Options for the html_zh_cn build. This variable was referenced but never
# defined in this file, so SPHINXOPTS was silently ignored for that target.
# "?=" keeps any value supplied from the command line or the environment.
ALLSPHINXOPTS ?= $(SPHINXOPTS) $(O)

# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR)
I18NSPHINXLANGS = -l zh_CN

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# NOTE(review): html_ja_jp is declared phony but has no recipe in this file.
.PHONY: help Makefile html_zh_cn html_ja_jp gettext

# Build the Simplified Chinese HTML documentation.
html_zh_cn:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) -t zh_cn -D language='zh_CN' "$(SOURCEDIR)" $(BUILDDIR)/html_zh_cn

# Extract translatable messages and update the zh_CN catalogs.
gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	$(SPHINXINTL) update -p $(BUILDDIR)/locale $(I18NSPHINXLANGS)
	python $(SOURCEDIR)/norm_zh.py

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: doc/source/_static/switcher.json
================================================
[
{
"name": "简体中文(Chinese)",
"version": "zh-cn",
"url": "https://inference.readthedocs.io/zh-cn/latest/"
},
{
"name": "English",
"version": "en",
"url": "https://inference.readthedocs.io/en/latest/",
"preferred": true
}
]
================================================
FILE: doc/source/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------

project = 'Xinference'
copyright = '2025, Xorbits Inc.'
author = 'xorbitsai'


# -- General configuration ---------------------------------------------------

# Sphinx extensions to load: the bundled 'sphinx.ext.*' ones first, then the
# third-party ones.
extensions = [
    "sphinx.ext.mathjax",
    "sphinx.ext.ifconfig",
    "sphinx.ext.intersphinx",
    "sphinx.ext.viewcode",
    "sphinx.ext.githubpages",
    "sphinx.ext.autosummary",
    "sphinx.ext.napoleon",
    "sphinx_tabs.tabs",
    "sphinx_design",
    "IPython.sphinxext.ipython_directive",
    "IPython.sphinxext.ipython_console_highlighting",
]

# Template directories, relative to this directory.
templates_path = ['_templates']

# Patterns (relative to the source directory) of files and directories to
# skip when looking for sources. Also applies to html_static_path and
# html_extra_path.
exclude_patterns = []

# i18n
locale_dirs = ["locale/"]  # path is example but recommended.
gettext_compact = False  # optional


# -- Options for HTML output -------------------------------------------------

html_theme = 'pydata_sphinx_theme'
html_title = "Xinference"

# Custom static files (such as style sheets), relative to this directory.
# They are copied after the builtin static files, so a file named
# "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Version switcher: the entry to highlight comes from READTHEDOCS_LANGUAGE
# and falls back to English when that variable is unset or empty.
version_match = os.environ.get("READTHEDOCS_LANGUAGE") or 'en'
json_url = "https://inference.readthedocs.io/en/latest/_static/switcher.json"

html_theme_options = {
    "show_toc_level": 2,
    "header_links_before_dropdown": 7,
    "icon_links": [
        {
            "name": "GitHub",
            "url": "https://github.com/xorbitsai/inference",
            "icon": "fa-brands fa-github",
            "type": "fontawesome",
        },
    ],
    "navbar_align": "content",  # [left, content, right] For testing that the navbar items align properly
    "navbar_start": ["navbar-logo", "version-switcher"],
    "navbar_center": ["navbar-nav"],
    "switcher": {
        "json_url": json_url,
        "version_match": version_match,
    },
}

# Community links differ between the Chinese build and every other language.
if version_match == 'zh-cn':
    html_theme_options['icon_links'] += [
        {
            "name": "WeChat",
            "url": "https://xinference.cn/images/WeCom.jpg",
            "icon": "fa-brands fa-weixin",
            "type": "fontawesome",
        },
        {
            "name": "Zhihu",
            "url": "https://zhihu.com/org/xorbits",
            "icon": "fa-brands fa-zhihu",
            "type": "fontawesome",
        },
    ]
    html_theme_options["external_links"] = [
        {"name": "产品官网", "url": "https://xinference.cn"},
    ]
else:
    html_theme_options['icon_links'] += [
        {
            "name": "Discord",
            "url": "https://discord.gg/Xw9tszSkr5",
            "icon": "fa-brands fa-discord",
            "type": "fontawesome",
        },
        {
            "name": "Twitter",
            "url": "https://twitter.com/xorbitsio",
            "icon": "fa-brands fa-twitter",
            "type": "fontawesome",
        },
    ]
    html_theme_options.update(
        external_links=[
            {"name": "Official Site", "url": "https://xinference.io"},
        ],
        header_links_before_dropdown=6,
    )

html_favicon = "_static/favicon.svg"
================================================
FILE: doc/source/development/contributing_codebase.rst
================================================
=============================
Contributing to the code base
=============================
.. contents:: Table of contents:
:local:
Code standards
--------------
Writing good code is not just about what you write. It is also about *how* you write it.
During Continuous Integration testing, several tools will be run to check your code for stylistic errors.
Good style is a requirement for submitting code to Xinference.
In addition, it is important that we do not make sudden changes to the code that
could have the potential to break a lot of user code as a result. Therefore
we need it to be as backwards compatible as possible to avoid mass breakages.
Autofixing formatting errors
----------------------------
Moreover, Continuous Integration will run code formatting checks
like ``black``, ``flake8``, ``isort``, and others using `pre-commit hooks <https://pre-commit.com/>`_.
Any warnings generated by these checks will cause the Continuous Integration to fail. Therefore,
it is advisable to run the check yourself before submitting code. This
can be done by installing ``pre-commit``::
pip install pre-commit
and then running::
pre-commit install
from the root of the Xinference repository. This setup ensures that all styling checks are
automatically executed each time you commit changes without your needing to run each one manually.
In addition, using ``pre-commit`` will also allow you to more easily
remain up-to-date with our code checks as they change.
Note that if needed, you can skip these checks with ``git commit --no-verify``.
If you don't want to use ``pre-commit`` as part of your workflow, you can still use it
to run its checks with::
pre-commit run --files <files you have modified>
without needing to have done ``pre-commit install`` beforehand.
If you want to run checks on all recently committed files on upstream/main you can use::
pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files
without needing to have done ``pre-commit install`` beforehand.
.. note::
You may consider periodically running ``pre-commit gc`` to clean up repos
which are no longer used.
.. note::
If you have conflicting installations of ``virtualenv``, it could lead to
errors - refer to `here <https://github.com/pypa/virtualenv/issues/1875>`_.
Also, due to a `bug in virtualenv <https://github.com/pypa/virtualenv/issues/1986>`_,
you may run into issues if you're using conda. To solve this, you can downgrade
``virtualenv`` to version ``20.0.33``.
Backwards compatibility
-----------------------
Please try to maintain backward compatibility. If you think breakage is necessary,
clearly state why as part of the pull request. Also, be careful when changing method
signatures and add deprecation warnings where needed. Also, add the deprecated sphinx
directive to the deprecated functions or methods.
You'll also need to
1. Write a new test that asserts a warning is issued when calling with the deprecated argument
2. Update all of Xinference's existing tests and code to use the new argument
Type hints
----------
Xinference strongly encourages the use of :pep:`484` style type hints. New development should
contain type hints and pull requests to annotate existing code are accepted as well!
Test-driven development
-----------------------
Xinference is serious about testing and strongly encourages contributors to embrace
`test-driven development (TDD) <https://en.wikipedia.org/wiki/Test-driven_development>`_.
This development process "relies on the repetition of a very short development cycle:
first the developer writes an (initially failing) automated test case that defines a desired
improvement or new function, then produces the minimum amount of code to pass that test."
So, before actually writing any code, you should write your tests. Often the test can be
taken from the original GitHub issue. However, it is always worth considering additional
use cases and writing corresponding tests.
Adding tests is frequently requested after code is pushed to Xinference. Thus,
it is worth getting in the habit of writing tests ahead of time so this is never an issue.
================================================
FILE: doc/source/development/contributing_environment.rst
================================================
==================================
Creating a development environment
==================================
.. contents:: Table of contents:
:local:
Before proceeding with any code modifications, it's essential to set up the necessary environment for Xinference development,
which includes familiarizing yourself with Git usage, establishing an isolated environment, installing Xinference, and compiling the frontend.
Getting started with Git
-------------------------
Now that you have identified an issue you wish to resolve, an enhancement to incorporate, or documentation to enhance,
it's crucial to acquaint yourself with GitHub and the Xinference codebase.
To the new user, working with Git is one of the more intimidating aspects of contributing to Xinference.
It can very quickly become overwhelming, but sticking to the guidelines below will help simplify the process
and minimize potential issues. As always, if you are having difficulties please
feel free to ask for help.
The code is hosted on `GitHub <https://github.com/xorbitsai/inference>`_. To
contribute you will need to sign up for a `free GitHub account
<https://github.com/signup/free>`_. We use `Git <https://git-scm.com/>`_ for
version control to allow many people to work together on the project.
`GitHub has instructions <https://docs.github.com/en/get-started/quickstart/set-up-git>`__ for installing git,
setting up your SSH key, and configuring git. All these steps need to be completed before
you can work seamlessly between your local repository and GitHub.
Some great resources for learning Git:
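* `Official Git Documentation <https://git-scm.com/doc>`_
* `Pro Git Book <https://git-scm.com/book/en/v2>`_
* `Git Tutorial by Atlassian <https://www.atlassian.com/git/tutorials>`_
* `Git - Concise Guide <https://rogerdudler.github.io/git-guide/>`_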
.. note::
If the speed of ``git clone`` is slow, you can use the following command
to add a proxy:
::
export https_proxy=YourProxyAddress
Creating an isolated environment
--------------------------------
Before formally installing Xinference, it's recommended to create an isolated
environment (Conda is recommended) for ease of subsequent operations.
::
conda create --name xinf
conda activate xinf
``xinf`` can be replaced with a custom Conda environment name.
Afterward, you'll need to install Python and Node.js (npm) in the newly created
Conda environment. Here are the commands:
::
conda install python=3.12
conda install nodejs
Install from source code
------------------------
Before we begin, please make sure that you have cloned the repository.
Suppose you clone the repository as ``inference`` directory, ``cd`` to this directory
where the ``setup.cfg`` and ``setup.py`` files are located, and run the following command:
::
pip install -e .
xinference-local
If the commands run successfully, you can use Xinference normally. For
detailed usage instructions, refer to
`using_xinference `__.
If errors occur or the process freezes during execution, the next step
is to compile the frontend.
Frontend Compilation
--------------------
Navigate to the ``inference/xinference/ui/web/ui`` directory. Then, execute the following command
to clear the cache:
::
npm cache clean
If the command fails to execute, you can try adding the ``--force`` option.
.. note::
If the ``node_modules`` folder already exists in this directory,
it's recommended to manually delete it before cleaning the cache.
Next, execute the following command in this directory to compile the
frontend:
::
npm install
npm run build
Still, if the first command fails to execute, you can try adding the ``--force`` option.
After compiling the frontend, you can ``cd`` back to the directory
where the ``setup.cfg`` and ``setup.py`` files are located,
and install Xinference via ``pip install -e .``.
================================================
FILE: doc/source/development/index.rst
================================================
.. _development_index:
===========
Development
===========
.. toctree::
:maxdepth: 2
contributing_environment
contributing_codebase
xinference_internals
================================================
FILE: doc/source/development/xinference_internals.rst
================================================
===========================
The internals of Xinference
===========================
.. contents:: Table of contents:
:local:
Overview
========
Xinference leverages `Xoscar `_, an actor programming framework we designed,
as its core component to manage machines, devices, and model inference processes. Each actor serves as a basic
unit for model inference and various inference backends can be integrated into the actor, enabling us to support
multiple inference engines and hardware. These actors are hosted and scheduled within actor pools, which are
designed to be asynchronous and non-blocking and function as resource pools.
.. raw:: html
====
Both supervisor and worker are actor instances. Initially, an actor pool, serving as a resource pool, needs to be created
on each server; and each actor can utilize a CPU core or a GPU device. Each server has its own address (IP address or
hostname), so actors on different computing nodes can communicate with each other through these addresses. See `Actor`_ for more information.
RESTful API
===========
The RESTful API is implemented using `FastAPI `_, as specified in
`api/restful_api.py `_.
::
self._router.add_api_route("/status", self.get_status, methods=["GET"])
This is an example of the API ``/status``; its corresponding function is ``get_status``. You can add a connection
between RESTful API and the backend function you want in `api/restful_api.py `_.
Command Line
============
The Command Line is implemented using `Click `_, as specified in
`deploy/cmdline.py `_,
allowing users to interact with the Xinference deployment features directly from the terminal.
Entry Points
------------
Take the command-lines we implemented as examples:
- ``xinference``: Provides commands for model management, including registering/unregistering models, listing all
registered/running models, and launching or terminating specific models.
It also features interactive commands like generate and chat for testing and interacting with deployed models in real-time.
- ``xinference-local``: Starts a local Xinference service.
- ``xinference-supervisor``: Initiates a supervisor process that manages and monitors worker actors within a distributed setup.
- ``xinference-worker``: Starts a worker process that executes tasks assigned by the supervisor, utilizing available
computational resources effectively.
Each command is equipped with ``options`` and ``flags`` to customize its behavior, such as specifying log levels,
host addresses, port numbers, and other relevant settings.
Python projects define command-line console entry points in ``setup.cfg`` or ``setup.py``.
::
console_scripts =
xinference = xinference.deploy.cmdline:cli
xinference-local = xinference.deploy.cmdline:local
xinference-supervisor = xinference.deploy.cmdline:supervisor
xinference-worker = xinference.deploy.cmdline:worker
The command-line ``xinference`` can be referred to code in ``xinference.deploy.cmdline:cli``.
Click
-----
We use Click to implement a specific command-line:
::
@click.option(
"--host",
"-H",
default=XINFERENCE_DEFAULT_DISTRIBUTED_HOST,
type=str,
help="Specify the host address for the supervisor.",
)
@click.option(
"--port",
"-p",
default=XINFERENCE_DEFAULT_ENDPOINT_PORT,
type=int,
help="Specify the port number for the Xinference web ui and service.",
)
For example, the ``xinference-local`` command allows you to define the host address and port.
Actor
=====
Xinference is fundamentally based on `Xoscar `_, our actor framework,
which can manage computational resources and Python processes to support scalable and concurrent programming.
The following is a pseudocode demonstrating how our Worker Actor works, the actual Worker Actor is more complex than this.
::
import xoscar as xo
class WorkerActor(xo.Actor):
def __init__(self, *args, **kwargs):
...
async def launch_model(self, model_id, n_gpu, ...):
# launch an inference engine, use specific model class to load model checkpoints
...
async def list_models(self):
# list models on this actor
...
async def terminate_model(self, model_id):
# terminate the model
...
async def __post_create__(self):
# called after the actor instance is created
...
async def __pre_destroy__(self):
# called before the actor instance is destroyed
...
We use the ``WorkerActor`` as an example to illustrate how we build the Xinference. Each actor class
is a standard Python class that inherits from ``xoscar.Actor``. An instance of this class is a specific actor
within the actor pool.
- **Define Actor Actions**: Each actor needs to define certain actions or behaviors to accomplish specific tasks.
For instance, the model inference ``WorkerActor`` needs to launch the model (``launch_model``), list the models
in this actor (``list_models``), terminate a model (``terminate_model``). There are two special methods worth
noting. The ``__post_create__`` is invoked after the actor is created, allowing for necessary initializations.
The ``__pre_destroy__`` is called before the actor is destroyed, allowing for cleanup or finalization tasks.
- **Reference Actor and Invoke Methods**: When an actor is created, it yields a reference variable so that other
actors can reference it. The actor reference can also be referenced with the address. Suppose the ``WorkerActor``
is created and the reference variable is ``worker_ref``, the ``launch_model`` method of this actor class can
be invoked by calling ``worker_ref.launch_model()``.
Even if the actor's method is originally a synchronous method, when called with an actor reference, it will
become an asynchronous method.
- **Inference Engine**: The actor can manage the process, and the inference engine is also a process. In the launch
model part of the ``WorkerActor``, we can initialize different inference engines according to the user's need.
Therefore, Xinference can support multiple inference engines and can easily adapt to new inference engines in the
future.
See `Xoscar document `_ for more actor use cases.
Asynchronous Programming
========================
Both Xinference and Xoscar highly utilize asynchronous programming of ``asyncio``.
Asynchronous programming is a programming paradigm that does not block.
Instead, requests and function calls are issued and executed in the background
and results are returned in the future. This enables us to perform
activities concurrently.
If you're not familiar with Python's ``asyncio``, you can see more tutorials for help:
- `Python Asyncio Tutorial `__
- `Real Python's asyncio Tutorial `__
- `Python Official Documentation `__
Model
=====
Xinference supports different types of models including large language models (LLMs), image models, audio models, embedding models, etc.
All models are implemented in `model/ `_.
LLM
---
Take `model/llm/ `_ for example, it focuses on
the management and instantiation of LLMs. It includes detailed implementations for loading, configuring,
and deploying LLMs.
We support many backends such as GGML, PyTorch, and vLLM. Our generated content is compatible with the format of OpenAI, supporting features such as streaming output and returning chat completion format (for chat models only).
Therefore, there is a lot of adaptation work to be done after the model generates content. These tasks are not difficult, but they do require some time. When writing this part of the code, please refer to the `OpenAI API documentation `_ and the documentation of various inference backends, and make the necessary adaptations.
JSON
----
In `model/llm/llm_family.json `_,
we utilize JSON files to manage the metadata of emerging open-source models. Adding a new model does not necessitate writing new code,
it merely requires appending new metadata to the existing JSON file.
::
{
"model_name": "llama-2-chat",
"model_ability": ["chat"],
"model_specs": [
{
"model_format": "ggmlv3",
"model_size_in_billions": 70,
"quantization": ["q8_0", ...],
"model_id": "TheBloke/Llama-2-70B-Chat-GGML",
},
...
],
"prompt_style": {
"style_name": "LLAMA2",
"system_prompt": "[INST] <<SYS>>\nYou are a helpful AI assistant.\n<</SYS>>\n\n",
"roles": ["[INST]", "[/INST]"],
"stop_token_ids": [2],
"stop": ["</s>"]
}
}
This is an example of how to define the Llama-2 chat model. The ``model_specs`` define the information of the model, as one model family
usually comes with various sizes, quantization methods, and file formats.
For instance, the ``model_format`` could be ``pytorch`` (using Hugging Face Transformers or vLLM as backend),
``ggmlv3`` (a tensor library associated with llama.cpp), or ``gptq`` (a post-training quantization framework).
The ``model_id`` defines the repository of the model hub from which Xinference downloads the checkpoint files.
Furthermore, due to distinct instruction-tuning processes, different model families have varying prompt styles.
The ``prompt_style`` in the JSON file specifies how to format prompts for this particular model.
For example, ``system_prompt`` and ``roles`` are used to specify the instructions and personality of the model.
Code Walkthrough
================
The main code is located in the `xinference/ `_:
- `api/ `_: `restful_api.py `_
is the core part that sets up and runs the RESTful APIs.
It integrates an authentication service (the specific code is located in `oauth2/ `_),
as some or all endpoints require user authentication.
- `client/ `_: This is the client of Xinference.
- `oscar/ `_ defines the Actor Client which acts as
a client interface for interacting with models deployed in a Xinference cluster.
- `restful/ `_ implements a RESTful client for
interacting with a Xinference service.
- `core/ `_: This is the core part of Xinference.
- `metrics.py `_ and
`resource.py `_
defines a set of tools for collecting and reporting metrics and the status of node resources, including model throughput,
latency, the usage of CPU and GPU, memory usage, and more.
- `image_interface.py `_ and
`chat_interface.py `_
implement `Gradio `_ interfaces for image and chat models, respectively.
These interfaces allow users to interact with models through a Web UI, such as generating images or engaging in chat.
They build user interfaces using the gradio package and communicate with backend models through our RESTful APIs.
- `worker.py `_ and
`supervisor.py `_
respectively define the logic for worker actors and supervisor actor. Worker actors are responsible for carrying out specific
model computation tasks, while supervisor actors manage the lifecycle of worker nodes, schedule tasks, and monitor system states.
- `status_guard.py `_ implements a status monitor
to track the status of models (like creating, updating, terminating, etc.). It allows querying status information of model instances
and managing these statuses based on the model's UID.
- `cache_tracker.py `_ defines a cache tracker for
recording and managing cache status and information of model versions. It supports recording cache locations and statuses of model
versions and querying model version information based on model names.
- `event.py `_ defines an event collector for gathering and
reporting various runtime events of models, such as information, warnings, and errors.
- `model.py `_ defines a Model Actor, the core component for
direct model interactions. The Model Actor is responsible for executing model inference requests, handling input and output data streams,
and supports various types of model operations.
- `deploy/ `_: It provides a command-line interface (CLI) for interacting
with the Xinference framework, allowing users to perform operations by command line. See `Command Line`_ for more information.
- `locale/ `_: It supports multi-language localization. By simply adding
and updating JSON translation files, it becomes possible to support more languages, improving user experience.
- `model/ `_: It provides a structure for model descriptions, creation,
and caching. See `Model`_ for more information.
- `web/ui/ `_: The js code of the frontend (Web UI).
================================================
FILE: doc/source/examples/ai_podcast.rst
================================================
.. _examples_ai_podcast:
======================
Example: AI Podcast 🎙
======================
**Description**:
🎙️AI Podcast - Voice Conversations with Multiple Agents on M2 Max 💻
**Support Language** :
English (AI_Podcast.py)
Chinese (AI_Podcast_ZH.py)
**Used Technology (EN version)** :
@ `OpenAI `_ 's `whisper `_
@ `ggerganov `_ 's `ggml `_
@ `WizardLM_AI `_ 's `wizardlm v1.0 `_
@ `lmsysorg `_ 's `vicuna v1.3 `_
@ `Xinference `_ as a launcher
**Detailed Explanation on the Demo Functionality** :
1. Generate the Wizardlm Model and Vicuna Model when the program is launching with Xorbits Inference.
Initiate the Chatroom by giving the two chatbot their names and telling them that there is a human user
called "username", where "username" is given by user's input. Initialize an empty chat history for the chatroom.
2. Use Audio device to store recording into file, and transcribe the file using OpenAI's Whisper to receive a human readable text as string.
3. Based on the input message string, determine which agents the user wants to talk to. Call the target agents and
pass in the input string and chat history for the model to generate a response.
4. When the responses are ready, use macOS's "say" command to produce audio through the speaker. Each agent has its
own voice while speaking.
5. Store the user input and the agent response in the chat history, and keep looping the program until the user
explicitly says words like "see you" in their responses.
**Highlight Features with Xinference** :
1. With Xinference's distributed system, we can easily deploy two different models in the same session and in the
same "chatroom". With enough resources, the framework can deploy any number of models you like at the same time.
2. With Xinference, you can deploy the model easily by just adding a few lines of code.
For example, to launch the vicuna model in the demo, just write::
args = parser.parse_args()
endpoint = args.endpoint
client = Client(endpoint)
model_a = "vicuna-v1.3"
model_a_uid = client.launch_model(
model_name=model_a,
model_format="ggmlv3",
model_size_in_billions=7,
quantization="q4_0",
n_ctx=2048,
)
model_a_ref = client.get_model(model_a_uid)
Then, the Xinference client will handle "target model downloading and caching", "set up environment and process
for the model", and "run the service at selected endpoint". You are now ready to play with your LLM.
**Original Demo Video** :
* `🎙️AI Podcast - Voice Conversations with Multiple Agents on M2 Max💻🔥🤖 `_
**Source Code** :
* `AI_Podcast `_ (English Version)
* `AI_Podcast_ZH `_ (Chinese Version)
================================================
FILE: doc/source/examples/chatbot.rst
================================================
.. _examples_chatbot:
========================
Example: CLI chatbot 🤖️
========================
**Description**:
Demonstrate how to interact with Xinference to play with LLM chat functionality with an AI agent in command line💻
**Used Technology**:
@ `ggerganov `_ 's `ggml `_
@ `Xinference `_ as a launcher
@ All LLaMA and Chatglm models supported by `Xorbitsio inference `_
**Detailed Explanation on the Demo Functionality** :
1. Take the user command line input in the terminal and grab the required parameters for model launching.
2. Launch the Xinference frameworks and automatically deploy the model user demanded into the cluster.
3. Initialize an empty chat history to store all the context in the chatroom.
4. Repeatedly ask for the user's input as the prompt and let the model generate a response based on the prompt and the
chat history. Show the Output of the response in the terminal.
5. Store the user's input and agent's response into the chat history as context for the upcoming rounds.
**Source Code** :
* `chat `_
================================================
FILE: doc/source/examples/gradio_chatinterface.rst
================================================
.. _examples_gradio_chatinterface:
===============================
Example: Gradio ChatInterface🤗
===============================
**Description**:
This example showcases how to build a chatbot with 120 lines of code with Gradio ChatInterface and Xinference local LLM
**Used Technology**:
@ `Xinference `_ as a LLM model hosting service
@ `Gradio `_ as a web interface for the chatbot
**Detailed Explanation on the Demo Functionality** :
* Parse user-provided command line arguments to capture essential model parameters such as model name, size, format, and quantization.
* Establish a connection to the Xinference framework and deploy the specified model, ensuring it's ready for real-time interactions.
* Implement helper functions (flatten and to_chat) to efficiently handle and store chat interactions, ensuring the model has context for generating relevant responses.
* Set up an interactive chat interface using Gradio, allowing users to communicate with the model in a user-friendly environment.
* Activate the Gradio web interface, enabling users to start their chat sessions and receive model-generated responses based on their queries.
**Source Code** :
* `Gradio ChatInterface `_
================================================
FILE: doc/source/examples/index.rst
================================================
.. _examples_index:
========
Examples
========
.. toctree::
:maxdepth: 2
:hidden:
ai_podcast
chatbot
gradio_chatinterface
pdf_chatbot
langchain_streamlit_doc_chat
Here you can find examples and resources to learn about how to use Xinference.
Demos
=====
End-to-end applications of using Xinference:
* `Voice Conversations with AI Agents on M2 Max `_
* `Interacting with LLM Models: A Command-Line Example `_
* `Interacting with LLM Models: A Gradio ChatInterface Example `_
* `PDF Chatbot with Local LLM and Embeddings `_
* `Local Doc Conversations with LangChain and Streamlit `_
If you come across other examples in your own workflows we encourage you to contribute a `PR `_!
Tutorials
=========
The following tutorials cover the basics of using Xinference in different scenarios:
* `[Notebook] Question-answering(QA) Application with Xinference, Milvus and LangChain `_
* `Using Xinference local LLMs within LlamaIndex `_
* `[Chinese] 如何让 Chatbox 接入开源大模型,实现免费聊天 `_
* `[Chinese] 摆脱 OpenAI 依赖,8 分钟教你用开源生态构建全栈 AI 应用 `_
* `[Chinese] 使用全套开源工具构建 LLM 应用实战: 在 Dify 调用 Baichuan 开源模型能力 `_
Third-Party Library Integrations
================================
Xinference is designed to seamlessly integrate and deploy open-sourced AI models, so we want to incorporate support for mainstream toolkits
in the AI landscape. Xinference can be used with the following third-party libraries:
* LangChain `Text Embedding Models `_ and `LLMs `_
* `LlamaIndex Xinference LLM `_
================================================
FILE: doc/source/examples/langchain_streamlit_doc_chat.rst
================================================
.. _examples_langchain_streamlit_doc_chat:
=======================================
Example: LangChain Streamlit Doc Chat📄
=======================================
**Description**:
This Streamlit-based application demonstrates an AI chatbot powered by local LLM and embedding models
**Used Technology**:
@ `Xinference `_: as the LLM and embedding model hosting service
@ `LangChain `_: orchestrates the entire document processing and query answering pipeline
@ `Streamlit `_: for interactive user interface
**Detailed Explanation on the Demo Functionality** :
* Streamlit UI for uploading text files, enhancing user interaction.
* Texts are split into chunks and embedded using Xinference for efficient processing.
* Executes similarity searches on embedded texts to pinpoint relevant sections for user queries.
* Utilizes a structured prompt template for focused LLM interactions.
* Xinference's LLM processes queries within the context of relevant document parts, providing accurate responses.
* The system facilitates effective and context-sensitive document exploration, aiding users in information retrieval.
**Source Code** :
* `LangChain Streamlit Doc Chat `_
================================================
FILE: doc/source/examples/pdf_chatbot.rst
================================================
.. _examples_pdf_chatbot:
======================
Example: PDF Chatbot📚
======================
**Description**:
This example showcases how to build a PDF chatbot with local LLM and Embedding models
**Used Technology**:
@ `Xinference `_ as a LLM model hosting service
@ `LlamaIndex `_ for orchestrating the entire RAG pipeline
@ `Streamlit `_ for interactive UI
**Detailed Explanation on the Demo Functionality** :
* Crafted a Dockerfile to simplify the process and ensure easy reproducibility.
* Set up models with Xinference and expose two ports for accessing them.
* Leverage Streamlit for seamless file uploads and interactive communication with the chat engine.
* 5x faster doc embedding than OpenAI's API.
* Leveraging the power of GGML to offload models to the GPU, ensuring swift acceleration. Fewer long waits for responses.
**Source Code** :
* `PDF Chatbot `_
================================================
FILE: doc/source/gen_docs.py
================================================
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
from collections import defaultdict
from jinja2 import Environment, FileSystemLoader
# Mock engine libraries before importing xinference modules
def mock_engine_libraries():
    """Register stand-in engine modules in ``sys.modules`` for documentation generation.

    Empty placeholder modules are created for ``vllm``, ``mlx``, ``mlx.core``,
    ``lmdeploy``, ``sglang``, ``xllamacpp``, ``mlx_lm`` and ``mlx_vlm``. Each one
    carries a ``__spec__`` and a ``__file__``; all except ``mlx.core`` also carry
    a ``__version__``. Any entry already present in ``sys.modules`` under one of
    these names is overwritten.
    """
    from types import ModuleType
    from importlib.machinery import ModuleSpec
    import importlib.util
    import importlib.machinery

    def _stub(module_name, file_name, version=None):
        # Build one placeholder module; the version attribute is optional.
        placeholder = ModuleType(module_name)
        if version is not None:
            placeholder.__version__ = version
        placeholder.__spec__ = ModuleSpec(module_name, None)
        placeholder.__file__ = file_name
        return placeholder

    # vllm: version chosen as "latest" for full feature support
    vllm_stub = _stub('vllm', "mock_vllm.py", "1.0.0")

    # mlx plus its ``core`` submodule
    mlx_stub = _stub('mlx', "mock_mlx.py", "1.0.0")
    mlx_core_stub = _stub('mlx.core', "mock_mlx_core.py")
    # Attribute required for xoscar serialization
    mlx_core_stub.array = type('MockArray', (), {})
    mlx_stub.core = mlx_core_stub

    lmdeploy_stub = _stub('lmdeploy', "mock_lmdeploy.py", "0.6.0")
    sglang_stub = _stub('sglang', "mock_sglang.py", "0.3.0")

    # xllamacpp: its spec also gets an origin so importlib.util.find_spec can find it
    xllamacpp_stub = _stub('xllamacpp', "mock_xllamacpp.py", "1.0.0")
    xllamacpp_stub.__spec__.origin = "mock_xllamacpp.py"

    mlx_lm_stub = _stub('mlx_lm', "mock_mlx_lm.py", "1.0.0")
    mlx_vlm_stub = _stub('mlx_vlm', "mock_mlx_vlm.py", "1.0.0")

    # Publish every placeholder under its import name
    sys.modules.update({
        'vllm': vllm_stub,
        'mlx': mlx_stub,
        'mlx.core': mlx_core_stub,
        'lmdeploy': lmdeploy_stub,
        'sglang': sglang_stub,
        'xllamacpp': xllamacpp_stub,
        'mlx_lm': mlx_lm_stub,
        'mlx_vlm': mlx_vlm_stub,
    })
# Register the stub engine modules in sys.modules now, before any of the
# xinference imports further down in this script are executed.
mock_engine_libraries()
# Mock platform checks BEFORE importing xinference modules
def mock_platform_checks():
    """Make engine platform/hardware gates pass so docs can be generated anywhere.

    The vLLM, SGLang and LMDeploy engine classes get ``_is_linux`` and
    ``_has_cuda_device`` replaced by callables that always return ``True``.
    The MLX classes get ``match_json`` wrapped so that, for the duration of each
    call, the MLX module's ``sys`` / ``platform`` names point at stand-ins
    reporting ``darwin`` / ``Darwin`` / ``arm``.

    SGLang, LMDeploy and MLX are skipped silently when importing them raises
    ``ImportError``. Any other failure (including a failed vLLM import) is
    reported as a printed warning and patching stops at that point.
    """

    def _force_linux_cuda(engine_module, class_names):
        # Overwrite both gates on each named class, in the order given.
        for class_name in class_names:
            engine_cls = getattr(engine_module, class_name)
            engine_cls._is_linux = lambda: True
            engine_cls._has_cuda_device = lambda: True

    # Only attributes of the engine modules are touched; the system-wide
    # platform settings are left alone.
    try:
        # vLLM
        import xinference.model.llm.vllm.core as vllm_core
        _force_linux_cuda(
            vllm_core, ("VLLMModel", "VLLMChatModel", "VLLMMultiModel")
        )

        # SGLang, if importable
        try:
            import xinference.model.llm.sglang.core as sglang_core
            _force_linux_cuda(
                sglang_core,
                ("SGLANGModel", "SGLANGChatModel", "SGLANGVisionModel"),
            )
        except ImportError:
            pass

        # LMDeploy, if importable
        try:
            import xinference.model.llm.lmdeploy.core as lmdeploy_core
            _force_linux_cuda(
                lmdeploy_core, ("LMDeployModel", "LMDeployChatModel")
            )
        except ImportError:
            pass

        # MLX, if importable: swap ``sys``/``platform`` inside the MLX module
        # only while ``match_json`` is running.
        try:
            import xinference.model.llm.mlx.core as mlx_core

            class MockSys:
                # Stand-in exposing just ``sys.platform``
                platform = "darwin"

            class MockPlatform:
                # Stand-in exposing ``platform.system()`` / ``platform.processor()``
                @staticmethod
                def system():
                    return "Darwin"

                @staticmethod
                def processor():
                    return "arm"

            def create_wrapped_match_json(original_match):
                def wrapped_match_json(cls, llm_family, llm_spec, quantization):
                    # The real modules, put back once the call is over
                    import sys as original_sys
                    import platform as original_platform

                    mlx_core.sys = MockSys()
                    mlx_core.platform = MockPlatform()
                    try:
                        # The original implementation now sees the mocked platform
                        return original_match.__func__(
                            cls, llm_family, llm_spec, quantization
                        )
                    finally:
                        mlx_core.sys = original_sys
                        mlx_core.platform = original_platform

                return classmethod(wrapped_match_json)

            # Capture every original before installing any wrapper
            mlx_classes = (
                mlx_core.MLXModel,
                mlx_core.MLXChatModel,
                mlx_core.MLXVisionModel,
            )
            original_matchers = [mlx_cls.match_json for mlx_cls in mlx_classes]
            for mlx_cls, original_matcher in zip(mlx_classes, original_matchers):
                mlx_cls.match_json = create_wrapped_match_json(original_matcher)
        except ImportError:
            pass
    except Exception as exc:
        # Best effort: report the problem and carry on without the remaining mocks
        print(f"Warning: Could not mock some engine platform checks: {exc}")
# Patch the engine classes' platform checks. The function prints a warning
# instead of raising when something cannot be patched.
mock_platform_checks()
from xinference.model.llm.llm_family import SUPPORTED_ENGINES, check_engine_by_spec_parameters
from xinference.model.llm.vllm.core import VLLM_INSTALLED, VLLM_SUPPORTED_MODELS, VLLM_SUPPORTED_CHAT_MODELS
# Rebuild the engine registry now that the platform checks are mocked.
# NOTE(review): presumably LLM_ENGINES was already filled at import time,
# before the mocks were in place - confirm against xinference.model.llm.
from xinference.model.llm import generate_engine_config_by_model_family
from xinference.model.llm.llm_family import BUILTIN_LLM_FAMILIES, LLM_ENGINES
# Clear existing engine configurations
LLM_ENGINES.clear()
# Re-register all model families with mocked platform checks
for family in BUILTIN_LLM_FAMILIES:
    generate_engine_config_by_model_family(family)
# Display names used for the entries that main() appends to model_hubs.
MODEL_HUB_HUGGING_FACE = "Hugging Face"
MODEL_HUB_MODELSCOPE = "ModelScope"
# Model formats accepted by _can_use_transformers_legacy().
_LEGACY_TRANSFORMERS_FORMATS = {"pytorch", "gptq", "awq", "bnb"}
def build_architecture_to_models(models):
    """Index model names by architecture.

    Args:
        models: Iterable of model dicts. Each may carry an ``architectures``
            list (missing, ``None`` or empty means "none") and, when it has
            architectures, must carry ``model_name``.

    Returns:
        A ``defaultdict(list)`` mapping each architecture to the names of the
        models that declare it, in input order.
    """
    # One (architecture, model name) pair per declaration, produced lazily
    declarations = (
        (arch, entry["model_name"])
        for entry in models
        for arch in (entry.get("architectures", []) or [])
    )

    names_by_architecture = defaultdict(list)
    for arch, name in declarations:
        names_by_architecture[arch].append(name)
    return names_by_architecture
def get_metrics_from_url(metrics_url, timeout=30):
    """Fetch a Prometheus text-format metrics page and summarise its metric families.

    Args:
        metrics_url: URL of the metrics endpoint to scrape.
        timeout: Seconds to wait for the server, forwarded to ``requests.get``.
            Defaults to 30.

    Returns:
        A list with one dict per metric family, holding its ``name``, ``type``
        and ``help`` (the family's documentation string).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Imported inside the function, so these packages are only required when
    # the function is actually called.
    from prometheus_client.parser import text_string_to_metric_families
    import requests

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(metrics_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as metrics.
    response.raise_for_status()

    return [
        {
            "name": family.name,
            "type": family.type,
            "help": family.documentation,
        }
        for family in text_string_to_metric_families(response.content.decode("utf-8"))
    ]
def _can_use_transformers_legacy(model, model_spec):
    """Tell whether a model spec passes the legacy-transformers check.

    Returns ``True`` only when the spec's ``model_format`` is one of
    ``_LEGACY_TRANSFORMERS_FORMATS`` and the model lists ``"chat"`` or
    ``"generate"`` in ``model_ability``; otherwise ``False``.
    """
    if model_spec.get("model_format") in _LEGACY_TRANSFORMERS_FORMATS:
        declared_abilities = set(model.get("model_ability", []))
        return bool(declared_abilities & {"chat", "generate"})
    return False
def _extract_primary_model_src(model):
if model.get("model_specs"):
for spec in model["model_specs"]:
if isinstance(spec, dict) and "model_src" in spec:
return spec["model_src"]
return model.get("model_src")
def _read_json_file(path):
    """Parse the JSON document at ``path``, always decoding it as UTF-8."""
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


def _write_text_file(path, text):
    """Write ``text`` to ``path`` as UTF-8, replacing any existing content."""
    with open(path, 'w', encoding='utf-8') as text_file:
        text_file.write(text)


def _splice_between_markers(content, start_label, end_label, replacement):
    """Replace the text between two marker labels, keeping both labels.

    Returns the updated text, or ``None`` when ``start_label`` is absent or
    ``end_label`` does not occur after it, so that callers can leave the file
    untouched instead of corrupting it.
    """
    start = content.find(start_label)
    if start == -1:
        return None
    start += len(start_label)
    end = content.find(end_label, start)
    if end == -1:
        return None
    return content[:start] + '\n\n' + replacement + '\n' + content[end:]


def main():
    """Regenerate the built-in model reference pages and related snippets.

    For each model type (LLM, embedding, rerank, image, audio, video) the
    model spec JSON under ``../../xinference/model`` is loaded, each model is
    rendered through the matching Jinja template from ``../templates`` into
    ``./models/builtin/<type>/<model_name>.rst``, and an ``index.rst`` is
    written per type. When vLLM is installed, the supported-model list between
    the ``.. vllm_start`` / ``.. vllm_end`` markers of the installation and
    backends pages is refreshed. Finally the metrics page is rendered from a
    locally running server; any failure there is reported and skipped.

    All paths are relative to the current working directory.
    """
    template_dir = '../templates'
    env = Environment(loader=FileSystemLoader(template_dir))

    # ------------------------------------------------------------------ LLM
    models = _read_json_file('../../xinference/model/llm/llm_family.json')
    model_by_names = {m['model_name']: m for m in models}
    sorted_models = []
    output_dir = './models/builtin/llm'
    os.makedirs(output_dir, exist_ok=True)
    # Files present before this run; whatever is not regenerated is stale.
    current_files = {f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))}
    # The index is rewritten unconditionally below, so it is never stale.
    current_files.discard("index.rst")
    for model_name in sorted(model_by_names, key=str.lower):
        model = model_by_names[model_name]
        sorted_models.append(model)
        for model_spec in model['model_specs']:
            model_spec['model_hubs'] = []
            # Reset per spec so a spec whose model_src names no known hub
            # cannot inherit the previous spec's quantizations.
            quantizations = []
            # Process different model sources
            if 'model_src' in model_spec:
                # Handle new model_src structure
                model_src = model_spec['model_src']
                if 'huggingface' in model_src:
                    hf_src = model_src['huggingface']
                    model_spec['model_hubs'].append({
                        'name': MODEL_HUB_HUGGING_FACE,
                        'url': f"https://huggingface.co/{hf_src['model_id']}"
                    })
                    # Set model_id and quantizations for template compatibility
                    model_spec['model_id'] = hf_src['model_id']
                    model_spec['quantizations'] = hf_src['quantizations']
                    quantizations = hf_src['quantizations']
                if 'modelscope' in model_src:
                    ms_src = model_src['modelscope']
                    model_spec['model_hubs'].append({
                        'name': MODEL_HUB_MODELSCOPE,
                        'url': f"https://modelscope.cn/models/{ms_src['model_id']}"
                    })
                    # If only modelscope exists and no huggingface, use modelscope data
                    if 'huggingface' not in model_src:
                        model_spec['model_id'] = ms_src['model_id']
                        model_spec['quantizations'] = ms_src['quantizations']
                        quantizations = ms_src['quantizations']
            else:
                # Fallback for old format if still exists
                model_spec['model_hubs'].append({
                    'name': MODEL_HUB_HUGGING_FACE,
                    'url': f"https://huggingface.co/{model_spec['model_id']}"
                })
                quantizations = model_spec.get('quantizations', [])

            # model engines
            engines = []
            for engine in SUPPORTED_ENGINES:
                for quantization in quantizations:
                    size = model_spec['model_size_in_billions']
                    # Sizes such as "1_8" stay strings; plain digit strings become ints.
                    if isinstance(size, str) and '_' not in size:
                        size = int(size)
                    try:
                        check_engine_by_spec_parameters(engine, model_name, model_spec['model_format'],
                                                        size, quantization)
                    except ValueError:
                        # The engine check rejected this combination; still
                        # list Transformers for legacy-compatible specs.
                        if engine == "Transformers" and _can_use_transformers_legacy(
                            model, model_spec
                        ):
                            engines.append(engine)
                        continue
                    else:
                        engines.append(engine)
            model_spec['engines'] = sorted(set(engines), reverse=True)

        rendered = env.get_template('llm.rst.jinja').render(model)
        output_file_name = f"{model['model_name'].lower()}.rst"
        current_files.discard(output_file_name)
        output_file_path = os.path.join(output_dir, output_file_name)
        _write_text_file(output_file_path, rendered)
        print(output_file_path)

    # Remove pages of models that no longer exist in the spec file.
    for stale_name in sorted(current_files):
        print(f"remove {stale_name}")
        os.remove(os.path.join(output_dir, stale_name))

    # Render before writing so a template error cannot leave a truncated index.
    rendered_index = env.get_template('llm_index.rst.jinja').render(models=sorted_models)
    _write_text_file(os.path.join(output_dir, "index.rst"), rendered_index)
    llm_sorted_models = sorted_models

    # ------------------------------------------------------------ Embedding
    models = _read_json_file('../../xinference/model/embedding/model_spec.json')
    model_by_names = {m['model_name']: m for m in models}
    sorted_models = []
    output_dir = './models/builtin/embedding'
    os.makedirs(output_dir, exist_ok=True)
    for model_name in sorted(model_by_names, key=str.lower):
        model = model_by_names[model_name]
        sorted_models.append(model)
        model['model_hubs'] = []
        # Process model specs for new model_src structure
        if 'model_specs' in model and model['model_specs']:
            model_spec = model['model_specs'][0]  # Use first spec for model hubs
            if 'model_src' in model_spec:
                if 'huggingface' in model_spec['model_src']:
                    hf_src = model_spec['model_src']['huggingface']
                    model['model_hubs'].append({
                        'name': MODEL_HUB_HUGGING_FACE,
                        'url': f"https://huggingface.co/{hf_src['model_id']}"
                    })
                    # Set model_id for template compatibility (prefer huggingface)
                    model['model_id'] = hf_src['model_id']
                if 'modelscope' in model_spec['model_src']:
                    ms_src = model_spec['model_src']['modelscope']
                    model['model_hubs'].append({
                        'name': MODEL_HUB_MODELSCOPE,
                        'url': f"https://modelscope.cn/models/{ms_src['model_id']}"
                    })
                    # Only set modelscope model_id if no huggingface exists
                    if 'huggingface' not in model_spec['model_src']:
                        model['model_id'] = ms_src['model_id']
            else:
                # Fallback for old format
                model_id = model_spec.get('model_id', model.get('model_id', ''))
                model['model_id'] = model_id
                model['model_hubs'].append({
                    'name': MODEL_HUB_HUGGING_FACE,
                    'url': f"https://huggingface.co/{model_id}"
                })
        else:
            # Fallback for very old format
            if 'model_id' in model:
                model['model_hubs'].append({
                    'name': MODEL_HUB_HUGGING_FACE,
                    'url': f"https://huggingface.co/{model['model_id']}"
                })
        rendered = env.get_template('embedding.rst.jinja').render(model)
        output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst")
        _write_text_file(output_file_path, rendered)
        print(output_file_path)

    rendered_index = env.get_template('embedding_index.rst.jinja').render(models=sorted_models)
    _write_text_file(os.path.join(output_dir, "index.rst"), rendered_index)

    # --------------------------------------------------------------- Rerank
    models = _read_json_file('../../xinference/model/rerank/model_spec.json')
    sorted_models = sorted(models, key=lambda x: x['model_name'].lower())
    output_dir = './models/builtin/rerank'
    os.makedirs(output_dir, exist_ok=True)
    for model in sorted_models:
        # Initialize model_hubs list
        model['model_hubs'] = []
        # Process model specs for new model_src structure
        model_spec = model['model_specs'][0]  # Use first spec for model hubs
        if 'model_src' in model_spec:
            if 'huggingface' in model_spec['model_src']:
                hf_src = model_spec['model_src']['huggingface']
                model['model_hubs'].append({
                    'name': MODEL_HUB_HUGGING_FACE,
                    'url': f"https://huggingface.co/{hf_src['model_id']}"
                })
                # Set model_id for template compatibility (prefer huggingface)
                model['model_id'] = hf_src['model_id']
            if 'modelscope' in model_spec['model_src']:
                ms_src = model_spec['model_src']['modelscope']
                model['model_hubs'].append({
                    'name': MODEL_HUB_MODELSCOPE,
                    'url': f"https://modelscope.cn/models/{ms_src['model_id']}"
                })
                # Only set modelscope model_id if no huggingface exists
                if 'huggingface' not in model_spec['model_src']:
                    model['model_id'] = ms_src['model_id']
        rendered = env.get_template('rerank.rst.jinja').render(model)
        output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst")
        _write_text_file(output_file_path, rendered)

    rendered_index = env.get_template('rerank_index.rst.jinja').render(models=sorted_models)
    _write_text_file(os.path.join(output_dir, "index.rst"), rendered_index)

    # ---------------------------------------------------------------- Image
    models = _read_json_file('../../xinference/model/image/model_spec.json')
    sorted_models = sorted(models, key=lambda x: x['model_name'].lower())
    output_dir = './models/builtin/image'
    os.makedirs(output_dir, exist_ok=True)
    for model in sorted_models:
        # Process model_src for template compatibility
        model_src = _extract_primary_model_src(model)
        if model_src:
            if 'huggingface' in model_src:
                hf_src = model_src['huggingface']
                model['model_id'] = hf_src['model_id']
                # Handle GGUF related fields
                if 'gguf_model_id' in hf_src:
                    model['gguf_model_id'] = hf_src['gguf_model_id']
                if 'gguf_quantizations' in hf_src:
                    model['gguf_quantizations'] = ", ".join(hf_src['gguf_quantizations'])
                # Handle Lightning related fields
                if 'lightning_model_id' in hf_src:
                    model['lightning_model_id'] = hf_src['lightning_model_id']
                if 'lightning_versions' in hf_src:
                    model['lightning_versions'] = ", ".join(hf_src['lightning_versions'])
            elif 'modelscope' in model_src:
                model['model_id'] = model_src['modelscope']['model_id']

        available_controlnet = [cn["model_name"] for cn in model.get("controlnet", [])]
        if not available_controlnet:
            available_controlnet = None
        model["available_controlnet"] = available_controlnet
        model["model_ability"] = ', '.join(model.get("model_ability"))
        # Ensure gguf_quantizations is a display string. A value already
        # joined above is left alone; an old-format top-level list is joined
        # here, and a missing value becomes the empty string.
        gguf_quantizations = model.get("gguf_quantizations") or []
        if not isinstance(gguf_quantizations, str):
            model["gguf_quantizations"] = ", ".join(gguf_quantizations)

        rendered = env.get_template('image.rst.jinja').render(model)
        output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst")
        _write_text_file(output_file_path, rendered)

    rendered_index = env.get_template('image_index.rst.jinja').render(models=sorted_models)
    _write_text_file(os.path.join(output_dir, "index.rst"), rendered_index)

    # ---------------------------------------------------------------- Audio
    models = _read_json_file('../../xinference/model/audio/model_spec.json')
    sorted_models = sorted(models, key=lambda x: x['model_name'].lower())
    output_dir = './models/builtin/audio'
    os.makedirs(output_dir, exist_ok=True)
    for model in sorted_models:
        # Process model_src for template compatibility
        model_src = _extract_primary_model_src(model)
        if model_src:
            if 'huggingface' in model_src:
                model['model_id'] = model_src['huggingface']['model_id']
            elif 'modelscope' in model_src:
                model['model_id'] = model_src['modelscope']['model_id']
        rendered = env.get_template('audio.rst.jinja').render(model)
        output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst")
        _write_text_file(output_file_path, rendered)

    rendered_index = env.get_template('audio_index.rst.jinja').render(models=sorted_models)
    _write_text_file(os.path.join(output_dir, "index.rst"), rendered_index)

    # ---------------------------------------------------------------- Video
    models = _read_json_file('../../xinference/model/video/model_spec.json')
    sorted_models = sorted(models, key=lambda x: x['model_name'].lower())
    output_dir = './models/builtin/video'
    os.makedirs(output_dir, exist_ok=True)
    for model in sorted_models:
        # Process model_src for template compatibility
        model_src = _extract_primary_model_src(model)
        if model_src:
            if 'huggingface' in model_src:
                model['model_id'] = model_src['huggingface']['model_id']
            elif 'modelscope' in model_src:
                model['model_id'] = model_src['modelscope']['model_id']
        model["model_ability"] = ', '.join(model.get("model_ability"))
        rendered = env.get_template('video.rst.jinja').render(model)
        output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst")
        _write_text_file(output_file_path, rendered)

    rendered_index = env.get_template('video_index.rst.jinja').render(models=sorted_models)
    _write_text_file(os.path.join(output_dir, "index.rst"), rendered_index)

    # ------------------------------------------- vLLM supported-model lists
    if VLLM_INSTALLED:
        architecture_to_models = build_architecture_to_models(llm_sorted_models)
        # De-duplicate while keeping first-seen order.
        supported_architectures = list(dict.fromkeys(VLLM_SUPPORTED_MODELS + VLLM_SUPPORTED_CHAT_MODELS))
        groups = []
        for architecture in supported_architectures:
            if architecture in architecture_to_models:
                groups.append(sorted(set(architecture_to_models[architecture]), key=str.lower))
            else:
                # No built-in model declares it: list the architecture itself.
                groups.append([architecture])
        vllm_model_str = '\n'.join(
            '- %s' % ', '.join("``%s``" % m for m in group) for group in groups
        )

        start_label = '.. vllm_start'
        end_label = '.. vllm_end'
        for fn in ['getting_started/installation.rst', 'user_guide/backends.rst']:
            with open(fn, encoding='utf-8') as f:
                content = f.read()
            new_content = _splice_between_markers(content, start_label, end_label, vllm_model_str)
            if new_content is None:
                # Slicing with a failed find() would mangle the page; skip it.
                print(f"Warning: vLLM markers not found in {fn}, skip updating the supported model list.")
                continue
            _write_text_file(fn, new_content)

    # -------------------------------------------------------------- Metrics
    try:
        output_dir = './user_guide'
        os.makedirs(output_dir, exist_ok=True)
        supervisor_metrics = get_metrics_from_url("http://127.0.0.1:9997/metrics")
        worker_metrics = get_metrics_from_url("http://127.0.0.1:9977/metrics")
        all_metrics = {"supervisor_metrics": supervisor_metrics, "worker_metrics": worker_metrics}
        rendered = env.get_template('metrics.jinja').render(all_metrics)
        output_file_path = os.path.join(output_dir, "metrics.rst")
        _write_text_file(output_file_path, rendered)
    except Exception:
        # Best effort: the metrics page needs a locally running server.
        print("Skip generate metrics doc, please start a local xinference server by: `xinference-local -mp 9977`.")
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
================================================
FILE: doc/source/getting_started/environments.rst
================================================
.. _environments:
======================
Environment Variables
======================
XINFERENCE_ENDPOINT
~~~~~~~~~~~~~~~~~~~~
Endpoint of Xinference, used to connect to Xinference service.
The default value is http://127.0.0.1:9997. You can find the actual endpoint in the logs.
XINFERENCE_MODEL_SRC
~~~~~~~~~~~~~~~~~~~~~
Model hub used for downloading models. The default is "huggingface"; you
can set it to "modelscope" to use ModelScope as the download source.
.. _environments_xinference_home:
XINFERENCE_HOME
~~~~~~~~~~~~~~~~
By default, Xinference uses ``<HOME>/.xinference`` as the home path to store
necessary files such as logs and models, where ``<HOME>`` is the home
path of the current user. You can change this directory by configuring this environment
variable.
XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The maximum number of failed health checks tolerated at Xinference startup.
Default value is 5.
XINFERENCE_HEALTH_CHECK_INTERVAL
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Health check interval (seconds) at Xinference startup.
Default value is 5.
XINFERENCE_HEALTH_CHECK_TIMEOUT
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Health check timeout (seconds) at Xinference startup.
Default value is 10.
XINFERENCE_DISABLE_HEALTH_CHECK
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Xinference will automatically report health check at Xinference startup.
Setting this environment variable to 1 disables the health check.
XINFERENCE_DISABLE_METRICS
~~~~~~~~~~~~~~~~~~~~~~~~~~
Xinference will by default enable the metrics exporter on the supervisor and worker.
Setting this environment variable to 1 will disable the /metrics endpoint on the supervisor
and the HTTP service (which only provides the /metrics endpoint) on the worker.
XINFERENCE_DOWNLOAD_MAX_ATTEMPTS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Maximum download retry attempts for model files.
Default value is 3.
XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Enable continuous batching for text-to-image models by specifying the target image size
(e.g., ``1024*1024``). Default is unset.
XINFERENCE_SSE_PING_ATTEMPTS_SECONDS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Server-Sent Events keepalive ping interval (seconds).
Default value is 600.
XINFERENCE_MAX_TOKENS
~~~~~~~~~~~~~~~~~~~~~
Global max tokens limit override for requests. Default is unset.
XINFERENCE_ALLOWED_IPS
~~~~~~~~~~~~~~~~~~~~~~
Restrict access to specified IPs or CIDR blocks. Default is unset (no restriction).
XINFERENCE_BATCH_SIZE
~~~~~~~~~~~~~~~~~~~~~
Default batch size used by the server when batching is enabled.
Default value is 32.
XINFERENCE_BATCH_INTERVAL
~~~~~~~~~~~~~~~~~~~~~~~~~
Default batching interval (seconds).
Default value is 0.003.
XINFERENCE_ALLOW_MULTI_REPLICA_PER_GPU
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Whether to allow multiple replicas on a single GPU.
Default value is 1 (enabled).
XINFERENCE_LAUNCH_STRATEGY
~~~~~~~~~~~~~~~~~~~~~~~~~~
GPU allocation strategy for replicas. Default is ``IDLE_FIRST_LAUNCH_STRATEGY``.
XINFERENCE_ENABLE_VIRTUAL_ENV
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Enable model virtual environments globally.
Default value is 1 (enabled, starting from v2.0).
XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Skip packages already present in system site-packages when creating virtual environments.
Default value is 1.
XINFERENCE_CSG_TOKEN
~~~~~~~~~~~~~~~~~~~~
Authentication token for CSGHub model source.
Default is unset.
XINFERENCE_CSG_ENDPOINT
~~~~~~~~~~~~~~~~~~~~~~~
CSGHub endpoint for model source.
Default value is ``https://hub-stg.opencsg.com/``.
================================================
FILE: doc/source/getting_started/index.rst
================================================
.. _getting_started_index:
===============
Getting Started
===============
.. toctree::
:maxdepth: 2
installation
using_xinference
logging
using_docker_image
using_kubernetes
troubleshooting
environments
release_notes
================================================
FILE: doc/source/getting_started/installation.rst
================================================
.. _installation:
============
Installation
============
Xinference can be installed with ``pip`` on Linux, Windows, and macOS. To run models using Xinference, you will need to install the backend corresponding to the type of model you intend to serve.
If you aim to serve all supported models, you can install all the necessary dependencies with a single command::
pip install "xinference[all]"
.. versionchanged:: v1.8.1
Due to irreconcilable package dependency conflicts between vLLM and sglang, we have removed sglang from the all extra. If you want to use sglang, please install it separately via ``pip install 'xinference[sglang]'``.
Several usage scenarios require special attention.
.. admonition:: **GGUF format** with **llama.cpp engine**
In this situation, it's advised to install its dependencies manually based on your hardware specifications to enable acceleration. For more details, see the :ref:`installation_gguf` section.
.. admonition:: **AWQ or GPTQ** format with **transformers engine**
**This section is added in v1.6.0.**
The dependencies for these formats require special installation options and are difficult to install automatically. Please run the command below in advance:
.. code-block:: bash
pip install "xinference[transformers_quantization]" --no-build-isolation
Some dependencies like ``transformers`` might be downgraded, you can run ``pip install "xinference[all]"`` afterwards.
If you want to install only the necessary backends, here's a breakdown of how to do it.
.. _inference_backend:
Transformers Backend
~~~~~~~~~~~~~~~~~~~~
PyTorch (transformers) supports the inference of most state-of-the-art models. It is the default backend for models in PyTorch format::
pip install "xinference[transformers]"
Notes:
- The transformers engine supports ``pytorch`` / ``gptq`` / ``awq`` / ``bnb`` / ``fp4`` formats.
- FP4 format requires ``transformers`` with ``FPQuantConfig`` support. If you see an import error,
please upgrade ``transformers`` to a newer version.
vLLM Backend
~~~~~~~~~~~~
vLLM is a fast and easy-to-use library for LLM inference and serving. Xinference will choose vLLM as the backend to achieve better throughput when the following conditions are met:
- The model format is ``pytorch``, ``gptq``, ``awq``, ``fp4``, ``fp8`` or ``bnb``.
- When the model format is ``pytorch``, the quantization is ``none``.
- When the model format is ``awq``, the quantization is ``Int4``.
- When the model format is ``gptq``, the quantization is ``Int3``, ``Int4`` or ``Int8``.
- The system is Linux and has at least one CUDA device
- The model family (for custom models) / model name (for builtin models) is within the list of models supported by vLLM
Currently, supported models include:
.. vllm_start
- ``code-llama``, ``code-llama-instruct``, ``code-llama-python``, ``deepseek``, ``deepseek-chat``, ``deepseek-coder``, ``deepseek-coder-instruct``, ``deepseek-r1-distill-llama``, ``gorilla-openfunctions-v2``, ``HuatuoGPT-o1-LLaMA-3.1``, ``llama-2``, ``llama-2-chat``, ``llama-3``, ``llama-3-instruct``, ``llama-3.1``, ``llama-3.1-instruct``, ``llama-3.3-instruct``, ``tiny-llama``, ``wizardcoder-python-v1.0``, ``wizardmath-v1.0``, ``Yi``, ``Yi-1.5``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k``, ``Yi-200k``, ``Yi-chat``
- ``codestral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``, ``mistral-instruct-v0.3``, ``mistral-large-instruct``, ``mistral-nemo-instruct``, ``mistral-v0.1``, ``openhermes-2.5``, ``seallm_v2``
- ``Baichuan-M2``, ``codeqwen1.5``, ``codeqwen1.5-chat``, ``deepseek-r1-distill-qwen``, ``DianJin-R1``, ``fin-r1``, ``HuatuoGPT-o1-Qwen2.5``, ``KAT-V1``, ``marco-o1``, ``qwen1.5-chat``, ``qwen2-instruct``, ``qwen2.5``, ``qwen2.5-coder``, ``qwen2.5-coder-instruct``, ``qwen2.5-instruct``, ``qwen2.5-instruct-1m``, ``qwenLong-l1``, ``QwQ-32B``, ``QwQ-32B-Preview``, ``seallms-v3``, ``skywork-or1``, ``skywork-or1-preview``, ``XiYanSQL-QwenCoder-2504``
- ``llama-3.2-vision``, ``llama-3.2-vision-instruct``
- ``baichuan-2``, ``baichuan-2-chat``
- ``InternLM2ForCausalLM``
- ``qwen-chat``
- ``mixtral-8x22B-instruct-v0.1``, ``mixtral-instruct-v0.1``, ``mixtral-v0.1``
- ``cogagent``
- ``glm-edge-chat``, ``glm4-chat``, ``glm4-chat-1m``
- ``codegeex4``, ``glm-4v``
- ``seallm_v2.5``
- ``orion-chat``
- ``qwen1.5-moe-chat``, ``qwen2-moe-instruct``
- ``CohereForCausalLM``
- ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5``, ``deepseek-vl2``
- ``deepseek-prover-v2``, ``deepseek-r1``, ``deepseek-r1-0528``, ``deepseek-v3``, ``deepseek-v3-0324``, ``Deepseek-V3.1``, ``moonlight-16b-a3b-instruct``
- ``deepseek-r1-0528-qwen3``, ``qwen3``
- ``minicpm3-4b``
- ``internlm3-instruct``
- ``gemma-3-1b-it``
- ``glm4-0414``
- ``minicpm-2b-dpo-bf16``, ``minicpm-2b-dpo-fp16``, ``minicpm-2b-dpo-fp32``, ``minicpm-2b-sft-bf16``, ``minicpm-2b-sft-fp32``, ``minicpm4``
- ``Ernie4.5``
- ``Qwen3-Coder``, ``Qwen3-Instruct``, ``Qwen3-Thinking``
- ``glm-4.5``, ``GLM-4.6``, ``GLM-4.7``
- ``gpt-oss``
- ``seed-oss``
- ``Qwen3-Next-Instruct``, ``Qwen3-Next-Thinking``
- ``DeepSeek-V3.2``, ``DeepSeek-V3.2-Exp``
- ``MiniMax-M2``, ``MiniMax-M2.5``
- ``glm-5``
.. vllm_end
To install Xinference and vLLM::
pip install "xinference[vllm]"
# FlashInfer is optional but required for specific functionalities such as sliding window attention with Gemma 2.
# For CUDA 12.4 & torch 2.4 to support sliding window attention for gemma 2 and llama 3.1 style rope
pip install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4
# For other CUDA & torch versions, please check https://docs.flashinfer.ai/installation.html
.. _installation_gguf:
Llama.cpp Backend
~~~~~~~~~~~~~~~~~
Xinference supports models in ``gguf`` format via ``xllamacpp``.
`xllamacpp <https://github.com/xorbitsai/xllamacpp>`_ is developed by the Xinference team,
and is the sole backend for llama.cpp since v1.6.0.
.. warning::
Since Xinference v1.5.0, ``llama-cpp-python`` is deprecated.
Since Xinference v1.6.0, ``llama-cpp-python`` has been removed.
Initial setup::
pip install "xinference[llama_cpp]"
For more installation instructions for ``xllamacpp`` to enable GPU acceleration, please refer to: https://github.com/xorbitsai/xllamacpp
SGLang Backend
~~~~~~~~~~~~~~
SGLang has a high-performance inference runtime with RadixAttention. It significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. And it also supports other common techniques like continuous batching and tensor parallelism.
Initial setup::
pip install "xinference[sglang]"
MLX Backend
~~~~~~~~~~~
MLX-lm is designed for Apple silicon users to run LLM efficiently.
Initial setup::
pip install "xinference[mlx]"
Other Platforms
~~~~~~~~~~~~~~~
* :ref:`Ascend NPU <installation_npu>`
================================================
FILE: doc/source/getting_started/installation_npu.rst
================================================
.. _installation_npu:
=================================
Installation Guide for Ascend NPU
=================================
Xinference can run on Ascend NPU. Follow the instructions below to install it.
.. warning::
The open-source version relies on Transformers for inference,
which can be slow on chips like 310p3. We provide an enterprise version that supports the MindIE engine,
offering better performance and compatibility for Ascend NPU.
Refer to `Xinference Enterprise `_
Installing PyTorch and Ascend extension for PyTorch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Install PyTorch CPU version and corresponding Ascend extension.
Take PyTorch v2.1.0 as example.
.. code-block:: bash
pip3 install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cpu
Then install `Ascend extension for PyTorch `_.
.. code-block:: bash
pip3 install 'numpy<2.0'
pip3 install decorator
pip3 install torch-npu==2.1.0.post3
Run the command below to check that it correctly prints the Ascend NPU count.
.. code-block:: bash
python -c "import torch; import torch_npu; print(torch.npu.device_count())"
Installing Xinference
~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
pip3 install xinference
Now you can use Xinference according to :ref:`doc <using_xinference>`.
The ``Transformers`` backend is the only engine available for Ascend NPU in the open-source version.
Enterprise Support
~~~~~~~~~~~~~~~~~~
If you encounter any performance or other issues for Ascend NPU, please reach out to us
via `link `_.
================================================
FILE: doc/source/getting_started/logging.rst
================================================
.. _logging:
=====================
Logging in Xinference
=====================
Configure Log Level
###################
You can configure the log level with the ``--log-level`` option.
For example, starting a local cluster with ``DEBUG`` log level:
.. code-block:: bash
xinference-local --log-level debug
Log Files
#########
Xinference supports log rotation of log files.
By default, logs rotate when they reach 100MB (maxBytes), and up to 30 backup files (backupCount) are kept.
Note that the log level configured above takes effect in both the command line logs and the log files.
Log Directory Structure
***********************
All the logs are stored in the ``<XINFERENCE_HOME>/logs`` directory, where ``<XINFERENCE_HOME>`` can be configured as mentioned in :ref:`using_xinference`.
Xinference creates a subdirectory under the log directory ``<XINFERENCE_HOME>/logs``.
The name of the subdirectory corresponds to the Xinference cluster startup time in milliseconds.
Local deployment
================
In a local deployment, the logs of Xinference supervisor and Xinference workers are combined into a single file. An example of the log directory structure is shown below::
<XINFERENCE_HOME>/logs
└── local_1699503558105
└── xinference.log
where ``1699503558105`` is the timestamp when the Xinference cluster was created.
Therefore, when you create a cluster locally multiple times, you can look for the corresponding logs based on this timestamp.
Distributed deployment
======================
In a distributed deployment, Xinference supervisor and Xinference workers each create their own subdirectory under the log directory.
The name of the subdirectory starts with the role name, followed by the role startup time in milliseconds.
An example of the log directory structure is shown below::
<XINFERENCE_HOME>/logs
└── supervisor_1699503558908
└── xinference.log
worker_1699503559105
└── xinference.log
================================================
FILE: doc/source/getting_started/release_notes.rst
================================================
.. _release_ntoes:
Release Notes
=============
This page provides a version-by-version index of Xinference release notes.
For detailed updates, please visit the corresponding links below.
+-----------------+--------------------------------------------------------------------------------+
| Version | Release Notes |
+=================+================================================================================+
| v2.3.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v2.2.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v2.1.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v2.0.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v1.17.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v1.16.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v1.15.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v1.14.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v1.13.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v1.12.0 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v1.11.0.post1 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
| v1.10.1 | `View release notes `_ |
+-----------------+--------------------------------------------------------------------------------+
----
For older versions and source history, see our GitHub releases page:
https://github.com/xorbitsai/inference/releases
================================================
FILE: doc/source/getting_started/troubleshooting.rst
================================================
.. _troubleshooting:
===============
Troubleshooting
===============
No huggingface repo access
==========================
Sometimes, you may face errors accessing huggingface models, such as the following message when accessing `llama2`:
.. code-block:: text
Cannot access gated repo for url https://huggingface.co/api/models/meta-llama/Llama-2-7b-hf.
Repo model meta-llama/Llama-2-7b-hf is gated. You must be authenticated to access it.
This typically indicates either a lack of access rights to the repository or missing huggingface access tokens.
The following sections provide guidance on addressing these issues.
Get access to the huggingface repo
----------------------------------
To obtain access, navigate to the desired huggingface repository and agree to its terms and conditions.
As an illustration, for the `llama2` model, you can use this link:
`https://huggingface.co/meta-llama/Llama-2-7b-hf <https://huggingface.co/meta-llama/Llama-2-7b-hf>`_.
Set up credentials to access huggingface
----------------------------------------
Your credential to access huggingface can be found online at `https://huggingface.co/settings/tokens <https://huggingface.co/settings/tokens>`_.
You can set the token as an environment variable, with ``export HUGGING_FACE_HUB_TOKEN=your_token_here``.
Incompatibility Between NVIDIA Driver and PyTorch Version
=========================================================
If you are using an NVIDIA GPU, you may face the following error:
.. code-block:: text
UserWarning: CUDA initialization: The NVIDIA driver on your system is too old
(found version 10010). Please update your GPU driver by downloading and installi
ng a new version from the URL: http://www.nvidia.com/Download/index.aspx Alterna
tively, go to: https://pytorch.org to install a PyTorch version that has been co
mpiled with your version of the CUDA driver. (Triggered internally at ..\c10\cu
da\CUDAFunctions.cpp:112.)
This typically indicates that your CUDA driver version is not compatible with the PyTorch version you are using.
Go to `https://pytorch.org <https://pytorch.org>`_ to install a PyTorch version that has been compiled with your
version of the CUDA driver. **Do not install a CUDA version lower than 11.8; a version between 11.8 and 12.1 is preferred.**
For example, if your CUDA driver version is 11.8, you can install PyTorch with the following command:
.. code-block:: python
pip install torch==2.0.1+cu118
Xinference service cannot be accessed from external systems through ``<IP>:9997``
=================================================================================
Use the ``-H 0.0.0.0`` parameter when starting Xinference:
.. code:: bash
xinference-local -H 0.0.0.0
Then Xinference service will listen on all network interfaces (not limited to ``127.0.0.1`` or ``localhost``).
If you are using the :ref:`using_docker_image`, please add ``-p <PORT>:9997``
to the docker run command; the service is then accessible through ``<IP>:<PORT>`` of
the local machine.
Launching a built-in model takes a long time, and sometimes the model fails to download
=======================================================================================
Xinference by default uses HuggingFace as the source for models. If your
machines are in Mainland China, there might be accessibility issues when
using built-in models.
To address this, add environment variable ``XINFERENCE_MODEL_SRC=modelscope`` when starting
the Xinference to change the model source to ModelScope, which is optimized
for Mainland China.
If you’re starting Xinference with Docker, include ``-e XINFERENCE_MODEL_SRC=modelscope``
during the docker run command.
When using the official Docker image, RayWorkerVllm died due to OOM, causing the model to fail to load
=======================================================================================================
Docker's ``--shm-size`` parameter is used to set the size of shared memory.
The default size of shared memory (/dev/shm) is 64MB, which may be too small for vLLM backend.
You can increase its size by setting the ``--shm-size`` parameter as follows:
.. code:: bash
docker run --shm-size=128g ...
Missing ``model_engine`` parameter when launching LLM models
============================================================
Since version ``v0.11.0``, launching LLM models requires an additional ``model_engine`` parameter.
For specific information, please refer to :ref:`here <about_model_engine>`.
Resolving MKL Threading Layer Conflicts
========================================
When starting the Xinference server, you may encounter the error: ``ValueError: Model architectures ['Qwen2ForCausalLM'] failed to be inspected. Please check the logs for more details.``
The underlying cause shown in the logs is:
.. code-block:: text
Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library.
Try to import numpy first or set the threading layer accordingly. Set MKL_SERVICE_FORCE_INTEL to force it.
This typically occurs when NumPy was installed via conda. Conda's NumPy is built with Intel MKL optimizations, which conflicts with the GNU OpenMP library (libgomp) already loaded in your environment.
Solution 1: Override the Threading Layer
-----------------------------------------
Force Intel's Math Kernel Library to use GNU's OpenMP implementation:
.. code-block:: bash
MKL_THREADING_LAYER=GNU xinference-local
Solution 2: Reinstall NumPy with pip
-------------------------------------
Uninstall conda's NumPy and reinstall using pip:
.. code-block:: bash
pip uninstall -y numpy && pip install numpy
#Or just --force-reinstall
pip install --force-reinstall numpy
Related Note: vLLM and PyTorch
-------------------------------
If you're using vLLM, avoid installing PyTorch with conda. Refer to the official vLLM installation guide for GPU-specific instructions: https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html
Configuring PyPI Mirrors to Speed Up Package Installation
==========================================================
If you're in Mainland China, using a PyPI mirror can significantly speed up package installation. Here are some commonly used mirrors:
- Tsinghua University: ``https://pypi.tuna.tsinghua.edu.cn/simple``
- Alibaba Cloud: ``https://mirrors.aliyun.com/pypi/simple/``
- Tencent Cloud: ``https://mirrors.cloud.tencent.com/pypi/simple``
However, be aware that some packages may not be available on certain mirrors. For example, if you're installing ``xinference[audio]`` using only the Aliyun mirror, the installation may fail.
This happens because ``num2words``, a dependency used by ``MeloTTS``, is not available on the Aliyun mirror. As a result, ``pip install xinference[audio]`` will resolve to older versions like ``xinference==1.2.0`` and ``xoscar==0.8.0`` (as of Oct 27, 2025).
These older versions are incompatible and will produce the error: ``MainActorPool.append_sub_pool() got an unexpected keyword argument 'start_method'``
.. code-block:: bash
curl -s https://mirrors.aliyun.com/pypi/simple/num2words/ | grep -i "num2words"
# Returns NOTHING! But it works on Tsinghua or Tencent mirrors.
# uv pip install "xinference[audio]" will then install the following packages (as of Oct 27, 2025):
+ x-transformers==2.10.2
+ xinference==1.2.0
+ xoscar==0.8.0
To avoid this issue when installing the xinference audio package, use multiple mirrors:
.. code-block:: bash
uv pip install xinference[audio] --index-url https://mirrors.aliyun.com/pypi/simple --extra-index-url https://pypi.tuna.tsinghua.edu.cn/simple
# Optional: Set this globally in your uv config
mkdir -p ~/.config/uv
cat >> ~/.config/uv/uv.toml << EOF
index-url = "https://mirrors.aliyun.com/pypi/simple"
extra-index-url = ["https://pypi.tuna.tsinghua.edu.cn/simple"]
EOF
Installing Xinference 1.12.0 with uv Fails (As of November 2025)
=================================================================
**Note:** This is a temporary issue due to the current package ecosystem and uv prioritizing **higher versions for direct dependencies** over **indirect dependencies**.
Symptom
-------
When installing xinference 1.12.0 as of November 2025 using ``uv pip install xinference``, you may encounter an issue where very old package versions are installed, particularly:
- ``transformers==4.12.2`` (from 2021)
- ``tokenizers==0.10.3`` (from 2021)
- ``huggingface-hub==1.0.1``
Then uv fails with "Failed to build `tokenizers==0.10.3`"
Root Cause
----------
This occurs because uv prioritizes **higher versions for direct dependencies** over **indirect dependencies**:
1. xinference 1.12.0 specifies ``huggingface-hub>=0.19.4`` as a **direct dependency** (no upper bound)
2. uv selects the latest: ``huggingface-hub==1.0.1`` as of November 06 2025
3. However, ``transformers<=4.57.3`` (an **indirect dependency** via ``peft``) requires ``huggingface-hub<1.0``
4. To resolve the conflict, uv keeps the direct dependency at 1.0.1 and downgrades the indirect dependency ``transformers`` to ancient version 4.12.2
**This is by design in uv**: it prioritizes what you explicitly ask for (direct dependencies) over transitive dependencies. Refer to https://github.com/astral-sh/uv/issues/16601
**Update:** The latest transformers 4.57.3 (as of 2026-01-05) still requires ``huggingface-hub<1.0``.
Solutions
---------
**Solution 1: Pre-constrain huggingface-hub (Recommended)**
Explicitly constrain ``huggingface-hub`` to a compatible version range:
.. code-block:: bash
uv pip install "huggingface-hub>=0.34.0,<1.0" xinference
This forces uv to select a ``huggingface-hub`` version that's compatible with modern ``transformers``.
**Solution 2: Make transformers a direct dependency**
By specifying ``transformers`` explicitly, it becomes a direct dependency and uv will prefer higher versions:
.. code-block:: bash
uv pip install transformers xinference
**Solution 3: Use pip**
Or just resort to using ``pip install xinference`` which will resolve to the following versions
- ``transformers==4.57.1``
- ``huggingface-hub==0.36.0``
- ``tokenizers==0.22.1``
vLLM + Torch + Xinference Compatibility Issue (Segmentation Fault)
===================================================================
Symptom
-------
If you have **vLLM < 0.12.0** installed and upgrade xinference (particularly using ``uv pip install -U xinference``), xinference may fail to start with a segmentation fault:
.. code-block:: text
root@server:/home# xinference-local --host 0.0.0.0 --port 9997
INFO 12-30 17:35:37 [__init__.py:216] Automatically detected platform cuda.
Aborted (core dumped)
Root Cause
----------
This issue has three contributing factors:
1. **Binary Incompatibility**: vLLM versions before 0.12.0 were compiled against PyTorch 2.8.0. These versions are incompatible with PyTorch 2.9. Reference: `vLLM v0.12.0 Release Notes <https://github.com/vllm-project/vllm/releases/tag/v0.12.0>`_
2. **Xinference's Unbounded Torch Dependency**: Xinference's ``setup.cfg`` does not specify an upper bound for PyTorch:
.. code-block:: ini
[options]
install_requires =
torch # No version constraint!
This allows package managers to upgrade PyTorch to incompatible versions.
3. **Different Package Manager Behaviors**:
- **pip**: Conservative - only upgrades the specified package unless dependencies are incompatible
- **uv with -U flag**: Aggressive - re-resolves ALL dependencies and picks latest versions
Therefore, if you are not yet ready to upgrade your entire stack and only want to upgrade xinference, use either:
- ``pip install -U xinference`` (keeps PyTorch unchanged, only upgrades xinference)
- ``uv pip install "xinference==1.16.0"`` (without -U flag, only upgrades xinference too)
================================================
FILE: doc/source/getting_started/using_docker_image.rst
================================================
.. _using_docker_image:
=======================
Xinference Docker Image
=======================
Xinference provides official images for use on Dockerhub.
.. versionchanged:: v2.0
Starting from **Xinference v2.0**, to use the CUDA version of the image, the minimum CUDA version must be **CUDA 12.9**.
Prerequisites
=============
* The image can only run in an environment with GPUs and CUDA installed, because Xinference in the image relies on Nvidia GPUs for acceleration.
* CUDA must be successfully installed on the host machine. This can be determined by whether you can successfully execute the ``nvidia-smi`` command.
* For CUDA version >= 12.9, CUDA version in the docker image is ``12.9``, and the CUDA version on the host machine should be ``12.9`` or above, and the NVIDIA driver version should be ``575`` or above.
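* Ensure that the `NVIDIA Container Toolkit <https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html>`_ is installed.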
Docker Image
============
The official image of Xinference is available on DockerHub in the repository ``xprobe/xinference``.
Available tags include:
* ``nightly-main``: This image is built daily from the `GitHub main branch <https://github.com/xorbitsai/inference>`_ and generally does not guarantee stability.
* ``v<release version>``: This image is built each time a Xinference release version is published, and it is typically more stable.
* ``latest``: This image is built with the latest Xinference release version.
* For CPU version, add ``-cpu`` suffix, e.g. ``nightly-main-cpu``.
Dockerfile for custom build
===========================
If you need to build the Xinference image according to your own requirements, the source code for the Dockerfile is located at `xinference/deploy/docker/Dockerfile <https://github.com/xorbitsai/inference/tree/main/xinference/deploy/docker/Dockerfile>`_ for reference.
Please make sure to be in the top-level directory of Xinference when using this Dockerfile. For example:
.. code-block:: bash
git clone https://github.com/xorbitsai/inference.git
cd inference
docker build --progress=plain -t test -f xinference/deploy/docker/Dockerfile .
Image usage
===========
You can start Xinference in the container like this, simultaneously mapping port 9997 in the container to port 9998 on the host, enabling debug logging, and downloading models from modelscope.
.. code-block:: bash
docker run -e XINFERENCE_MODEL_SRC=modelscope -p 9998:9997 --gpus all xprobe/xinference:v<your_version> xinference-local -H 0.0.0.0 --log-level debug
.. warning::
* The option ``--gpus`` is essential and cannot be omitted, because as mentioned earlier, the image requires the host machine to have a GPU. Otherwise, errors will occur.
* The ``-H 0.0.0.0`` parameter after the ``xinference-local`` command cannot be omitted. Otherwise, the host machine may not be able to access the port inside the container.
* You can add multiple ``-e`` options to introduce multiple environment variables.
Certainly, if you prefer, you can also manually enter the docker container and start Xinference in any desired way.
.. note::
For multiple GPUs, make sure to set the shared memory size, for example: ``docker run --shm-size=128g ...``
Mount your volume for loading and saving models
===============================================
The image does not contain any model files by default, and it downloads the models into the container.
Typically, you would need to mount a directory on the host machine to the docker container, so that Xinference can download the models onto it, allowing for reuse.
In this case, you need to specify a volume when running the Docker image and configure environment variables for Xinference:
.. code-block:: bash
docker run -v </on/your/host>:</on/the/container> -e XINFERENCE_HOME=</on/the/container> -p 9998:9997 --gpus all xprobe/xinference:v<your_version> xinference-local -H 0.0.0.0
The principle behind the above command is to mount the specified directory from the host machine into the container, and then set the ``XINFERENCE_HOME`` environment variable to point to that directory inside the container.
This way, all downloaded model files will be stored in the directory you specified on the host machine.
You don't have to worry about losing them when the Docker container stops, and the next time you run it, you can directly use the existing models without the need for repetitive downloads.
If you downloaded the model using the default path on the host machine, and since the xinference cache directory
stores the model using symbolic links, you need to mount the directory where the original file is located into the container as well.
For example, if you are using HuggingFace and Modelscope as model hub, you would need to mount the corresponding
directories into the container. Generally, the cache directories for HuggingFace and Modelscope are located
at ``<home_path>/.cache/huggingface`` and ``<home_path>/.cache/modelscope``. The command would be like:
.. code-block:: bash
docker run \
-v <home_path>/.xinference:/root/.xinference \
-v <home_path>/.cache/huggingface:/root/.cache/huggingface \
-v <home_path>/.cache/modelscope:/root/.cache/modelscope \
-p 9997:9997 \
--gpus all \
xprobe/xinference:v<your_version> \
xinference-local -H 0.0.0.0
================================================
FILE: doc/source/getting_started/using_kubernetes.rst
================================================
.. _using_kubernetes:
########################
Xinference on Kubernetes
########################
************
Helm Support
************
Xinference provides a method for installation in a Kubernetes cluster via ``Helm`` .
Prerequisites
=============
* You have a fully functional Kubernetes cluster.
* Enable GPU support in Kubernetes, refer to `here <https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/>`_.
* ``Helm`` is correctly installed.
Steps
=====
#. Add xinference helm repo.
.. code-block:: bash
helm repo add xinference https://xorbitsai.github.io/xinference-helm-charts
#. Update xinference helm repo indexes and query versions.
.. code-block:: bash
helm repo update xinference
helm search repo xinference/xinference --devel --versions
#. Install
.. code-block:: bash
helm install xinference xinference/xinference -n xinference --version <helm_charts_version>
Customized Installation
=======================
The installation method mentioned above sets up a Xinference cluster similar to a single-machine setup,
with only one worker and all startup parameters at their default values.
However, this is usually not the desired setup.
Below are some common custom installation configurations.
#. I need to download models from ``ModelScope``.
.. code-block:: bash
helm install xinference xinference/xinference -n xinference --version <helm_charts_version> --set config.model_src="modelscope"
#. I want to use cpu image of xinference (or use any other version of xinference images).
.. code-block:: bash
helm install xinference xinference/xinference -n xinference --version <helm_charts_version> --set config.xinference_image="<xinference_docker_image>"
#. I want to have 4 Xinference workers, with each worker managing 4 GPUs.
.. code-block:: bash
helm install xinference xinference/xinference -n xinference --version <helm_charts_version> --set config.worker_num=4 --set config.gpu_per_worker="4"
The above installation method is based on Helm ``--set`` option.
For more complex custom installations, such as multiple workers with shared storage,
it is highly recommended to use your own ``values.yaml`` file with Helm ``-f`` option for installation.
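The default ``values.yaml`` file is located `here <https://github.com/xorbitsai/xinference-helm-charts/blob/main/charts/xinference/values.yaml>`_.
Some examples can be found `here <https://github.com/xorbitsai/xinference-helm-charts/tree/main/examples>`_.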
******************
KubeBlocks Support
******************
You can also install Xinference in Kubernetes using the third-party ``KubeBlocks``.
This method is not maintained by Xinference and does not guarantee timely updates or availability.
Please refer to the documentation at `here `_.
================================================
FILE: doc/source/getting_started/using_xinference.rst
================================================
.. _using_xinference:
================
Using Xinference
================
Run Xinference Locally
======================
Let's start by running Xinference on a local machine and running a classic LLM model: ``qwen2.5-instruct``.
After this quickstart, you will move on to learning how to deploy Xinference in a cluster environment.
Start Local Server
------------------
First, please ensure that you have installed Xinference according to the instructions provided :ref:`here <installation>`.
To start a local instance of Xinference, run the following command:
.. tabs::
.. tab:: shell
.. code-block:: bash
xinference-local --host 0.0.0.0 --port 9997
.. tab:: output
.. code-block:: bash
INFO Xinference supervisor 0.0.0.0:64570 started
INFO Xinference worker 0.0.0.0:64570 started
INFO Starting Xinference at endpoint: http://0.0.0.0:9997
INFO Uvicorn running on http://0.0.0.0:9997 (Press CTRL+C to quit)
.. note::
By default, Xinference uses ``<HOME>/.xinference`` as home path to store necessary files such as logs and models,
where ``<HOME>`` is the home path of current user.
You can change this directory by configuring the environment variable ``XINFERENCE_HOME``.
For example:
.. code-block:: bash
XINFERENCE_HOME=/tmp/xinference xinference-local --host 0.0.0.0 --port 9997
Congrats! You now have Xinference running on your local machine. Once Xinference is running, there are multiple ways
we can try it: via the web UI, via cURL, via the command line, or via the Xinference's python client.
You can visit the web UI at `http://127.0.0.1:9997/ui <http://127.0.0.1:9997/ui>`_ and visit `http://127.0.0.1:9997/docs <http://127.0.0.1:9997/docs>`_
to inspect the API docs.
You can install the Xinference command line tool and Python client using the following command:
.. code-block:: bash
pip install xinference
The command line tool is ``xinference``. You can list the commands that can be used by running:
.. tabs::
.. tab:: shell
.. code-block:: bash
xinference --help
.. tab:: output
.. code-block:: bash
Usage: xinference [OPTIONS] COMMAND [ARGS]...
Options:
-v, --version Show the version and exit.
--log-level TEXT
-H, --host TEXT
-p, --port INTEGER
--help Show this message and exit.
Commands:
cached
cal-model-mem
chat
engine
generate
launch
list
login
register
registrations
remove-cache
stop-cluster
terminate
unregister
vllm-models
You can install the Xinference Python client with minimal dependencies using the following command.
Please ensure that the version of the client matches the version of the Xinference server.
.. code-block:: bash
pip install xinference-client==${SERVER_VERSION}
.. _about_model_engine:
About Model Engine
------------------
Since ``v0.11.0`` , before launching the LLM model, you need to specify the inference engine you want to run.
Currently, xinference supports the following inference engines:
* ``vllm``
* ``sglang``
* ``llama.cpp``
* ``transformers``
* ``MLX``
For details about these inference engines, please refer to :ref:`here `.
Note that when launching an LLM model, the ``model_format`` and ``quantization`` of the model you want to launch
are closely related to the inference engine.
You can use ``xinference engine`` command to query the combination of parameters of the model you want to launch.
This will demonstrate under what conditions a model can run on which inference engines.
For example:
#. I would like to query about which inference engines the ``qwen-chat`` model can run on, and what are their respective parameters.
.. code-block:: bash
xinference engine -e <xinference_endpoint> --model-name qwen-chat
#. I want to run ``qwen-chat`` with ``VLLM`` as the inference engine, but I don't know how to configure the other parameters.
.. code-block:: bash
xinference engine -e <xinference_endpoint> --model-name qwen-chat --model-engine vllm
#. I want to launch the ``qwen-chat`` model in the ``GGUF`` format, and I need to know how to configure the remaining parameters.
.. code-block:: bash
xinference engine -e <xinference_endpoint> --model-name qwen-chat -f ggufv2
In summary, compared to previous versions, when launching LLM models,
you need to additionally pass the ``model_engine`` parameter.
You can retrieve information about the supported inference engines and their related parameter combinations
through the ``xinference engine`` command.
.. note::
Here are some recommendations on when to use which engine:
- **Linux**
- When possible, prioritize using **vLLM** or **SGLang** for better performance.
- If resources are limited, consider using **llama.cpp**, as it offers more quantization options.
- For other cases, consider using **Transformers**, which supports nearly all models.
- **Windows**
- It is recommended to use **WSL**, and in this case, follow the same choices as Linux.
- Otherwise, prefer **llama.cpp**, and for unsupported models, opt for **Transformers**.
- **Mac**
- If supported by the model, use the **MLX engine**, as it delivers the best performance.
- For other cases, prefer **llama.cpp**, and for unsupported models, choose **Transformers**.
Run qwen2.5-instruct
--------------------
Let's start by running a built-in model: ``qwen2.5-instruct``. When you start a model for the first time, Xinference will
download the model parameters from HuggingFace, which might take a few minutes depending on the size of the model weights.
We cache the model files locally, so there's no need to redownload them for subsequent starts.
.. note::
Xinference also allows you to download models from other sites. You can do this by setting an environment variable
when launching Xinference. For example, if you want to download models from `modelscope <https://modelscope.cn>`_,
do the following:
.. code-block:: bash
XINFERENCE_MODEL_SRC=modelscope xinference-local --host 0.0.0.0 --port 9997
We can specify the model's UID using the ``--model-uid`` or ``-u`` flag. If not specified, Xinference will generate a unique ID.
The default unique ID will be identical to the model name.
.. tabs::
.. code-tab:: bash shell
xinference launch --model-engine <inference_engine> -n qwen2.5-instruct -s 0_5 -f pytorch
.. code-tab:: bash cURL
curl -X 'POST' \
'http://127.0.0.1:9997/v1/models' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model_engine": "<inference_engine>",
"model_name": "qwen2.5-instruct",
"model_format": "pytorch",
"size_in_billions": "0_5"
}'
.. code-tab:: python
from xinference.client import RESTfulClient
client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
model_engine="<inference_engine>",
model_name="qwen2.5-instruct",
model_format="pytorch",
size_in_billions="0_5"
)
print('Model uid: ' + model_uid)
.. code-tab:: bash output
Model uid: qwen2.5-instruct
.. note::
For some engines, such as vllm, users need to specify the engine-related parameters when
running models. In this case, you can directly specify the parameter name and value in the
command line, for example:
.. code-block:: bash
xinference launch --model-engine vllm -n qwen2.5-instruct -s 0_5 -f pytorch --gpu_memory_utilization 0.9
``gpu_memory_utilization=0.9`` will be passed to vllm when launching the model.
.. note::
For more tips on model launching, refer to :ref:`launch`.
Congrats! You now have ``qwen2.5-instruct`` running by Xinference. Once the model is running, we can try it out either via cURL,
or via Xinference's python client:
.. tabs::
.. code-tab:: bash cURL
curl -X 'POST' \
'http://127.0.0.1:9997/v1/chat/completions' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "qwen2.5-instruct",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What is the largest animal?"
}
]
}'
.. code-tab:: python
from xinference.client import RESTfulClient
client = RESTfulClient("http://127.0.0.1:9997")
model = client.get_model("qwen2.5-instruct")
model.chat(
messages=[
{"role": "user", "content": "Who won the world series in 2020?"}
]
)
.. code-tab:: json output
{
"id": "chatcmpl-8d76b65a-bad0-42ef-912d-4a0533d90d61",
"model": "qwen2.5-instruct",
"object": "chat.completion",
"created": 1688919187,
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "The largest animal that has been scientifically measured is the blue whale, which has a maximum length of around 23 meters (75 feet) for adult animals and can weigh up to 150,000 pounds (68,000 kg). However, it is important to note that this is just an estimate and that the largest animal known to science may be larger still. Some scientists believe that the largest animals may not have a clear \"size\" in the same way that humans do, as their size can vary depending on the environment and the stage of their life."
},
"finish_reason": "None"
}
],
"usage": {
"prompt_tokens": -1,
"completion_tokens": -1,
"total_tokens": -1
}
}
Xinference provides OpenAI-compatible APIs for its supported models, so you can use Xinference as a local drop-in replacement for OpenAI APIs. For example:
.. code-block:: python
from openai import OpenAI
client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not used actually")
response = client.chat.completions.create(
model="qwen2.5-instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the largest animal?"}
]
)
print(response)
The following OpenAI APIs are supported:
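- Chat Completions: `https://platform.openai.com/docs/api-reference/chat <https://platform.openai.com/docs/api-reference/chat>`_
- Completions: `https://platform.openai.com/docs/api-reference/completions <https://platform.openai.com/docs/api-reference/completions>`_
- Embeddings: `https://platform.openai.com/docs/api-reference/embeddings <https://platform.openai.com/docs/api-reference/embeddings>`_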
Xinference also supports the Anthropic API via the base URL ``http://127.0.0.1:9997/anthropic``, so you can use Xinference in Claude Code and similar tools.
Refer to :ref:`anthropic client ` for more details.
Manage Models
-------------
In addition to launching models, Xinference offers various ways to manage the entire lifecycle of models.
You can manage models in Xinference through the command line, cURL, or Xinference's python client.
You can list all models of a certain type that are available to launch in Xinference:
.. tabs::
.. code-tab:: bash shell
xinference registrations -t LLM
.. code-tab:: bash cURL
curl http://127.0.0.1:9997/v1/model_registrations/LLM
.. code-tab:: python
from xinference.client import RESTfulClient
client = RESTfulClient("http://127.0.0.1:9997")
print(client.list_model_registrations(model_type='LLM'))
The following command gives you the currently running models in Xinference:
.. tabs::
.. code-tab:: bash shell
xinference list
.. code-tab:: bash cURL
curl http://127.0.0.1:9997/v1/models
.. code-tab:: python
from xinference.client import RESTfulClient
client = RESTfulClient("http://127.0.0.1:9997")
print(client.list_models())
When you no longer need a model that is currently running, you can remove it in the following way to free up the resources it occupies:
.. tabs::
.. code-tab:: bash shell
xinference terminate --model-uid "qwen2.5-instruct"
.. code-tab:: bash cURL
curl -X DELETE http://127.0.0.1:9997/v1/models/qwen2.5-instruct
.. code-tab:: python
from xinference.client import RESTfulClient
client = RESTfulClient("http://127.0.0.1:9997")
client.terminate_model(model_uid="qwen2.5-instruct")
.. _distributed_getting_started:
Deploy Xinference In a Cluster
==============================
To deploy Xinference in a cluster, you need to start a Xinference supervisor on one server and Xinference workers
on the other servers.
First, make sure you have already installed Xinference on each of the servers according to the instructions
provided :ref:`here <installation>`. Then follow the steps below:
Start the Supervisor
--------------------
On the server where you want to run the Xinference supervisor, run the following command:
.. code-block:: bash
xinference-supervisor -H "${supervisor_host}"
Replace ``${supervisor_host}`` with the actual host of your supervisor server.
You can visit the supervisor's web UI at `http://${supervisor_host}:9997/ui <http://${supervisor_host}:9997/ui>`_ and visit
`http://${supervisor_host}:9997/docs <http://${supervisor_host}:9997/docs>`_ to inspect the API docs.
Start the Workers
-----------------
On each of the other servers where you want to run Xinference workers, run the following command:
.. code-block:: bash
xinference-worker -e "http://${supervisor_host}:9997" -H "${worker_host}"
.. note::
Note that you must replace ``${worker_host}`` with the actual host of your worker server.
.. note::
Note that if you need to interact with the Xinference in a cluster via the command line,
you should include the ``-e`` or ``--endpoint`` flag to specify the supervisor server's endpoint. For example:
.. code-block:: bash
xinference launch -n qwen2.5-instruct -s 0_5 -f pytorch -e "http://${supervisor_host}:9997"
Using Xinference With Docker
=============================
To start Xinference in a Docker container, run the following command:
Run On Nvidia GPU Host
-----------------------
For cuda 12.4:
.. code-block:: bash
docker run -e XINFERENCE_MODEL_SRC=modelscope -p 9998:9997 --gpus all xprobe/xinference:<your_version> xinference-local -H 0.0.0.0 --log-level debug
For cuda 12.8:
.. versionadded:: v1.8.1
The CUDA 12.8 version is experimental; feedback is welcome to help us improve it.
.. versionchanged:: v1.16.0
CUDA 12.8 version is removed in v1.16.0 .
.. code-block:: bash
docker run -e XINFERENCE_MODEL_SRC=modelscope -p 9998:9997 --gpus all xprobe/xinference:<your_version>-cu128 xinference-local -H 0.0.0.0 --log-level debug
For cuda 12.9:
.. versionadded:: v1.16.0
CUDA 12.9 will become the default version when Xinference v2.0.0 is released.
.. code-block:: bash
docker run -e XINFERENCE_MODEL_SRC=modelscope -p 9998:9997 --gpus all xprobe/xinference:<your_version>-cu129 xinference-local -H 0.0.0.0 --log-level debug
Run On CPU Only Host
-----------------------
.. code-block:: bash
docker run -e XINFERENCE_MODEL_SRC=modelscope -p 9998:9997 xprobe/xinference:<your_version>-cpu xinference-local -H 0.0.0.0 --log-level debug
Replace ``<your_version>`` with a Xinference version, e.g. ``v0.10.3``; ``latest`` can be used for the latest version.
For more docker usage, refer to :ref:`Using Docker Image <using_docker_image>`.
What's Next?
============
Congratulations on getting started with Xinference! To help you navigate and make the most out of this
powerful tool, here are some resources and guides:
* :ref:`How to Use Client APIs for Different Types of Models `
* :ref:`Choosing the Right Backends for Your Needs `
================================================
FILE: doc/source/index.rst
================================================
.. _index:
======================
Welcome to Xinference!
======================
.. toctree::
:maxdepth: 2
:hidden:
getting_started/index
models/index
user_guide/index
examples/index
reference/index
development/index
Xorbits Inference (Xinference) is an open-source platform to streamline the operation and integration
of a wide array of AI models. With Xinference, you're empowered to run inference using any open-source LLMs,
embedding models, and multimodal models either in the cloud or on your own premises, and create robust
AI-driven applications.
Developing Real-world AI Applications with Xinference
-----------------------------------------------------
.. tabs::
.. code-tab:: python LLM
from xinference.client import Client
client = Client("http://localhost:9997")
model = client.get_model("MODEL_UID")
# Chat to LLM
model.chat(
messages=[{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "What is the largest animal?"}],
generate_config={"max_tokens": 1024}
)
# Chat to VL model
model.chat(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What’s in this image?"},
{
"type": "image_url",
"image_url": {
"url": "http://i.epochtimes.com/assets/uploads/2020/07/shutterstock_675595789-600x400.jpg",
},
},
],
}
],
generate_config={"max_tokens": 1024}
)
.. code-tab:: python Embedding
from xinference.client import Client
client = Client("http://localhost:9997")
model = client.get_model("MODEL_UID")
model.create_embedding("What is the capital of China?")
.. code-tab:: python Image
from xinference.client import Client
client = Client("http://localhost:9997")
model = client.get_model("MODEL_UID")
model.text_to_image("An astronaut walking on the mars")
.. code-tab:: python Audio
from xinference.client import Client
client = Client("http://localhost:9997")
model = client.get_model("MODEL_UID")
with open("speech.mp3", "rb") as audio_file:
model.transcriptions(audio_file.read())
.. code-tab:: python Rerank
from xinference.client import Client
client = Client("http://localhost:9997")
model = client.get_model("MODEL_UID")
query = "A man is eating pasta."
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"The girl is carrying a baby.",
"A man is riding a horse.",
"A woman is playing violin."
]
print(model.rerank(corpus, query))
.. code-tab:: python Video
from xinference.client import Client
client = Client("http://localhost:9997")
model = client.get_model("MODEL_UID")
model.text_to_video("")
Getting Started
---------------
.. grid:: 2
.. grid-item-card:: Install Xinference
:link: installation
:link-type: ref
Install Xinference on Linux, Windows, and macOS.
.. grid-item-card:: Try it out!
:link: using_xinference
:link-type: ref
Start by running Xinference on a local machine.
.. grid:: 2
.. grid-item-card:: Explore models
:link: models_builtin_index
:link-type: ref
Explore a wide range of models supported by Xinference.
.. grid-item-card:: Register your own model
:link: models_custom
:link-type: ref
Register model weights and turn them into an API.
Explore the API
---------------
.. grid:: 2
.. grid-item-card:: Chat & Generate
:link: chat
:link-type: ref
Learn how to chat with LLMs in Xinference.
.. grid-item-card:: Tools
:link: tools
:link-type: ref
Learn how to connect LLMs with external tools.
.. grid:: 2
.. grid-item-card:: Embeddings
:link: embed
:link-type: ref
Learn how to create text embeddings in Xinference.
.. grid-item-card:: Rerank
:link: rerank
:link-type: ref
Learn how to use rerank models in Xinference.
.. grid:: 2
.. grid-item-card:: Images
:link: image
:link-type: ref
Learn how to generate images with Xinference.
.. grid-item-card:: Multimodal
:link: multimodal
:link-type: ref
Learn how to process images and audio with LLMs.
.. grid:: 2
.. grid-item-card:: Audio
:link: audio
:link-type: ref
Learn how to turn audio into text or text into audio with Xinference.
.. grid-item-card:: Video
:link: video
:link-type: ref
Learn how to generate video with Xinference.
.. grid:: 2
.. grid-item-card:: Flexible
:link: flexible
:link-type: ref
Learn how to run inference on traditional ML models with Xinference.
Getting Involved
----------------
.. grid::
:gutter: 1
.. grid-item::
.. div:: sd-font-weight-normal sd-fs-5
Get Latest News
.. grid:: 1
:gutter: 3
.. grid-item-card::
:link: https://twitter.com/Xorbitsio
:fab:`twitter` Follow us on Twitter
.. grid-item-card::
:link: https://zhihu.com/org/xorbits
:fab:`zhihu` Read our blogs
.. grid-item::
.. div:: sd-font-weight-normal sd-fs-5
Get Support
.. grid:: 1
:gutter: 3
.. grid-item-card::
:link: https://xinference.cn/images/WeCom.jpg
:fab:`weixin` Find community on WeChat
.. grid-item-card::
:link: https://discord.gg/Xw9tszSkr5
:fab:`discord` Find community on Discord
.. grid-item-card::
:link: https://github.com/xorbitsai/inference/issues/new/choose
:fab:`github` Open an issue
.. grid-item::
.. div:: sd-fs-5
Contribute to Xinference
.. grid:: 1
:gutter: 3
.. grid-item-card::
:link: https://github.com/xorbitsai/inference/pulls
:fab:`github` Create a pull request
================================================
FILE: doc/source/locale/zh_CN/LC_MESSAGES/development/contributing_codebase.po
================================================
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, Xorbits Inc.
# This file is distributed under the same license as the Xinference package.
# FIRST AUTHOR , 2024.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2024-03-07 15:03+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language: zh_CN\n"
"Language-Team: zh_CN \n"
"Plural-Forms: nplurals=1; plural=0;\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.14.0\n"
#: ../../source/development/contributing_codebase.rst:3
msgid "Contributing to the code base"
msgstr "代码库开发指南"
#: ../../source/development/contributing_codebase.rst:6
msgid "Table of contents:"
msgstr "目录"
#: ../../source/development/contributing_codebase.rst:9
msgid "Code standards"
msgstr "代码规范"
#: ../../source/development/contributing_codebase.rst:11
msgid ""
"Writing good code is not just about what you write. It is also about "
"*how* you write it. During Continuous Integration testing, several tools "
"will be run to check your code for stylistic errors. Good style is a "
"requirement for submitting code to Xinference."
msgstr ""
"写出好的代码不仅在于你写了什么,更在于你是如何写的。在持续集成测试期间,会有多个工具来检查您的代码是否存在风格错误。良好的编程风格是提交代码到 "
"Xinference 的要求之一。"
#: ../../source/development/contributing_codebase.rst:15
msgid ""
"In addition, it is important that we do not make sudden changes to the "
"code that could have the potential to break a lot of user code as a "
"result. Therefore we need it to be as backwards compatible as possible to"
" avoid mass breakages."
msgstr "此外,不要对代码进行突然的更改,这可能会导致大量用户代码出现问题。所以,我们需要尽可能地保持向后兼容,以避免大规模的故障。"
#: ../../source/development/contributing_codebase.rst:20
msgid "Autofixing formatting errors"
msgstr "自动修复格式错误"
#: ../../source/development/contributing_codebase.rst:22
msgid ""
"Moreover, Continuous Integration will run code formatting checks like "
"``black``, ``flake8``, ``isort``, and others using `pre-commit hooks "
"`_ Any warnings generated by these checks will "
"cause the Continuous Integration to fail. Therefore, it is advisable to "
"run the check yourself before submitting code. This can be done by "
"installing ``pre-commit``::"
msgstr ""
"此外,持续集成将使用 `pre-commit hooks `_ 运行诸如 ``black``、``flake8``、``isort`` "
"等代码格式检查工具。任何由这些检查生成的警告都将导致持续集成失败。因此,建议在提交代码之前自行运行这些检查。"
"可以通过在 Xinference 仓库的根目录下安装 ``pre-commit`` 来完成这一操作:"
#: ../../source/development/contributing_codebase.rst:30
msgid "and then running::"
msgstr "然后执行命令:"
#: ../../source/development/contributing_codebase.rst:34
msgid ""
"from the root of the Xinference repository. This setup ensures that all "
"styling checks are automatically executed each time you commit changes "
"without your needing to run each one manually. In addition, using ``pre-"
"commit`` will also allow you to more easily remain up-to-date with our "
"code checks as they change."
msgstr ""
"安装好了以后就能确保每次提交更改时都会自动执行所有样式检查,无需手动逐个运行。"
"此外,使用 ``pre-commit`` 也能让您更轻松地在我们的代码检查发生更改的时候保持同步。"
#: ../../source/development/contributing_codebase.rst:39
msgid ""
"Note that if needed, you can skip these checks with ``git commit --no-"
"verify``."
msgstr "请注意,如果需要,您可以通过使用 ``git commit --no-verify`` 命令来跳过这些检查。"
#: ../../source/development/contributing_codebase.rst:41
msgid ""
"If you don't want to use ``pre-commit`` as part of your workflow, you can"
" still use it to run its checks with::"
msgstr "如果您不想将 ``pre-commit`` 作为工作流程的一部分,仍然可以运行如下命令来使用它进行检查:"
#: ../../source/development/contributing_codebase.rst:46
#: ../../source/development/contributing_codebase.rst:52
msgid "without needing to have done ``pre-commit install`` beforehand."
msgstr "而不需要事先执行 ``pre-commit install``。"
#: ../../source/development/contributing_codebase.rst:48
msgid ""
"If you want to run checks on all recently committed files on "
"upstream/main you can use::"
msgstr "如果您想对 upstream/main 上所有最近提交的文件运行检查,您可以使用以下命令:"
#: ../../source/development/contributing_codebase.rst:56
msgid ""
"You may consider periodically running ``pre-commit gc`` to clean up repos"
" which are no longer used."
msgstr "您可以考虑定期运行 ``pre-commit gc`` 命令来清理不再使用的存储库。"
#: ../../source/development/contributing_codebase.rst:61
msgid ""
"If you have conflicting installations of ``virtualenv``, if could lead to"
" errors - refer to `here "
"`_."
msgstr "如果您安装了冲突的 ``virtualenv`` 版本,可能会导致错误 - 可以参考"
" `这里 `_ 。"
#: ../../source/development/contributing_codebase.rst:64
msgid ""
"Also, due to a `bug in virtualenv "
"`_, you may run into "
"issues if you're using conda. To solve this, you can downgrade "
"``virtualenv`` to version ``20.0.33``."
msgstr ""
"此外,由于 ``virtualenv`` 中的一个 `错误 `_ ,如果您使用 conda,可能会遇到问题。"
"要解决这个问题,您可以将 ``virtualenv`` 降级到版本 ``20.0.33``。"
#: ../../source/development/contributing_codebase.rst:69
msgid "Backwards compatibility"
msgstr "向后兼容"
#: ../../source/development/contributing_codebase.rst:71
msgid ""
"Please try to maintain backward compatibility. If you think breakage is "
"necessary, clearly state why as part of the pull request. Also, be "
"careful when changing method signatures and add deprecation warnings "
"where needed. Also, add the deprecated sphinx directive to the deprecated"
" functions or methods."
msgstr ""
"请尽量保持向后兼容性。如果您认为必须进行更改,请在拉取请求中说明清楚原因。同时,在更改方法签名时要小心,并在需要时添加弃用警告。此外,为弃用的函数或方法添加弃用的"
" sphinx 指令。"
#: ../../source/development/contributing_codebase.rst:76
msgid "You'll also need to"
msgstr "同时你还需要"
#: ../../source/development/contributing_codebase.rst:78
msgid ""
"Write a new test that asserts a warning is issued when calling with the "
"deprecated argument"
msgstr "编写一个新的测试样例,断言在使用弃用参数进行调用时会发出警告。"
#: ../../source/development/contributing_codebase.rst:79
msgid "Update all of Xinference existing tests and code to use the new argument"
msgstr "更新所有 Xinference 现有的测试样例和代码,以使用新的参数。"
#: ../../source/development/contributing_codebase.rst:82
msgid "Type hints"
msgstr "类型提示"
#: ../../source/development/contributing_codebase.rst:84
msgid ""
"Xinference strongly encourages the use of :pep:`484` style type hints. "
"New development should contain type hints and pull requests to annotate "
"existing code are accepted as well!"
msgstr "Xinference 强烈鼓励使用 :pep:`484` 风格的类型提示。新的开发应包含类型提示,同时也欢迎为现有代码添加类型注解的拉取请求!"
#: ../../source/development/contributing_codebase.rst:88
msgid "Test-driven development"
msgstr "测试驱动开发"
#: ../../source/development/contributing_codebase.rst:90
msgid ""
"Xinference is serious about testing and strongly encourages contributors "
"to embrace `test-driven development (TDD) `_. This development process \"relies on the "
"repetition of a very short development cycle: first the developer writes "
"an (initially failing) automated test case that defines a desired "
"improvement or new function, then produces the minimum amount of code to "
"pass that test.\" So, before actually writing any code, you should write "
"your tests. Often the test can be taken from the original GitHub issue. "
"However, it is always worth considering additional use cases and writing "
"corresponding tests."
msgstr ""
"Xinference 非常重视测试,并强烈鼓励贡献者采用 `测试驱动开发(TDD) `_ 。这种开发过程 "
"\"依赖于非常短的开发周期的重复:首先,开发者编写一个(初始为失败的)自动化测试样例来定义所需的改进或新功能,然后用最少的代码来通过该测试。\"因此,在实际编写任何代码之前,您应该编写您的测试样例。通常,测试样例可以从原始的"
" GitHub issue 中获取。然而,值得考虑额外的情况并编写相应的测试样例。"
#: ../../source/development/contributing_codebase.rst:99
msgid ""
"Adding tests is frequently requested after code is pushed to Xinference. "
"Thus, it is worth getting in the habit of writing tests ahead of time so "
"this is never an issue."
msgstr "在将代码推送到 Xinference 之后,经常会要求添加测试样例。因此,养成提前编写测试样例的习惯非常重要,这样就不会出现问题。"
#~ msgid "Pre-commit"
#~ msgstr "Pre-commit"
================================================
FILE: doc/source/locale/zh_CN/LC_MESSAGES/development/contributing_environment.po
================================================
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, Xorbits Inc.
# This file is distributed under the same license as the Xinference package.
# FIRST AUTHOR , 2024.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2025-08-02 23:15+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language: zh_CN\n"
"Language-Team: zh_CN \n"
"Plural-Forms: nplurals=1; plural=0;\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.14.0\n"
#: ../../source/development/contributing_environment.rst:3
msgid "Creating a development environment"
msgstr "创建开发环境"
#: ../../source/development/contributing_environment.rst:6
msgid "Table of contents:"
msgstr "目录"
#: ../../source/development/contributing_environment.rst:8
msgid ""
"Before proceeding with any code modifications, it's essential to set up "
"the necessary environment for Xinference development, which includes "
"familiarizing yourself with Git usage, establishing an isolated "
"environment, installing Xinference, and compiling the frontend."
msgstr ""
"在进行任何代码修改之前,建立起适用于 Xinference 开发的必要环境至关重要。"
"包括熟悉 Git 的使用、建立一个独立的环境、安装 Xinference 以及前端部分的"
"编译。"
#: ../../source/development/contributing_environment.rst:12
msgid "Getting started with Git"
msgstr "Git 的使用"
#: ../../source/development/contributing_environment.rst:14
msgid ""
"Now that you have identified an issue you wish to resolve, an enhancement"
" to incorporate, or documentation to enhance, it's crucial to acquaint "
"yourself with GitHub and the Xinference codebase."
msgstr ""
"当你有一个需要修复的问题、需要添加的增强功能或需要改进的文档时,熟悉 "
"GitHub 和 Xinference 代码库很重要。"
#: ../../source/development/contributing_environment.rst:17
msgid ""
"To the new user, working with Git is one of the more intimidating aspects"
" of contributing to Xinference. It can very quickly become overwhelming, "
"but sticking to the guidelines below will help simplify the process and "
"minimize potential issues. As always, if you are having difficulties "
"please feel free to ask for help."
msgstr ""
"对新用户来说,使用 Git 是参与 Xinference 开发最令人畏惧的方面之一。很快就"
"会感到压力山大,但以下指南将有助于简化流程并减少潜在问题。如果您遇到"
"难以解决的问题,欢迎在社区寻求帮助。"
#: ../../source/development/contributing_environment.rst:22
msgid ""
"The code is hosted on `GitHub `_."
" To contribute you will need to sign up for a `free GitHub account "
"`_. We use `Git `_ "
"for version control to allow many people to work together on the project."
msgstr ""
"Xinference 的代码托管在 `GitHub `"
"_ 。要参与 Xinference 代码贡献,你需要注册一个 `免费的 GitHub 账户 `_ 。我们使用 `Git `_ "
"进行版本控制,以便大家共同参与项目的开发。"
#: ../../source/development/contributing_environment.rst:27
msgid ""
"`GitHub has instructions `__"
" for installing git, setting up your SSH key, and configuring git. All "
"these steps need to be completed before you can work seamlessly between "
"your local repository and GitHub."
msgstr ""
"你可以参考 `GitHub 指南 `_ "
"来安装 git,设置 SSH 密钥以及配置 git。你需要完成这些步骤以确保你的本地"
"仓库和 GitHub 可以正常工作,后续的工作才可以顺利进行。"
#: ../../source/development/contributing_environment.rst:31
msgid "Some great resources for learning Git:"
msgstr "以下是一些很好的学习 Git 的资源:"
#: ../../source/development/contributing_environment.rst:33
msgid "`Official Git Documentation `_"
msgstr "`Git 官方文档 `_"
#: ../../source/development/contributing_environment.rst:34
msgid "`Pro Git Book `_"
msgstr "`Pro Git 书籍 `_"
#: ../../source/development/contributing_environment.rst:35
msgid "`Git Tutorial by Atlassian `_"
msgstr "`Atlassian 提供的 Git 教程 `_"
#: ../../source/development/contributing_environment.rst:36
msgid ""
"`Git - Concise Guide `_"
msgstr "`Git-简明指南 `_"
#: ../../source/development/contributing_environment.rst:39
msgid ""
"If the speed of ``git clone`` is slow, you can use the following command "
"to add a proxy:"
msgstr "如果在 ``git clone`` 代码的时候速度较慢,可以通过如下命令添加代理:"
#: ../../source/development/contributing_environment.rst:47
msgid "Creating an isolated environment"
msgstr "创建一个隔离环境"
#: ../../source/development/contributing_environment.rst:49
msgid ""
"Before formally installing Xinference, it's recommended to create an "
"isolated environment, using Conda recommended, for ease of subsequent "
"operations."
msgstr "在正式安装 Xinference 之前,建议使用 Conda 创建一个隔离环境,方便后续操作。"
#: ../../source/development/contributing_environment.rst:57
msgid "``xinf`` can be replaced with a custom Conda environment name."
msgstr "``xinf`` 可替换为自定义的 Conda 环境名。"
#: ../../source/development/contributing_environment.rst:59
msgid ""
"Afterward, you'll need to install Python and Node.js (npm) in the newly "
"created Conda environment. Here are the commands:"
msgstr "随后需要在新建的 Conda 环境中安装 Python 以及 Node.js (npm)。命令如下:"
#: ../../source/development/contributing_environment.rst:68
msgid "Install from source code"
msgstr "从源码安装"
#: ../../source/development/contributing_environment.rst:70
msgid ""
"Before we begin, please make sure that you have cloned the repository. "
"Suppose you clone the repository as ``inference`` directory, ``cd`` to "
"this directory where the ``setup.cfg`` and ``setup.py`` files are "
"located, and run the following command:"
msgstr ""
"在开始之前,请确保您已经克隆了存储库。假设您将存储库克隆到名为 ``"
"inference`` 的目录中,请进入该目录,其中包含 ``setup.cfg`` 和 ``setup.py`"
"` 文件,并执行以下命令:"
#: ../../source/development/contributing_environment.rst:79
msgid ""
"If the commands run successfully, you can use Xinference normally. For "
"detailed usage instructions, refer to `using_xinference "
"`__."
msgstr ""
"如果命令能够成功运行,接下来就能正常使用 Xinference 了,使用教程详情见 `"
"使用 `__。"
#: ../../source/development/contributing_environment.rst:83
msgid ""
"If errors occur or the process freezes during execution, the next step is"
" to compile the frontend."
msgstr "如果出现报错或者在运行过程中卡死,那就需要进行下一步前端编译。"
#: ../../source/development/contributing_environment.rst:87
msgid "Frontend Compilation"
msgstr "前端编译"
#: ../../source/development/contributing_environment.rst:89
msgid ""
"Navigate to the ``inference/xinference/ui/web/ui`` directory. Then, "
"execute the following command to clear the cache:"
msgstr ""
"首先需要进入 ``inference/xinference/ui/web/ui`` 目录下,随后执行如下命令清除"
"缓存:"
#: ../../source/development/contributing_environment.rst:96
msgid ""
"If the command fails to execute, you can try adding the ``--force`` "
"option."
msgstr "如果命令执行失败,您可以尝试添加 ``--force`` 选项。"
#: ../../source/development/contributing_environment.rst:99
msgid ""
"If the ``node_modules`` folder already exists in this directory, it's "
"recommended to manually delete it before cleaning the cache."
msgstr "如果该目录下已经存在 ``node_modules`` 文件夹,建议在清除缓存之前先手动删除该文件夹。"
#: ../../source/development/contributing_environment.rst:102
msgid ""
"Next, execute the following command in this directory to compile the "
"frontend:"
msgstr "接着在该目录下执行以下命令进行前端编译:"
#: ../../source/development/contributing_environment.rst:110
msgid ""
"Still, if the first command fails to execute, you can try adding the "
"``--force`` option."
msgstr "如果第一个命令执行失败,您仍然可以尝试通过添加 ``--force`` 选项解决。"
#: ../../source/development/contributing_environment.rst:112
msgid ""
"After compiling the frontend, you can ``cd`` back to the directory where "
"the ``setup.cfg`` and ``setup.py`` files are located, and install "
"Xinference via ``pip install -e .``."
msgstr ""
"编译完前端后,您可以返回到包含 ``setup.cfg`` 和 ``setup.py`` 文件的目录,"
"然后通过 ``pip install -e .`` 安装 Xinference。"
================================================
FILE: doc/source/locale/zh_CN/LC_MESSAGES/development/index.po
================================================
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, Xorbits Inc.
# This file is distributed under the same license as the Xinference package.
# FIRST AUTHOR , 2024.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2024-03-06 12:05+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language: zh_CN\n"
"Language-Team: zh_CN \n"
"Plural-Forms: nplurals=1; plural=0;\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.14.0\n"
#: ../../source/development/index.rst:5
msgid "Development"
msgstr "开发指南"
================================================
FILE: doc/source/locale/zh_CN/LC_MESSAGES/development/xinference_internals.po
================================================
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, Xorbits Inc.
# This file is distributed under the same license as the Xinference package.
# FIRST AUTHOR , 2024.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2024-05-31 11:46+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language: zh_CN\n"
"Language-Team: zh_CN \n"
"Plural-Forms: nplurals=1; plural=0;\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.14.0\n"
#: ../../source/development/xinference_internals.rst:3
msgid "The internals of Xinference"
msgstr "Xinference 的内部结构"
#: ../../source/development/xinference_internals.rst:6
msgid "Table of contents:"
msgstr "目录"
#: ../../source/development/xinference_internals.rst:9
msgid "Overview"
msgstr "概述"
#: ../../source/development/xinference_internals.rst:10
msgid ""
"Xinference leverages `Xoscar `_, an "
"actor programming framework we designed, as its core component to manage "
"machines, devices, and model inference processes. Each actor serves as a "
"basic unit for model inference and various inference backends can be "
"integrate into the actor, enabling us to support multiple inference "
"engines and hardware. These actors are hosted and scheduled within actor "
"pools, which are designed to be asynchronous and non-blocking and "
"function as resource pools."
msgstr ""
"Xinference 利用我们设计的 actor 编程框架 `Xoscar `_ 作为其核心组件,以管理机器、设备和模型推理进程。每个 "
"actor 都是模型推理的基本单元,各种推理后端可以集成到 actor 中,从而使我们"
"能够支持多种推理引擎和硬件。这些 actor 在 actor 池中托管和调度,actor 池"
"具有资源池的功能,actor 的设计是异步和非阻塞的。"
#: ../../source/development/xinference_internals.rst:22
msgid ""
"Both supervisor and worker are actor instances. Initially, an actor pool,"
" serving as a resource pool, needs to be created on each server; and each"
" actor can utilize a CPU core or a GPU device. Each server has its own "
"address (IP address or hostname), so actors on different computing nodes "
"can communicate with each other through these addresses. See `Actor`_ for"
" more information."
msgstr ""
"supervisor 和 worker 都是 actor 实例。需要先在每台服务器上创建一个作为"
"资源池的 actor 池;每个 actor 可以使用一个 CPU 内核或一块 GPU 设备。每台"
"服务器都有自己的地址(IP 地址或主机名),因此不同计算节点上的 actor 可以"
"通过这些地址相互通信。更多信息,请参阅 `Actor`_。"
#: ../../source/development/xinference_internals.rst:27
msgid "RESTful API"
msgstr "RESTful API"
#: ../../source/development/xinference_internals.rst:28
msgid ""
"The RESTful API is implemented using `FastAPI "
"`_, as specified in "
"`api/restful_api.py "
"`_."
msgstr ""
"RESTful API 是利用 `FastAPI `_ 实现"
"的,具体代码在 `api/restful_api.py `_。"
#: ../../source/development/xinference_internals.rst:35
msgid ""
"This is an example of the API ``/status``, it's corresponding function is"
" ``get_status``. You can add connection between RESTful API and the "
"backend function you want in `api/restful_api.py "
"`_."
msgstr ""
"这是一个 API 的示例,API ``/status`` 对应函数 ``get_status``。您可以在 `"
"api/restful_api.py `_ 中添加 RESTful API 和对应后端函数之间的"
"关系。"
#: ../../source/development/xinference_internals.rst:39
msgid "Command Line"
msgstr "命令行"
#: ../../source/development/xinference_internals.rst:40
msgid ""
"The Command Line is implemented using `Click "
"`_, as specified in "
"`deploy/cmdline.py "
"`_,"
" allowing users to interact with the Xinference deployment features "
"directly from the terminal."
msgstr ""
"命令行是通过 `Click `_ 实现的,具体"
"代码在 `deploy/cmdline.py `_,命令行允许用户直接在终端与 "
"Xinference 进行交互。"
#: ../../source/development/xinference_internals.rst:45
msgid "Entry Points"
msgstr "入口点"
#: ../../source/development/xinference_internals.rst:46
msgid "Take the command-lines we implemented as examples:"
msgstr "以我们实现的命令行为例:"
#: ../../source/development/xinference_internals.rst:48
msgid ""
"``xinference``: Provides commands for model management, including "
"registering/unregistering models, listing all registered/running models, "
"and launching or terminating specific models. It also features "
"interactive commands like generate and chat for testing and interacting "
"with deployed models in real-time."
msgstr ""
"``xinference``:提供命令用于模型管理,包括注册/取消注册模型、列出所有已"
"注册/运行的模型,以及启动或终止特定模型。它还提供生成语言和聊天等交互式"
"命令,用于测试或交互已部署的模型。"
#: ../../source/development/xinference_internals.rst:52
msgid "``xinference-local``: Starts a local Xinference service."
msgstr "``xinference-local``:启动一个本地 Xinference 服务。"
#: ../../source/development/xinference_internals.rst:54
msgid ""
"``xinference-supervisor``: Initiates a supervisor process that manages "
"and monitors worker actors within a distributed setup."
msgstr ""
"``xinference-supervisor``:启动 supervisor 进程,在分布式环境中管理和监控"
" worker actors。"
#: ../../source/development/xinference_internals.rst:56
msgid ""
"``xinference-worker``: Starts a worker process that executes tasks "
"assigned by the supervisor, utilizing available computational resources "
"effectively."
msgstr ""
"``xinference-worker``:启动 worker 进程,利用可用计算资源,执行 "
"supervisor 分配的任务。"
#: ../../source/development/xinference_internals.rst:59
msgid ""
"Each command is equipped with ``options`` and ``flags`` to customize its "
"behavior, such as specifying log levels, host addresses, port numbers, "
"and other relevant settings."
msgstr ""
"每条命令都配有 ``option`` 和 ``flag``,可自定义其行为,如指定日志级别、"
"主机地址、端口号和其他相关设置。"
#: ../../source/development/xinference_internals.rst:62
msgid ""
"Python projects define command-line console entry points in `setup.cfg` "
"or `setup.py`."
msgstr "Python 项目会在 `setup.cfg` 或 `setup.py` 中定义命令行控制台入口点。"
#: ../../source/development/xinference_internals.rst:72
msgid ""
"The command-line ``xinference`` can be referred to code in "
"``xinference.deploy.cmdline:cli``."
msgstr "命令行 ``xinference`` 可参考 ``xinference.deploy.cmdline:cli`` 中的代码。"
#: ../../source/development/xinference_internals.rst:75
msgid "Click"
msgstr "Click"
#: ../../source/development/xinference_internals.rst:76
msgid "We use Click to implement a specific command-line:"
msgstr "我们使用 Click 来实现特定的命令行:"
#: ../../source/development/xinference_internals.rst:95
msgid ""
"For example, the ``xinference-local`` command allows you to define the "
"host address and port."
msgstr "例如,``xinference-local`` 命令允许您定义主机地址和端口。"
#: ../../source/development/xinference_internals.rst:98
msgid "Actor"
msgstr "Actor"
#: ../../source/development/xinference_internals.rst:99
msgid ""
"Xinference is fundamentally based on `Xoscar "
"`_, our actor framework, which can "
"manage computational resources and Python processes to support scalable "
"and concurrent programming. The following is a pseudocode demonstrating "
"how our Worker Actor works, the actual Worker Actor is more complex than "
"this."
msgstr ""
"Xinference 以 `Xoscar `_ 为基础,"
"Xoscar 是我们的 actor 框架,可以管理计算资源和 Python 进程,支持可扩展的"
"并发编程。下面的伪代码演示了 Worker Actor 的工作原理,实际的 Worker Actor"
" 要比这个复杂得多。"
#: ../../source/development/xinference_internals.rst:126
msgid ""
"We use the ``WorkerActor`` as an example to illustrate how we build the "
"Xinference. Each actor class is a standard Python class that inherits "
"from ``xoscar.Actor``. An instance of this class is a specific actor "
"within the actor pool."
msgstr ""
"我们以 ``WorkerActor`` 为例,说明如何构建 Xinference。每个 actor 类都是"
"继承自 ``xoscar.Actor`` 的标准 Python 类。该类的实例就是 actor 池中的一个"
"特定的 actor。"
#: ../../source/development/xinference_internals.rst:130
msgid ""
"**Define Actor Actions**: Each actor needs to define certain actions or "
"behaviors to accomplish specific tasks. For instance, the model inference"
" ``WorkerActor`` needs to launch the model (``launch_model``), list the "
"models in this actor (``list_models``), terminate a model "
"(``terminate_model``). There are two special methods worth noting. The "
"``__post_create__`` is invoked before the actor is created, allowing for "
"necessary initializations. The ``__pre_destroy__`` is called after the "
"actor is destroyed, allowing for cleanup or finalization tasks."
msgstr ""
"**定义 Actor 的行为**:每个 actor 都需要定义某些动作或行为来完成特定任务"
"。例如,模型推理 ``WorkerActor`` 需要启动模型(``launch_model``)、列出该"
" actor 中的模型(``list_models``)、终止模型(``terminate_model``)。有"
"两个特殊方法值得注意。``__post_create__`` 在 actor 创建之后调用,进行必要"
"的初始化。而 ``__pre_destroy__`` 会在 actor 被销毁之前调用,执行清理任务。"
#: ../../source/development/xinference_internals.rst:136
msgid ""
"**Reference Actor and Invoke Methods**: When an actor is created, it "
"yields a reference variable so that other actors can reference it. The "
"actor reference can also be referenced with the address. Suppose the "
"``WorkerActor`` is created and the reference variable is ``worker_ref``,"
" the ``launch_model`` method of this actor class can be invoked by "
"calling ``worker_ref.launch_model()``. Even if the actor's method is "
"originally a synchronized method, when called with an actor reference, it"
" will become as an asynchronous method."
msgstr ""
"**引用 Actor 和调用方法**:当创建一个 Actor 时,它会产生一个引用变量,"
"以便其他 Actor 可以引用它。Actor 也可以用 IP 地址来引用。假设创建了 ``"
"WorkerActor``,且引用变量为 ``worker_ref``,那么就可以通过调用 ``worker_"
"ref.launch_model()`` 来调用该 Actor 类的 ``launch_model``。即使 actor 中"
"的方法原来是一个传统的阻塞式的方法,当我们使用引用变量调用这个方法时,它"
"也变成了一个异步方法。"
#: ../../source/development/xinference_internals.rst:143
msgid ""
"**Inference Engine**: The actor can manage the process, and the inference"
" engine is also a process. In the launch model part of the "
"``WorkerActor``, we can initialize different inference engines according "
"to the user's need. Therefore, Xinference can support multiple inference "
"engines and can easily adapt to new inference engines in the future."
msgstr ""
"**推理引擎**:Actor 可以管理进程,而推理引擎也是一种进程。在 ``"
"WorkerActor`` 的启动模型部分,我们可以根据用户的需要初始化不同的推理引擎"
"。因此,Xinference 可以支持多种推理引擎,并能轻松适应未来的新推理引擎。"
#: ../../source/development/xinference_internals.rst:148
msgid ""
"See `Xoscar document `_ for more actor use cases."
msgstr ""
"请参阅 `Xoscar 文档 `_ 了解更多 Actor 用例。"
#: ../../source/development/xinference_internals.rst:151
msgid "Asynchronous Programming"
msgstr "异步编程"
#: ../../source/development/xinference_internals.rst:153
msgid ""
"Both Xinference and Xoscar highly utilize asynchronous programming of "
"``asyncio``. Asynchronous programming is a programming paradigm that does"
" not block. Instead, requests and function calls are issued and executed "
"in the background and results are returned in the future. This enables us"
" to perform activities concurrently."
msgstr ""
"Xinference 和 Xoscar 非常依赖异步编程库 ``asyncio``。异步编程是一种非阻塞"
"的编程范式。相比于传统的阻塞式的函数调用,异步编程中的请求或函数调用在"
"后台执行,运行结果在未来某个时刻返回。异步编程的优势是使得可以同时并发"
"进行很多不同的活动或任务。"
#: ../../source/development/xinference_internals.rst:159
msgid ""
"If you're not familiar with Pythons's ``asyncio``, you can see more "
"tutorials for help:"
msgstr "如果您不熟悉 Python 的 ``asyncio``,可以查看更多教程以获得帮助:"
#: ../../source/development/xinference_internals.rst:161
msgid ""
"`Python Asyncio Tutorial `__"
msgstr ""
"`Python Asyncio 教程 `__"
#: ../../source/development/xinference_internals.rst:163
msgid ""
"`Real Python's asyncio Tutorial `__"
msgstr "`Real Python asyncio 教程 `__"
#: ../../source/development/xinference_internals.rst:165
msgid ""
"`Python Official Documentation "
"`__"
msgstr "`Python 官方文档 `__"
#: ../../source/development/xinference_internals.rst:169
msgid "Model"
msgstr "模型"
#: ../../source/development/xinference_internals.rst:171
msgid ""
"Xinference supports different types of models including large language "
"models (LLMs), image models, audio models, embedding models, etc. All "
"models are implemented in `model/ "
"`_."
msgstr ""
"Xinference 支持不同类型的模型,包括大型语言模型(LLM)、图像模型、音频"
"模型、嵌入模型等。所有模型在 `model/ `_ 文件夹下实现。"
#: ../../source/development/xinference_internals.rst:175
msgid "LLM"
msgstr ""
#: ../../source/development/xinference_internals.rst:177
msgid ""
"Take `model/llm/ "
"`_"
" for example, it focuses on the management and instantiation of LLMs. It "
"includes detailed implementations for loading, configuring, and deploying"
" LLMs."
msgstr ""
"以 `model/llm/ `_ 为例,它主要管理和启动 LLM,包括加载、配置和运行"
"大语言模型。"
#: ../../source/development/xinference_internals.rst:181
msgid ""
"We support many backends such as GGML, PyTorch, and vLLM. Our generated "
"content is compatible with the format of OpenAI, supporting features such"
" as streaming output and returning chat completion format (for chat "
"models only). Therefore, there is a lot of adaptation work to be done "
"after the model generate content. These tasks are not difficult, but they"
" do require some time. When writing this part of the code, please refer "
"to the `OpenAI API documentation "
"`_ and the documentation "
"of various inference backends, and make the necessary adaptations."
msgstr ""
"我们支持不同的推理后端,比如 GGML、PyTorch 和 vLLM。我们生成的内容与 "
"OpenAI 的格式兼容,比如支持流式输出(stream),对话模型以 chat completion"
" 格式返回。因此模型输出内容后要做很多适配工作。这些工作并不难,但需要一些"
"时间。编写这部分代码时,请参考 `OpenAI 的 API 文档 `_ 和各个推理后端的文档,做必要的适配。"
#: ../../source/development/xinference_internals.rst:185
msgid "JSON"
msgstr ""
#: ../../source/development/xinference_internals.rst:187
msgid ""
"In `model/llm/llm_family.json "
"`_,"
" we utilize JSON files to manage the metadata of emerging open-source "
"models. Adding a new model does not necessitate writing new code, it "
"merely requires appending new metadata to the existing JSON file."
msgstr ""
"在 `model/llm/llm_family.json `_ 中,我们利用 JSON 文件"
"来管理新出现的开源模型的元数据。添加一个新模型并不需要编写新代码,只需要"
"将新的元数据添加到现有的 JSON 文件中即可。"
#: ../../source/development/xinference_internals.rst:214
msgid ""
"This is an example of how to define the Llama-2 chat model. The "
"``model_specs`` define the information of the model, as one model family "
"usually comes with various sizes, quantization methods, and file formats."
" For instance, the ``model_format`` could be ``pytorch`` (using Hugging "
"Face Transformers or vLLM as backend), ``ggmlv3`` (a tensor library "
"associated with llama.cpp), or ``gptq`` (a post-training quantization "
"framework). The ``model_id`` defines the repository of the model hub from"
" which Xinference downloads the checkpoint files. Furthermore, due to "
"distinct instruction-tuning processes, different model families have "
"varying prompt styles. The ``prompt_style`` in the JSON file specifies "
"how to format prompts for this particular model. For example, "
"``system_prompt`` and ``roles`` are used to specify the instructions and "
"personality of the model."
msgstr ""
"这是一个如何定义 Llama-2 聊天模型的示例。``model_specs`` 定义了模型的信息"
",因为一个模型系列通常有不同的尺寸、量化方法和文件格式。例如,``model_"
"format`` 可以是 ``pytorch`` (使用 Hugging Face Transformers 或 vLLM 作为"
"后端)、 ``ggmlv3`` (与 llama.cpp 相关的张量库)或 ``gptq`` (训练后量化"
"框架)。 ``model_id`` 定义了模型中心的资源库,Xinference 从模型中心下载"
"检查点文件。此外,由于指令微调过程不同,不同的模型系列有不同的提示风格"
"。JSON 文件中的 ``prompt_style`` 指定了该特定模型的提示格式。例如,``"
"system_prompt`` 和 ``roles`` 用于指定模型的指令和个性。"
#: ../../source/development/xinference_internals.rst:224
msgid "Code Walkthrough"
msgstr "代码指南"
#: ../../source/development/xinference_internals.rst:226
msgid ""
"The main code is located in the `xinference/ "
"`_:"
msgstr ""
"主要代码位于 `xinference/ `_:"
#: ../../source/development/xinference_internals.rst:228
msgid ""
"`api/ "
"`_: "
"`restful_api.py "
"`_"
" is the core part that sets up and runs the RESTful APIs. It integrates "
"an authentication service (the specific code is located in `oauth2/ "
"`_),"
" as some or all endpointsrequire user authentication."
msgstr ""
"`api/ `_"
":`restful_api.py `_ 是设置和运行 RESTful API 的核心部分。它"
"集成了一个身份验证服务(具体代码位于 `oauth2/ `_),因为部分或所有"
"端点需要用户身份验证。"
#: ../../source/development/xinference_internals.rst:233
msgid ""
"`client/ "
"`_: "
"This is the client of Xinference."
msgstr ""
"`client/ `_:这是 Xinference 的客户端。"
#: ../../source/development/xinference_internals.rst:235
msgid ""
"`oscar/ "
"`_"
" defines the Actor Client which acts as a client interface for "
"interacting with models deployed in a Xinference cluster."
msgstr ""
"`oscar/ `_ 定义了 Actor 客户端,它是一个客户端接口,用于与 "
"Xinference 集群中部署的模型交互。"
#: ../../source/development/xinference_internals.rst:238
msgid ""
"`restful/ "
"`_"
" implements a RESTful client for interacting with a Xinference service."
msgstr ""
"`restful/ `_ 实现与 Xinference 服务交互的 RESTful 客户端。"
#: ../../source/development/xinference_internals.rst:241
msgid ""
"`core/ "
"`_: "
"This is the core part of Xinference."
msgstr ""
"`core/ "
"`_:这是 Xinference 的核心部分。"
#: ../../source/development/xinference_internals.rst:243
msgid ""
"`metrics.py "
"`_"
" and `resource.py "
"`_"
" defines a set of tools for collecting and reporting metrics and the "
"status of node resources, including model throughput, latency, the usage "
"of CPU and GPU, memory usage, and more."
msgstr ""
"`metrics.py `_ 和 `resource.py `_ 定义了一套用于收集和"
"报告指标以及节点资源状态的工具,包括模型吞吐量、延迟、CPU 和 GPU 的使用率"
"、内存使用率等。"
#: ../../source/development/xinference_internals.rst:248
msgid ""
"`image_interface.py "
"`_"
" and `chat_interface.py "
"`_"
" implement `Gradio `_ interfaces "
"for image and chat models, respectively. These interfaces allow users to "
"interact with models through a Web UI, such as generating images or "
"engaging in chat. They build user interfaces using the gradio package and"
" communicate with backend models through our RESTful APIs."
msgstr ""
"`image_interface.py `_ 和 `chat_interface.py `_ 分别为图像和聊天模型实现了 `Gradio