SYMBOL INDEX (3167 symbols across 352 files) FILE: android/MLCChat/bundle_weight.py function main (line 12) | def main(apk_path: Path, package_output_path: Path): function _parse_apk_path (line 42) | def _parse_apk_path(path: str) -> Path: FILE: android/MLCEngineExample/bundle_weight.py function main (line 12) | def main(apk_path: Path, package_output_path: Path): function _parse_apk_path (line 42) | def _parse_apk_path(path: str) -> Path: FILE: android/mlc4j/prepare_libs.py function run_cmake (line 16) | def run_cmake(mlc4j_path: Path): function run_cmake_build (line 56) | def run_cmake_build(): function run_cmake_install (line 71) | def run_cmake_install(): function main (line 86) | def main(mlc_llm_source_dir: Path): FILE: android/mlc4j/src/cpp/tvm_runtime.h function namespace (line 49) | namespace tvm { FILE: android/mlc4j/src/main/java/ai/mlc/mlcllm/JSONFFIEngine.java class JSONFFIEngine (line 9) | public class JSONFFIEngine { method JSONFFIEngine (line 23) | public JSONFFIEngine() { method initBackgroundEngine (line 39) | public void initBackgroundEngine(KotlinFunction callback) { method reload (line 55) | public void reload(String engineConfigJSONStr) { method chatCompletion (line 59) | public void chatCompletion(String requestJSONStr, String requestId) { method runBackgroundLoop (line 63) | public void runBackgroundLoop() { method runBackgroundStreamBackLoop (line 67) | public void runBackgroundStreamBackLoop() { method exitBackgroundLoop (line 71) | public void exitBackgroundLoop() { method unload (line 75) | public void unload() { type KotlinFunction (line 79) | public interface KotlinFunction { method invoke (line 80) | void invoke(String arg); method reset (line 83) | public void reset() { FILE: cpp/json_ffi/conv_template.cc type mlc (line 8) | namespace mlc { type llm (line 9) | namespace llm { type json_ffi (line 10) | namespace json_ffi { function ModelVisionConfig (line 16) | ModelVisionConfig ModelVisionConfig::FromJSON(const tvm::ffi::json... function ModelConfig (line 85) | ModelConfig ModelConfig::FromJSON(const tvm::ffi::json::Object& js... function MessagePlaceholders (line 147) | MessagePlaceholders MessagePlaceholderFromString(const std::string... function TryGetFunctionCallingString (line 193) | Result> TryGetFunctionCallingString( function CreatePrompt (line 224) | Result> CreatePrompt(const Conversation& conv, FILE: cpp/json_ffi/conv_template.h function namespace (line 21) | namespace llm { FILE: cpp/json_ffi/image_utils.cc type mlc (line 9) | namespace mlc { type llm (line 10) | namespace llm { type json_ffi (line 11) | namespace json_ffi { class MemoryBufferStream (line 15) | class MemoryBufferStream : public tvm::support::Stream { method MemoryBufferStream (line 20) | MemoryBufferStream(const char* data, size_t size) : data_(data),... method Read (line 22) | size_t Read(void* ptr, size_t size) override { method Write (line 35) | size_t Write(const void* ptr, size_t size) override { function Base64DecodedSize (line 46) | size_t Base64DecodedSize(const std::string& base64_str) { function LoadImageFromBase64 (line 58) | Result LoadImageFromBase64(const std::string& base64_str) { function Tensor (line 78) | Tensor ClipPreprocessor(Tensor image_data, int target_size, DLDevi... FILE: cpp/json_ffi/image_utils.h function namespace (line 16) | namespace mlc { FILE: cpp/json_ffi/json_ffi_engine.cc type mlc (line 15) | namespace mlc { type llm (line 16) | namespace llm { type json_ffi (line 17) | namespace json_ffi { class JSONFFIEngineImpl (line 154) | class JSONFFIEngineImpl : public JSONFFIEngine, public ffi::Module... method InitBackgroundEngine (line 170) | void InitBackgroundEngine(int device_type, int device_id, method Reload (line 189) | void Reload(String engine_config_json_str) { method Unload (line 209) | void Unload() { this->engine_->Unload(); } method Reset (line 211) | void Reset() { this->engine_->Reset(); } method RunBackgroundLoop (line 213) | void RunBackgroundLoop() { this->engine_->RunBackgroundLoop(); } method RunBackgroundStreamBackLoop (line 215) | void RunBackgroundStreamBackLoop() { this->engine_->RunBackgroun... method String (line 217) | String GetResponseFromStreamOutput(Array de... function TVM_FFI_STATIC_INIT_BLOCK (line 299) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/json_ffi/json_ffi_engine.h function namespace (line 16) | namespace mlc { FILE: cpp/json_ffi/openai_api_protocol.cc type mlc (line 10) | namespace mlc { type llm (line 11) | namespace llm { type json_ffi (line 12) | namespace json_ffi { FILE: cpp/json_ffi/openai_api_protocol.h function namespace (line 22) | namespace llm { FILE: cpp/metadata/model.cc type mlc (line 7) | namespace mlc { type llm (line 8) | namespace llm { function ModelMetadata (line 76) | ModelMetadata ModelMetadata::FromJSON(const tvm::ffi::json::Object& ... function ModelMetadata (line 139) | ModelMetadata ModelMetadata::FromModule(Module module, const tvm::ff... FILE: cpp/metadata/model.h function namespace (line 18) | namespace llm { FILE: cpp/multi_gpu/builtin.cc type mlc (line 18) | namespace mlc { type llm (line 19) | namespace llm { type multi_gpu (line 20) | namespace multi_gpu { function ObjectRef (line 28) | ObjectRef DispatchFunctionByGroup(tvm::ffi::AnyView vm_arg, function ObjectRef (line 59) | ObjectRef SendFromLastGroupToWorker0(Tensor send, Optional... function TVM_FFI_STATIC_INIT_BLOCK (line 90) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/multi_gpu/multi_gpu_loader.cc type mlc (line 29) | namespace mlc { type llm (line 30) | namespace llm { type multi_gpu (line 31) | namespace multi_gpu { class RangeTimer (line 42) | class RangeTimer { method RangeTimer (line 44) | explicit RangeTimer(DurationType* result) class PreprocessorPool (line 59) | class PreprocessorPool { method PreprocessorPool (line 61) | explicit PreprocessorPool(const ModelMetadata& model_metadata, M... method Tensor (line 79) | Tensor Apply(Tensor param, const ModelMetadata::Param& param_inf... type ParamInfo (line 96) | struct ParamInfo { function Tensor (line 101) | Tensor RecvFromGlobalWorker0(Device device, const ModelMetadata::P... function Tensor (line 108) | Tensor BroadcastOrShardAndScatter(Tensor param, const ModelMetadat... function Tensor (line 127) | Tensor ReceiveBroadcastedOrSharded(Device device, const ModelMetad... function FormatDuration (line 143) | std::string FormatDuration(DurationType duration) { function LoadMultiGPU (line 150) | Array> LoadMultiGPU(const std::string& model_path... function LoadMultiGPUPresharded (line 250) | Array> LoadMultiGPUPresharded(const std::string& ... function TVM_FFI_STATIC_INIT_BLOCK (line 314) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/serve/config.cc type mlc (line 18) | namespace mlc { type llm (line 19) | namespace llm { type serve (line 20) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 22) | TVM_FFI_STATIC_INIT_BLOCK() { function TotalDetectGlobalMemory (line 27) | uint64_t TotalDetectGlobalMemory(DLDevice device) { function GenerationConfig (line 365) | GenerationConfig GenerationConfig::GetDefaultFromModelConfig( function EngineConfig (line 423) | EngineConfig EngineConfig::FromJSONAndInferredConfig( function String (line 511) | String EngineConfigNode::AsJSONString() const { type ModelConfigLimits (line 550) | struct ModelConfigLimits { function BytesToMegabytesString (line 560) | inline std::string BytesToMegabytesString(double bytes) { function GetModelConfigLimits (line 570) | Result GetModelConfigLimits( type MemUsageEstimationResult (line 646) | struct MemUsageEstimationResult { function EstimateMemoryUsageOnMode (line 653) | Result EstimateMemoryUsageOnMode( function ModelsUseKVCache (line 1065) | Result ModelsUseKVCache(const std::vector, Array> SplitData(const Array& o... function ObjectRef (line 78) | ObjectRef TextDataNode::GetEmbedding(Model model, ObjectRef* dst, ... function TVM_FFI_STATIC_INIT_BLOCK (line 83) | TVM_FFI_STATIC_INIT_BLOCK() { function ObjectRef (line 106) | ObjectRef TokenDataNode::GetEmbedding(Model model, ObjectRef* dst,... function TVM_FFI_STATIC_INIT_BLOCK (line 110) | TVM_FFI_STATIC_INIT_BLOCK() { function ObjectRef (line 136) | ObjectRef ImageDataNode::GetEmbedding(Model model, ObjectRef* dst,... function TVM_FFI_STATIC_INIT_BLOCK (line 140) | TVM_FFI_STATIC_INIT_BLOCK() { function TokenToLogProbJSON (line 151) | inline void TokenToLogProbJSON(const Tokenizer& tokenizer, const T... function RequestStreamOutput (line 226) | RequestStreamOutput RequestStreamOutput::Usage(String request_id, function TVM_FFI_STATIC_INIT_BLOCK (line 234) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/serve/data.h function namespace (line 23) | namespace mlc { FILE: cpp/serve/draft_token_workspace_manager.cc type mlc (line 10) | namespace mlc { type llm (line 11) | namespace llm { type serve (line 12) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 14) | TVM_FFI_STATIC_INIT_BLOCK() { DraftTokenWorkspaceManagerObj::Regis... FILE: cpp/serve/draft_token_workspace_manager.h function namespace (line 17) | namespace mlc { FILE: cpp/serve/engine.cc type mlc (line 39) | namespace mlc { type llm (line 40) | namespace llm { type serve (line 41) | namespace serve { class EngineModule (line 47) | class EngineModule method Init (line 1043) | void Init(const std::string& engine_config_json_str, Device device, method Create (line 1054) | static ffi::Module Create() { return ffi::Module(tvm::ffi::make_... method AddRequest (line 1056) | void AddRequest(Request request) { return GetEngine()->AddReques... method Abort (line 1058) | void Abort(const String& request_id) { return GetEngine()->Abort... method Request (line 1060) | Request CreateRequest(String id, Array inputs, String gene... method Step (line 1067) | void Step() { return GetEngine()->Step(); } method FRequestStreamCallback (line 1069) | FRequestStreamCallback GetRequestStreamCallback() { method SetRequestStreamCallback (line 1073) | void SetRequestStreamCallback(FRequestStreamCallback request_str... method Reset (line 1077) | void Reset() { return GetEngine()->Reset(); } method String (line 1080) | String JSONMetrics() { return GetEngine()->JSONMetrics(); } method Engine (line 1083) | Engine* GetEngine() { function GetTokenizerInfo (line 50) | inline std::optional GetTokenizerInfo(const tvm::ff... function GetEnvSocketHostPort (line 72) | inline std::pair, int> GetEnvSocketHost... function StreamBackErrorImpl (line 86) | void StreamBackErrorImpl(Request request, FRequestStreamCallback r... function AbortRequestImpl (line 104) | void AbortRequestImpl(EngineState estate, const Array& mode... class MockEchoEngineImpl (line 158) | class MockEchoEngineImpl : public Engine { method Create (line 160) | static Result Create(const std::string& en... method Reset (line 188) | void Reset() final {} method Empty (line 190) | bool Empty() final { return request_map_.empty(); } method SetRequestStreamCallback (line 192) | void SetRequestStreamCallback(FRequestStreamCallback request_str... method FRequestStreamCallback (line 196) | FRequestStreamCallback GetRequestStreamCallback() final { return... method AddRequest (line 198) | void AddRequest(Request request) final { method AbortRequest (line 260) | void AbortRequest(const String& request_id) { method AbortAllRequests (line 283) | void AbortAllRequests() final { method Step (line 294) | void Step() final { method String (line 321) | String JSONMetrics() final { return "{}"; } method DebugCallFuncOnAllAllWorker (line 324) | void DebugCallFuncOnAllAllWorker(const String& func_name, Option... type MockRequestState (line 327) | struct MockRequestState { class EngineImpl (line 344) | class EngineImpl : public Engine { method Create (line 350) | static Result Create(const std::string& en... method Reset (line 505) | void Reset() final { method Empty (line 513) | bool Empty() final { return estate_->running_queue.empty() && es... method String (line 515) | String JSONMetrics() final { return tvm::ffi::json::Stringify(es... method FRequestStreamCallback (line 517) | FRequestStreamCallback GetRequestStreamCallback() final { method SetRequestStreamCallback (line 521) | void SetRequestStreamCallback(FRequestStreamCallback request_str... method StreamBackError (line 526) | void StreamBackError(Request request, String finish_reason) { method HandleSpecialRequests (line 532) | void HandleSpecialRequests(Request request) { method HandleDisaggRequest (line 550) | bool HandleDisaggRequest(Request request) { method AddRequest (line 665) | void AddRequest(Request request) final { method AbortRequest (line 727) | void AbortRequest(const String& request_id) final { method AbortAllRequests (line 731) | void AbortAllRequests() final { method Step (line 746) | void Step() final { method CreateDiscoSession (line 769) | std::tuple, int, std::vector> CreateDisco... method DebugCallFuncOnAllAllWorker (line 884) | void DebugCallFuncOnAllAllWorker(const String& func_name, Option... method AutoDecideEngineConfig (line 890) | Result AutoDecideEngineConfig( method SetThreadMaxConcurrency (line 964) | void SetThreadMaxConcurrency() { method GetGrammarFromResponseFormat (line 978) | std::optional GetGrammarFromResponseF... function ClearGlobalMemoryManager (line 1022) | void ClearGlobalMemoryManager() { class EngineModule (line 1028) | class EngineModule : public ffi::ModuleObj { method Init (line 1043) | void Init(const std::string& engine_config_json_str, Device device, method Create (line 1054) | static ffi::Module Create() { return ffi::Module(tvm::ffi::make_... method AddRequest (line 1056) | void AddRequest(Request request) { return GetEngine()->AddReques... method Abort (line 1058) | void Abort(const String& request_id) { return GetEngine()->Abort... method Request (line 1060) | Request CreateRequest(String id, Array inputs, String gene... method Step (line 1067) | void Step() { return GetEngine()->Step(); } method FRequestStreamCallback (line 1069) | FRequestStreamCallback GetRequestStreamCallback() { method SetRequestStreamCallback (line 1073) | void SetRequestStreamCallback(FRequestStreamCallback request_str... method Reset (line 1077) | void Reset() { return GetEngine()->Reset(); } method String (line 1080) | String JSONMetrics() { return GetEngine()->JSONMetrics(); } method Engine (line 1083) | Engine* GetEngine() { function TVM_FFI_STATIC_INIT_BLOCK (line 1092) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/serve/engine.h function namespace (line 15) | namespace mlc { FILE: cpp/serve/engine_actions/action.cc type mlc (line 8) | namespace mlc { type llm (line 9) | namespace llm { type serve (line 10) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 12) | TVM_FFI_STATIC_INIT_BLOCK() { EngineActionObj::RegisterReflection(... FILE: cpp/serve/engine_actions/action.h function namespace (line 18) | namespace mlc { FILE: cpp/serve/engine_actions/action_commons.cc type mlc (line 10) | namespace mlc { type llm (line 11) | namespace llm { type serve (line 12) | namespace serve { function CreateEngineActions (line 14) | Array CreateEngineActions(Array models, Engin... function RemoveRequestFromModel (line 137) | void RemoveRequestFromModel(EngineState estate, int64_t req_intern... function RemoveRequestStateEntry (line 151) | void RemoveRequestStateEntry(EngineState estate, const Array requests, EngineState es... function RequestStateEntry (line 331) | RequestStateEntry PreemptLastRunningRequestStateEntry( function ApplyLogitProcessorAndSample (line 427) | std::pair> ApplyLogitProcessorAn... FILE: cpp/serve/engine_actions/action_commons.h function namespace (line 19) | namespace mlc { FILE: cpp/serve/engine_actions/auto_spec_decode.cc type mlc (line 13) | namespace mlc { type llm (line 14) | namespace llm { type serve (line 15) | namespace serve { class AutoSpecDecodeActionObj (line 21) | class AutoSpecDecodeActionObj : public EngineActionObj { method AutoSpecDecodeActionObj (line 23) | explicit AutoSpecDecodeActionObj(Array spec_decode... method Step (line 30) | Array Step(EngineState estate) final { method CalculateDraftLength (line 54) | int CalculateDraftLength(EngineState estate, int num_running_rse... function EngineAction (line 80) | EngineAction EngineAction::AutoSpecDecode(std::vector models, Tokenizer tok... method Step (line 42) | Array Step(EngineState estate) final { method CanDecode (line 203) | bool CanDecode(int num_rsentries) { method RetokenizeWithNewToken (line 215) | std::pair> RetokenizeWithNewToken(Requ... method CommitTokenMayRetokenize (line 254) | void CommitTokenMayRetokenize(RequestStateEntry rsentry, Request... function EngineAction (line 316) | EngineAction EngineAction::BatchDecode(Array models, Tokeni... FILE: cpp/serve/engine_actions/batch_draft.cc type mlc (line 14) | namespace mlc { type llm (line 15) | namespace llm { type serve (line 16) | namespace serve { class BatchDraftActionObj (line 23) | class BatchDraftActionObj : public EngineActionObj { method BatchDraftActionObj (line 25) | explicit BatchDraftActionObj(Array models, LogitProcessor... method Step (line 38) | Array Step(EngineState estate) final { method CanDecode (line 304) | bool CanDecode(int num_rsentries) { method PrefillLaggedTokensByChunk (line 316) | void PrefillLaggedTokensByChunk(const Array& ... function EngineAction (line 395) | EngineAction EngineAction::BatchDraft(Array models, LogitPr... FILE: cpp/serve/engine_actions/batch_jumpforward.cc type mlc (line 18) | namespace mlc { type llm (line 19) | namespace llm { type serve (line 20) | namespace serve { class BatchJumpForwardActionObj (line 27) | class BatchJumpForwardActionObj : public EngineActionObj { method BatchJumpForwardActionObj (line 29) | explicit BatchJumpForwardActionObj(Array models, Tokenize... method Step (line 35) | Array Step(EngineState estate) final { method CheckMemForJumpForward (line 103) | bool CheckMemForJumpForward(int num_rsentries) { method CanJumpForward (line 111) | bool CanJumpForward(const RequestStateEntry& rsentry) { method RetokenizeWithNewString (line 133) | std::tuple, std::string> RetokenizeWit... method HandleRollback (line 188) | void HandleRollback(const RequestStateEntry& rsentry, RequestMod... function EngineAction (line 231) | EngineAction EngineAction::BatchJumpForward(Array models, T... FILE: cpp/serve/engine_actions/batch_prefill_base.cc type mlc (line 12) | namespace mlc { type llm (line 13) | namespace llm { type serve (line 14) | namespace serve { function HasPrefillSpace (line 16) | bool HasPrefillSpace(int num_required_pages, bool sliding_window_e... FILE: cpp/serve/engine_actions/batch_prefill_base.h function namespace (line 13) | namespace mlc { FILE: cpp/serve/engine_actions/batch_verify.cc type mlc (line 19) | namespace mlc { type llm (line 20) | namespace llm { type serve (line 21) | namespace serve { class BatchVerifyActionObj (line 28) | class BatchVerifyActionObj : public EngineActionObj { method BatchVerifyActionObj (line 30) | explicit BatchVerifyActionObj(Array models, LogitProcesso... method Step (line 44) | Array Step(EngineState estate) final { type DraftRequestStateEntries (line 277) | struct DraftRequestStateEntries { method DraftRequestStateEntries (line 292) | DraftRequestStateEntries GetDraftsToVerify(EngineState estate) { method CanVerify (line 337) | bool CanVerify(int num_required_pages) { function EngineAction (line 369) | EngineAction EngineAction::BatchVerify(Array models, LogitP... FILE: cpp/serve/engine_actions/disagg_prepare_recv.cc type mlc (line 12) | namespace mlc { type llm (line 13) | namespace llm { type serve (line 14) | namespace serve { class DisaggPrepareReceiveActionObj (line 21) | class DisaggPrepareReceiveActionObj : public BatchPrefillBaseActio... method DisaggPrepareReceiveActionObj (line 23) | explicit DisaggPrepareReceiveActionObj(Array models, Engi... method Step (line 34) | Array Step(EngineState estate) final { method GetRequestStateEntriesToPrefill (line 186) | std::optional GetRequestStateEntriesToPrefill(Engi... method CanPrefill (line 324) | bool CanPrefill(EngineState estate, int num_prefill_rsentries, i... method MatchPrefixCache (line 354) | int MatchPrefixCache(EngineState estate, PrefillInput* input) fi... function EngineAction (line 432) | EngineAction EngineAction::DisaggPrepareReceive(Array model... FILE: cpp/serve/engine_actions/disagg_remote_send.cc type mlc (line 9) | namespace mlc { type llm (line 10) | namespace llm { type serve (line 11) | namespace serve { class DisaggRemoteSendActionObj (line 19) | class DisaggRemoteSendActionObj : public BatchPrefillBaseActionObj { method DisaggRemoteSendActionObj (line 21) | explicit DisaggRemoteSendActionObj(Array models, method Step (line 40) | Array Step(EngineState estate) final { method GetRequestStateEntriesToPrefill (line 174) | std::vector GetRequestStateEntriesToPrefill(Engine... method MatchPrefixCache (line 385) | int MatchPrefixCache(EngineState estate, PrefillInput* input) fi... function EngineAction (line 487) | EngineAction EngineAction::DisaggRemoteSend( FILE: cpp/serve/engine_actions/eagle_batch_draft.cc type mlc (line 14) | namespace mlc { type llm (line 15) | namespace llm { type serve (line 16) | namespace serve { class EagleBatchDraftActionObj (line 23) | class EagleBatchDraftActionObj : public EngineActionObj { method EagleBatchDraftActionObj (line 25) | explicit EagleBatchDraftActionObj(Array models, LogitProc... method Step (line 38) | Array Step(EngineState estate) final { method CanDecode (line 190) | bool CanDecode(int num_rsentries) { function EngineAction (line 220) | EngineAction EngineAction::EagleBatchDraft(Array models, Lo... FILE: cpp/serve/engine_actions/eagle_batch_verify.cc type mlc (line 19) | namespace mlc { type llm (line 20) | namespace llm { type serve (line 21) | namespace serve { class EagleBatchVerifyActionObj (line 28) | class EagleBatchVerifyActionObj : public EngineActionObj { method EagleBatchVerifyActionObj (line 30) | explicit EagleBatchVerifyActionObj(Array models, LogitPro... method Step (line 44) | Array Step(EngineState estate) final { type DraftRequestStateEntries (line 347) | struct DraftRequestStateEntries { method DraftRequestStateEntries (line 362) | DraftRequestStateEntries GetDraftsToVerify(EngineState estate) { method CanVerify (line 397) | bool CanVerify(int num_required_pages) { method UpdateRequestStatesWithDraftProposals (line 402) | void UpdateRequestStatesWithDraftProposals(const Array models, Lo... method Step (line 33) | Array Step(EngineState estate) final { method UpdateRequestStatesWithDraftProposals (line 344) | void UpdateRequestStatesWithDraftProposals( method MatchPrefixCache (line 393) | int MatchPrefixCache(EngineState estate, PrefillInput* input) fi... function EngineAction (line 485) | EngineAction EngineAction::EagleNewRequestPrefill( FILE: cpp/serve/engine_actions/new_request_prefill.cc type mlc (line 9) | namespace mlc { type llm (line 10) | namespace llm { type serve (line 11) | namespace serve { class NewRequestPrefillActionObj (line 17) | class NewRequestPrefillActionObj : public BatchPrefillBaseActionObj { method NewRequestPrefillActionObj (line 19) | explicit NewRequestPrefillActionObj(Array models, LogitPr... method Step (line 30) | Array Step(EngineState estate) final { method MatchPrefixCache (line 280) | int MatchPrefixCache(EngineState estate, PrefillInput* input) fi... function EngineAction (line 352) | EngineAction EngineAction::NewRequestPrefill(Array models, ... FILE: cpp/serve/engine_state.cc type mlc (line 7) | namespace mlc { type llm (line 8) | namespace llm { type serve (line 9) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 11) | TVM_FFI_STATIC_INIT_BLOCK() { EngineStateObj::RegisterReflection(); } function RequestState (line 28) | RequestState EngineStateObj::GetRequestState(Request request) { FILE: cpp/serve/engine_state.h function namespace (line 16) | namespace mlc { FILE: cpp/serve/event_trace_recorder.cc type mlc (line 19) | namespace mlc { type llm (line 20) | namespace llm { type serve (line 21) | namespace serve { type detail (line 25) | namespace detail { type PairHash (line 27) | struct PairHash { class EventTraceRecorderImpl (line 39) | class EventTraceRecorderImpl : public EventTraceRecorderObj { method AddEvent (line 41) | void AddEvent(const String& request_id, const std::string& event... method AddEvent (line 52) | void AddEvent(const Array& request_ids, const std::strin... method DumpJSON (line 65) | std::string DumpJSON() final { method AddEventInternal (line 124) | void AddEventInternal(const std::string& request_id, const std::... function EventTraceRecorder (line 146) | EventTraceRecorder EventTraceRecorder::Create() { function TVM_FFI_STATIC_INIT_BLOCK (line 150) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/serve/event_trace_recorder.h function namespace (line 16) | namespace mlc { FILE: cpp/serve/function_table.cc type mlc (line 24) | namespace mlc { type llm (line 25) | namespace llm { type serve (line 26) | namespace serve { function GetDiscoWorkerCPUBinding (line 28) | Optional GetDiscoWorkerCPUBinding(int num_workers) { function Function (line 53) | Function FunctionTable::SessionFuncAsPackedFunc(Session sess, DRef... function ObjectRef (line 155) | ObjectRef FunctionTable::LoadParams(const std::string& model_path,... function ObjectRef (line 294) | ObjectRef FunctionTable::Empty(Shape shape, DataType dtype, Device... function ObjectRef (line 305) | ObjectRef FunctionTable::CopyToWorker0(const Tensor& host_array, S... FILE: cpp/serve/function_table.h function namespace (line 23) | namespace mlc { FILE: cpp/serve/logit_processor.cc type mlc (line 13) | namespace mlc { type llm (line 14) | namespace llm { type serve (line 15) | namespace serve { function CopyArray (line 17) | inline void CopyArray(Tensor src, Tensor dst, TVMStreamHandle copy... function SyncCopyStream (line 22) | inline void SyncCopyStream(Device device, TVMStreamHandle compute_... function TVM_FFI_STATIC_INIT_BLOCK (line 34) | TVM_FFI_STATIC_INIT_BLOCK() { LogitProcessorObj::RegisterReflectio... class LogitProcessorImpl (line 36) | class LogitProcessorImpl : public LogitProcessorObj { method LogitProcessorImpl (line 39) | explicit LogitProcessorImpl(int max_num_token, int vocab_size, F... method InplaceUpdateLogits (line 99) | void InplaceUpdateLogits(Tensor logits, ... method Tensor (line 153) | Tensor ComputeProbsFromLogits(Tensor logits, const Array GetMultiStepLogits(const ObjectRef& hidden_states)... method ObjectRef (line 200) | ObjectRef FuseEmbedHidden(const ObjectRef& embeddings, const Obj... method Tensor (line 243) | Tensor BatchPrefill(const ObjectRef& embeddings, const std::vect... method ObjectRef (line 352) | ObjectRef BatchPrefillToLastHidden(const ObjectRef& embedding_or... method Tensor (line 420) | Tensor BatchDecode(const ObjectRef& embeddings, const std::vecto... method Tensor (line 488) | Tensor BatchTreeDecode(const ObjectRef& embeddings, const std::v... method ObjectRef (line 561) | ObjectRef BatchDecodeToLastHidden(const ObjectRef& hidden_states... method Tensor (line 612) | Tensor BatchVerify(const ObjectRef& embeddings, const std::vecto... method ObjectRef (line 684) | ObjectRef BatchVerifyToLastHidden(const ObjectRef& embeddings, method CreateKVCache (line 752) | void CreateKVCache(int page_size, int max_num_sequence, int64_t ... method AddNewSequence (line 783) | void AddNewSequence(int64_t seq_id) final { method ForkSequence (line 790) | void ForkSequence(int64_t parent_seq_id, int64_t child_seq_id, i... method RemoveSequence (line 798) | void RemoveSequence(int64_t seq_id) final { method PopNFromKVCache (line 806) | void PopNFromKVCache(int64_t seq_id, int num_tokens) final { method CommitAcceptedTokenTreeNodesToKVCache (line 813) | void CommitAcceptedTokenTreeNodesToKVCache( method EnableSlidingWindowForSeq (line 822) | void EnableSlidingWindowForSeq(int64_t seq_id) final { method IntTuple (line 832) | IntTuple DisaggPrepareKVRecv(int64_t seq_id, int length) final { method DisaggMarkKVSend (line 851) | void DisaggMarkKVSend(int64_t seq_id, int begin_pos, IntTuple co... method ModelMetadata (line 866) | ModelMetadata GetMetadata() const final { return ft_.model_metad... method GetNumAvailablePages (line 868) | int GetNumAvailablePages() const final { method GetCurrentTotalSequenceLength (line 877) | int GetCurrentTotalSequenceLength() const final { method LoadParams (line 888) | void LoadParams() final { this->params_ = ft_.LoadParams(model_,... method SetMaxNumSequence (line 890) | void SetMaxNumSequence(int max_num_sequence) final { method SetPrefillChunkSize (line 896) | void SetPrefillChunkSize(int prefill_chunk_size) final { method LogitProcessor (line 913) | LogitProcessor CreateLogitProcessor(int max_num_token, method Sampler (line 919) | Sampler CreateSampler(int max_num_sample, int num_models, method EstimateHostCPURequirement (line 929) | int EstimateHostCPURequirement() const final { method GetSlidingWindowSize (line 934) | int GetSlidingWindowSize() const final { return sliding_window_s... method GetAttentionSinkSize (line 936) | int GetAttentionSinkSize() const final { return attention_sink_s... method ObjectRef (line 938) | ObjectRef AllocEmbeddingTensor() final { method ObjectRef (line 961) | ObjectRef AllocHiddenStatesTensor() final { method Reset (line 985) | void Reset() final { method DraftTokenWorkspaceManager (line 994) | DraftTokenWorkspaceManager CreateDraftTokenWorkspaceManager(int ... method ObjectRef (line 999) | ObjectRef GatherHiddenStates(const ObjectRef& input, const std::... method ScatterHiddenStates (line 1018) | void ScatterHiddenStates(const ObjectRef& input, const std::vect... method Tensor (line 1028) | Tensor GatherDraftProbs(const Tensor& input, const std::vector GetMedusaLogits(const ObjectRef& hidden_states) { method DebugCallFuncOnAllAllWorker (line 1064) | void DebugCallFuncOnAllAllWorker(const String& func_name, Option... method LoadModelConfigJSON (line 1070) | void LoadModelConfigJSON(const tvm::ffi::json::Object& config) { function Model (line 31) | Model Model::Create(String reload_lib_path, String model_path, class ModelImpl (line 58) | class ModelImpl : public ModelObj { method ModelImpl (line 64) | explicit ModelImpl(String reload_lib_path, String model_path, tv... method ObjectRef (line 85) | ObjectRef TokenEmbed(IntTuple token_ids, ObjectRef* dst, int off... method ObjectRef (line 126) | ObjectRef ImageEmbed(const Tensor& image, ObjectRef* dst, int of... method CanGetLogits (line 154) | bool CanGetLogits() final { method Tensor (line 158) | Tensor GetLogits(const ObjectRef& hidden_states) final { method GetMultiStepLogits (line 184) | Array GetMultiStepLogits(const ObjectRef& hidden_states)... method ObjectRef (line 200) | ObjectRef FuseEmbedHidden(const ObjectRef& embeddings, const Obj... method Tensor (line 243) | Tensor BatchPrefill(const ObjectRef& embeddings, const std::vect... method ObjectRef (line 352) | ObjectRef BatchPrefillToLastHidden(const ObjectRef& embedding_or... method Tensor (line 420) | Tensor BatchDecode(const ObjectRef& embeddings, const std::vecto... method Tensor (line 488) | Tensor BatchTreeDecode(const ObjectRef& embeddings, const std::v... method ObjectRef (line 561) | ObjectRef BatchDecodeToLastHidden(const ObjectRef& hidden_states... method Tensor (line 612) | Tensor BatchVerify(const ObjectRef& embeddings, const std::vecto... method ObjectRef (line 684) | ObjectRef BatchVerifyToLastHidden(const ObjectRef& embeddings, method CreateKVCache (line 752) | void CreateKVCache(int page_size, int max_num_sequence, int64_t ... method AddNewSequence (line 783) | void AddNewSequence(int64_t seq_id) final { method ForkSequence (line 790) | void ForkSequence(int64_t parent_seq_id, int64_t child_seq_id, i... method RemoveSequence (line 798) | void RemoveSequence(int64_t seq_id) final { method PopNFromKVCache (line 806) | void PopNFromKVCache(int64_t seq_id, int num_tokens) final { method CommitAcceptedTokenTreeNodesToKVCache (line 813) | void CommitAcceptedTokenTreeNodesToKVCache( method EnableSlidingWindowForSeq (line 822) | void EnableSlidingWindowForSeq(int64_t seq_id) final { method IntTuple (line 832) | IntTuple DisaggPrepareKVRecv(int64_t seq_id, int length) final { method DisaggMarkKVSend (line 851) | void DisaggMarkKVSend(int64_t seq_id, int begin_pos, IntTuple co... method ModelMetadata (line 866) | ModelMetadata GetMetadata() const final { return ft_.model_metad... method GetNumAvailablePages (line 868) | int GetNumAvailablePages() const final { method GetCurrentTotalSequenceLength (line 877) | int GetCurrentTotalSequenceLength() const final { method LoadParams (line 888) | void LoadParams() final { this->params_ = ft_.LoadParams(model_,... method SetMaxNumSequence (line 890) | void SetMaxNumSequence(int max_num_sequence) final { method SetPrefillChunkSize (line 896) | void SetPrefillChunkSize(int prefill_chunk_size) final { method LogitProcessor (line 913) | LogitProcessor CreateLogitProcessor(int max_num_token, method Sampler (line 919) | Sampler CreateSampler(int max_num_sample, int num_models, method EstimateHostCPURequirement (line 929) | int EstimateHostCPURequirement() const final { method GetSlidingWindowSize (line 934) | int GetSlidingWindowSize() const final { return sliding_window_s... method GetAttentionSinkSize (line 936) | int GetAttentionSinkSize() const final { return attention_sink_s... method ObjectRef (line 938) | ObjectRef AllocEmbeddingTensor() final { method ObjectRef (line 961) | ObjectRef AllocHiddenStatesTensor() final { method Reset (line 985) | void Reset() final { method DraftTokenWorkspaceManager (line 994) | DraftTokenWorkspaceManager CreateDraftTokenWorkspaceManager(int ... method ObjectRef (line 999) | ObjectRef GatherHiddenStates(const ObjectRef& input, const std::... method ScatterHiddenStates (line 1018) | void ScatterHiddenStates(const ObjectRef& input, const std::vect... method Tensor (line 1028) | Tensor GatherDraftProbs(const Tensor& input, const std::vector GetMedusaLogits(const ObjectRef& hidden_states) { method DebugCallFuncOnAllAllWorker (line 1064) | void DebugCallFuncOnAllAllWorker(const String& func_name, Option... method LoadModelConfigJSON (line 1070) | void LoadModelConfigJSON(const tvm::ffi::json::Object& config) { function TVM_FFI_STATIC_INIT_BLOCK (line 1127) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/serve/model.h type ModelWorkspace (line 39) | struct ModelWorkspace { function ObjectRef (line 49) | ObjectRef hidden_states{nullptr}; FILE: cpp/serve/prefix_cache.cc type mlc (line 10) | namespace mlc { type llm (line 11) | namespace llm { type serve (line 12) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 16) | TVM_FFI_STATIC_INIT_BLOCK() { PrefixCacheObj::RegisterReflection(); } class PrefixCacheImpl (line 21) | class PrefixCacheImpl : public PrefixCacheObj { method PrefixCacheImpl (line 28) | explicit PrefixCacheImpl(size_t max_num_recycling_seqs, PrefixCa... method PrefixCacheMatchedResult (line 48) | PrefixCacheMatchedResult InsertSequence(int64_t seq_id, std::vec... method ExtendSequence (line 149) | void ExtendSequence(int64_t seq_id, const std::vector& ... method CommitSequenceExtention (line 153) | void CommitSequenceExtention() final { method RollBackSequence (line 176) | void RollBackSequence(int64_t seq_id, size_t num_tokens) final { method RecycleSequence (line 190) | void RecycleSequence(int64_t seq_id, bool lazy = true) final { method TryFreeMemory (line 224) | bool TryFreeMemory() final { method HasSequence (line 250) | bool HasSequence(int64_t seq_id) final { return radix_tree_->Has... method Reset (line 255) | void Reset() final { method PrefixCacheMode (line 265) | PrefixCacheMode Mode() final { return PrefixCacheMode::kRadix; } method ReuseRecyclingSequence (line 268) | void ReuseRecyclingSequence(int64_t seq_id) { type SequenceState (line 280) | enum class SequenceState : int { class NoPrefixCache (line 344) | class NoPrefixCache : public PrefixCacheObj { method PrefixCacheMatchedResult (line 355) | PrefixCacheMatchedResult InsertSequence(int64_t seq_id, std::vec... method ExtendSequence (line 367) | void ExtendSequence(int64_t seq_id, const std::vector& ... method CommitSequenceExtention (line 371) | void CommitSequenceExtention() final { method RollBackSequence (line 381) | void RollBackSequence(int64_t seq_id, size_t num_tokens) final { method RecycleSequence (line 394) | void RecycleSequence(int64_t seq_id, bool lazy = true) final { method TryFreeMemory (line 404) | bool TryFreeMemory() final { method HasSequence (line 414) | bool HasSequence(int64_t seq_id) final { method Reset (line 422) | void Reset() final {} method PrefixCacheMode (line 424) | PrefixCacheMode Mode() final { return PrefixCacheMode::kDisable; } function PrefixCache (line 427) | PrefixCache PrefixCache::CreateRadixPrefixCache(size_t max_num_rec... function PrefixCache (line 434) | PrefixCache PrefixCache::CreateNoPrefixCache() { FILE: cpp/serve/prefix_cache.h function namespace (line 20) | namespace mlc { FILE: cpp/serve/radix_tree.cc type mlc (line 11) | namespace mlc { type llm (line 12) | namespace llm { type serve (line 13) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 17) | TVM_FFI_STATIC_INIT_BLOCK() { PagedRadixTreeObj::RegisterReflectio... type SequenceIDNode (line 22) | struct SequenceIDNode { class SequenceIDNodePool (line 35) | class SequenceIDNodePool { method SequenceIDNodePool (line 38) | SequenceIDNodePool() { method SequenceIDNode (line 50) | SequenceIDNode* Allocate(int64_t seq_id, SequenceIDNode* next) { method Free (line 68) | void Free(SequenceIDNode* node) { method Reset (line 77) | void Reset() { method NewNodeBlock_ (line 107) | void NewNodeBlock_() { type RadixPage (line 137) | struct RadixPage { method Extend (line 172) | void Extend(const int32_t* suffix, size_t suffix_length) { method AddSequence (line 185) | void AddSequence(SequenceIDNodePool* pool, int64_t id) { seq_ids... method PopSequence (line 193) | void PopSequence(SequenceIDNodePool* pool, int64_t id) { method GetLocalSequence (line 222) | std::vector GetLocalSequence() { method FindAnyChildSequence (line 236) | int32_t FindAnyChildSequence() { method FindAllChildSequence (line 246) | std::vector FindAllChildSequence() { method Iterate (line 263) | void Iterate(CallbackFunc f) { method RadixPage (line 274) | RadixPage* GetLastSibling() { method RadixPage (line 287) | RadixPage* FindChild(int64_t first_token) { method InsertChild (line 297) | void InsertChild(RadixPage* child) { method RemoveChild (line 307) | void RemoveChild(RadixPage* child) { method Mergeable (line 325) | bool Mergeable() { method MatchPrefix (line 341) | size_t MatchPrefix(const int32_t* prefix, size_t prefix_length) { class RadixPagePool (line 356) | class RadixPagePool { method RadixPagePool (line 359) | RadixPagePool() { method RadixPage (line 369) | RadixPage* Allocate() { method Free (line 389) | void Free(RadixPage* page) { method FreeCapacity (line 400) | size_t FreeCapacity() { return free_page_indices_.size() * kPage... method Reset (line 405) | void Reset() { method NewPageBlock_ (line 443) | void NewPageBlock_() { class PagedRadixTreeImpl (line 460) | class PagedRadixTreeImpl : public PagedRadixTreeObj { method PagedRadixTreeImpl (line 471) | explicit PagedRadixTreeImpl() { method HasSequence (line 487) | bool HasSequence(int64_t seq_id) { return seq2page.find(seq_id) ... method IntTuple (line 495) | IntTuple GetSequence(int64_t seq_id) { method MatchPrefix (line 514) | std::pair> MatchPrefix(const std::v... method GetSequenceLength (line 528) | size_t GetSequenceLength(int64_t seq_id) { method ForkSequence (line 547) | void ForkSequence(int64_t seq_id, int64_t parent_seq_id, size_t ... method AddSequence (line 572) | void AddSequence(int64_t seq_id) { method ExtendSequence (line 585) | void ExtendSequence(int64_t seq_id, const std::vector& ... method RollBackSequence (line 625) | void RollBackSequence(int64_t seq_id, size_t num_tokens) { method RemoveSequence (line 672) | void RemoveSequence(int64_t seq_id) { method FreeCapacity (line 692) | size_t FreeCapacity() { return radix_page_pool->FreeCapacity(); } method Reset (line 694) | void Reset() { method MergePage (line 717) | void MergePage(RadixPage* page) { method RadixPage (line 743) | RadixPage* SplitPage(RadixPage* page, size_t offset) { method MatchSequence (line 779) | std::tuple MatchSequence(RadixPage* ... function PagedRadixTree (line 801) | PagedRadixTree PagedRadixTree::Create() { function TVM_FFI_STATIC_INIT_BLOCK (line 805) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/serve/radix_tree.h function namespace (line 15) | namespace mlc { FILE: cpp/serve/request.cc type mlc (line 13) | namespace mlc { type llm (line 14) | namespace llm { type serve (line 15) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 19) | TVM_FFI_STATIC_INIT_BLOCK() { RequestNode::RegisterReflection(); } function Request (line 47) | Request Request::FromUntokenized(const Request& request, const Tok... function TVM_FFI_STATIC_INIT_BLOCK (line 71) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/serve/request.h function namespace (line 18) | namespace mlc { FILE: cpp/serve/request_state.cc type mlc (line 10) | namespace mlc { type llm (line 11) | namespace llm { type serve (line 12) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 14) | TVM_FFI_STATIC_INIT_BLOCK() { function RequestStreamOutput (line 117) | RequestStreamOutput RequestActionPostProcWorkspace::GetStreamOutpu... FILE: cpp/serve/request_state.h function namespace (line 23) | namespace llm { FILE: cpp/serve/sampler/cpu_sampler.cc type mlc (line 16) | namespace mlc { type llm (line 17) | namespace llm { type serve (line 18) | namespace serve { function TVM_FFI_STATIC_INIT_BLOCK (line 20) | TVM_FFI_STATIC_INIT_BLOCK() { SamplerObj::RegisterReflection(); } function TokenProbPair (line 35) | TokenProbPair SampleTopPFromProb(Tensor prob, int unit_offset, int... function RenormalizeProbByTopP (line 172) | void RenormalizeProbByTopP(Tensor prob, int unit_offset, double to... type detail (line 262) | namespace detail { function ComputeTopProbsImpl (line 266) | std::vector ComputeTopProbsImpl(const float* p_pr... function ComputeTopProbs (line 302) | inline std::vector ComputeTopProbs(Tensor prob, int... class CPUSampler (line 327) | class CPUSampler : public SamplerObj { method CPUSampler (line 329) | explicit CPUSampler(Optional trace_recorder) method Tensor (line 332) | Tensor BatchRenormalizeProbsByTopP(Tensor probs_on_device, ... method BatchSampleTokensWithProbBeforeTopP (line 375) | std::vector BatchSampleTokensWithProbBeforeTopP( method BatchSampleTokensWithProbAfterTopP (line 392) | std::vector BatchSampleTokensWithProbAfterTopP( method BatchVerifyDraftTokensWithProbAfterTopP (line 402) | std::pair>, std::vector> method BatchSampleTokensImpl (line 506) | std::vector BatchSampleTokensImpl(Tensor probs_on_... method Tensor (line 546) | Tensor CopyProbsToCPU(Tensor probs_on_device) { function Sampler (line 582) | Sampler Sampler::CreateCPUSampler(Optional tra... FILE: cpp/serve/sampler/gpu_sampler.cc type mlc (line 14) | namespace mlc { type llm (line 15) | namespace llm { type serve (line 16) | namespace serve { function FlashInferSamplingAvailable (line 18) | inline bool FlashInferSamplingAvailable(Device device) { function CopyArray (line 32) | inline void CopyArray(Tensor src, Tensor dst, TVMStreamHandle copy... function SyncCopyStream (line 37) | inline void SyncCopyStream(Device device, TVMStreamHandle compute_... class GPUSampler (line 49) | class GPUSampler : public SamplerObj { method GPUSampler (line 51) | explicit GPUSampler(int max_num_sample, int vocab_size, Function... method Tensor (line 122) | Tensor BatchRenormalizeProbsByTopP(Tensor probs_on_device, ... method BatchSampleTokensWithProbBeforeTopP (line 177) | std::vector BatchSampleTokensWithProbBeforeTopP( method BatchSampleTokensWithProbAfterTopP (line 188) | std::vector BatchSampleTokensWithProbAfterTopP( method BatchVerifyDraftTokensWithProbAfterTopP (line 199) | std::pair>, std::vector> method BatchSampleTokensImpl (line 358) | std::vector BatchSampleTokensImpl(Tensor probs_on_... method CollectSampleResult (line 409) | std::vector CollectSampleResult(const std::vector<... method ChunkSampleTokensImpl (line 438) | std::vector ChunkSampleTokensImpl(Tensor probs_on_... method Tensor (line 478) | Tensor GenerateUniformSamples(const std::vector& sample_ind... method CheckTopP (line 519) | bool CheckTopP(const Array& generation_cfg, method CheckProbValues (line 544) | bool CheckProbValues(const Array& generation_cfg, method SampleOnGPU (line 565) | std::vector SampleOnGPU(Tensor probs_on_device, Tensor u... method CopyArraysToCPU (line 655) | std::vector CopyArraysToCPU(const std::vector& d... function Sampler (line 746) | Sampler Sampler::CreateGPUSampler(int max_num_sample, int vocab_si... FILE: cpp/serve/sampler/sampler.h function namespace (line 20) | namespace mlc { FILE: cpp/serve/threaded_engine.cc type mlc (line 22) | namespace mlc { type llm (line 23) | namespace llm { type serve (line 24) | namespace serve { type InstructionKind (line 30) | enum class InstructionKind : int { class ThreadedEngineImpl (line 40) | class ThreadedEngineImpl : public ThreadedEngine { method InitThreadedEngine (line 42) | void InitThreadedEngine(Device device, Optional reques... method Reload (line 51) | void Reload(String engine_config_json_str) final { method Unload (line 73) | void Unload() final { method Reset (line 96) | void Reset() final { method AddRequest (line 109) | void AddRequest(Request request) final { method AbortRequest (line 122) | void AbortRequest(const String& request_id) final { method RunBackgroundLoop (line 135) | void RunBackgroundLoop() final { method RunBackgroundStreamBackLoop (line 190) | void RunBackgroundStreamBackLoop() final { method ExitBackgroundLoop (line 222) | void ExitBackgroundLoop() final { method GenerationConfig (line 233) | GenerationConfig GetDefaultGenerationConfig() const final { method Request (line 239) | Request CreateRequest(String id, Array inputs, String gene... method EngineConfig (line 246) | EngineConfig GetCompleteEngineConfig() const final { method String (line 251) | String GetCompleteEngineConfigJSONString() const { method DebugCallFuncOnAllAllWorker (line 255) | void DebugCallFuncOnAllAllWorker(const String& func_name, Option... method EngineReloadImpl (line 270) | void EngineReloadImpl(const std::string& engine_config_json_str) { method EngineUnloadImpl (line 300) | void EngineUnloadImpl() { class ThreadedEngineModule (line 383) | class ThreadedEngineModule : public ThreadedEngineImpl, public ffi... function TVM_FFI_STATIC_INIT_BLOCK (line 403) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/serve/threaded_engine.h function namespace (line 13) | namespace mlc { FILE: cpp/support/debug_utils.h function namespace (line 11) | namespace mlc { FILE: cpp/support/dynamic_bitset.h function namespace (line 15) | namespace mlc { function const (line 89) | bool operator[](int index) const { function Set (line 98) | void Set() { function Reset (line 114) | void Reset() { function Reset (line 120) | void Reset(int index) { Set(index, false); } FILE: cpp/support/encoding.cc type mlc (line 11) | namespace mlc { type llm (line 12) | namespace llm { function PrintAsUTF8 (line 14) | std::string PrintAsUTF8(TCodepoint codepoint) { function PrintAsEscaped (line 39) | std::string PrintAsEscaped( function PrintAsEscaped (line 68) | std::string PrintAsEscaped(uint8_t raw_char) { return PrintAsEscaped... function PrintAsEscaped (line 70) | std::string PrintAsEscaped(std::string raw_str) { function HandleUTF8FirstByte (line 79) | std::tuple HandleUTF8FirstByte(uint8_t byte) { function ParseNextUTF8 (line 108) | std::pair ParseNextUTF8(const char* utf8, U... function ParseUTF8 (line 133) | std::vector ParseUTF8(const char* utf8, UTF8ErrorPolicy ... function HexCharToInt (line 146) | inline int HexCharToInt(char c) { function ParseNextUTF8OrEscaped (line 158) | std::pair ParseNextUTF8OrEscaped( FILE: cpp/support/encoding.h function TCodepoint (line 62) | enum CharHandlingError : TCodepoint { FILE: cpp/support/json_parser.h function namespace (line 18) | namespace mlc { function namespace (line 205) | namespace details { FILE: cpp/support/load_bytes_from_file.h function namespace (line 14) | namespace mlc { FILE: cpp/support/progress_bar.h function namespace (line 12) | namespace mlc { FILE: cpp/support/random.h function namespace (line 12) | namespace mlc { FILE: cpp/support/result.h function namespace (line 14) | namespace mlc { FILE: cpp/support/utils.h function namespace (line 18) | namespace mlc { FILE: cpp/support/vlm_utils.cc type mlc (line 9) | namespace mlc { type llm (line 10) | namespace llm { function CalculateResizeShape (line 12) | void CalculateResizeShape(tvm::runtime::Tensor image_data, std::stri... function CalculatePadShape (line 31) | void CalculatePadShape(tvm::runtime::Tensor image_data, std::string ... function CalculateCropShape (line 47) | void CalculateCropShape(tvm::runtime::Tensor image_data, std::string... FILE: cpp/support/vlm_utils.h function namespace (line 13) | namespace mlc { FILE: cpp/tokenizers/streamer.cc type mlc (line 17) | namespace mlc { type llm (line 18) | namespace llm { function TVM_FFI_STATIC_INIT_BLOCK (line 20) | TVM_FFI_STATIC_INIT_BLOCK() { function TVM_FFI_STATIC_INIT_BLOCK (line 146) | TVM_FFI_STATIC_INIT_BLOCK() { function CreatePartialMatchTable (line 162) | inline std::vector CreatePartialMatchTable(const String& str) { function TVM_FFI_STATIC_INIT_BLOCK (line 269) | TVM_FFI_STATIC_INIT_BLOCK() { FILE: cpp/tokenizers/streamer.h function namespace (line 17) | namespace mlc { FILE: cpp/tokenizers/tokenizers.cc type mlc (line 24) | namespace mlc { type llm (line 25) | namespace llm { function TVM_FFI_STATIC_INIT_BLOCK (line 27) | TVM_FFI_STATIC_INIT_BLOCK() { function String (line 34) | String TokenizerInfoNode::AsJSONString() const { function TokenizerInfo (line 42) | TokenizerInfo TokenizerInfo::FromJSONString(String json_string) { function DynamicBitset (line 104) | const DynamicBitset& TokenizerObj::GetPrefixTokenMask() { function Tokenizer (line 143) | Tokenizer Tokenizer::FromPath(const String& _path, std::optional None: function _debug_cuda_profiler_stop (line 37) | def _debug_cuda_profiler_stop() -> None: FILE: python/mlc_llm/bench/__main__.py function _parse_num_concurrent_requests (line 34) | def _parse_num_concurrent_requests(num_str: Optional[str]) -> Optional[L... function _parse_request_rate (line 43) | def _parse_request_rate(request_rate_str: Optional[str]) -> Optional[Lis... function _parse_mlc_engine_config (line 56) | def _parse_mlc_engine_config(config_str: Optional[str]) -> EngineConfig: function _launch_mlc_server (line 76) | def _launch_mlc_server(args: argparse.argparse.Namespace): function run_pipeline (line 88) | def run_pipeline( function query_mlc_server_metrics (line 119) | def query_mlc_server_metrics(host: str, port: int): function main (line 129) | def main(args: argparse.argparse.Namespace): FILE: python/mlc_llm/bench/api_endpoint.py class APIEndPoint (line 18) | class APIEndPoint: method __init__ (line 23) | def __init__(self, include_server_metrics: bool = False) -> None: method __aenter__ (line 26) | async def __aenter__(self) -> Self: method __aexit__ (line 29) | async def __aexit__(self, exc_type, exc_value, tb) -> None: method __call__ (line 32) | async def __call__(self, request: RequestRecord) -> RequestRecord: class OpenAIChatEndPoint (line 36) | class OpenAIChatEndPoint(APIEndPoint): method __init__ (line 39) | def __init__( # pylint: disable=too-many-arguments method __aenter__ (line 57) | async def __aenter__(self) -> Self: method __aexit__ (line 63) | async def __aexit__(self, exc_type, exc_value, tb) -> None: method __call__ (line 66) | async def __call__( # pylint: disable=too-many-branches,too-many-stat... class OpenAIEndPoint (line 186) | class OpenAIEndPoint(APIEndPoint): method __init__ (line 189) | def __init__( # pylint: disable=too-many-arguments method __aenter__ (line 212) | async def __aenter__(self) -> Self: method __aexit__ (line 218) | async def __aexit__(self, exc_type, exc_value, tb) -> None: method __call__ (line 221) | async def __call__( # pylint: disable=too-many-branches,too-many-stat... class TensorRTLLMEndPoint (line 318) | class TensorRTLLMEndPoint(APIEndPoint): method __init__ (line 321) | def __init__( # pylint: disable=too-many-arguments method __aenter__ (line 333) | async def __aenter__(self) -> Self: method __aexit__ (line 339) | async def __aexit__(self, exc_type, exc_value, tb) -> None: method __call__ (line 342) | async def __call__( # pylint: disable=too-many-branches,too-many-loca... function create_api_endpoint (line 448) | def create_api_endpoint(args: argparse.Namespace) -> APIEndPoint: FILE: python/mlc_llm/bench/dataset.py class Dataset (line 22) | class Dataset: # pylint: disable=too-few-public-methods method generate_request_records (line 35) | def generate_request_records( class ShareGPTDataset (line 46) | class ShareGPTDataset(Dataset): # pylint: disable=too-few-public-methods method __init__ (line 52) | def __init__( method generate_request_records (line 109) | def generate_request_records( class LoogleDataset (line 170) | class LoogleDataset(Dataset): # pylint: disable=too-few-public-methods method __init__ (line 183) | def __init__(self, tokenizer: AutoTokenizer, testset_name: str) -> None: method generate_request_records (line 210) | def generate_request_records( # pylint: disable=too-many-locals class LLMPerfDataset (line 264) | class LLMPerfDataset(Dataset): # pylint: disable=too-few-public-methods method __init__ (line 267) | def __init__(self, dataset_path: str, num_requests: int, tokenizer: Au... method generate_request_records (line 285) | def generate_request_records( # pylint: disable=too-many-arguments,to... class JSONModeEvalDataset (line 345) | class JSONModeEvalDataset(Dataset): # pylint: disable=too-few-public-me... method __init__ (line 348) | def __init__(self, tokenizer: AutoTokenizer) -> None: method generate_request_records (line 365) | def generate_request_records( class ReActDataset (line 407) | class ReActDataset(Dataset): # pylint: disable=too-few-public-methods method __init__ (line 484) | def __init__( # pylint: disable=too-many-locals method generate_request_records (line 550) | def generate_request_records( class WildChatDataset (line 590) | class WildChatDataset(Dataset): # pylint: disable=too-few-public-methods method __init__ (line 595) | def __init__(self, tokenizer: AutoTokenizer, apply_chat_template: bool... method generate_request_records (line 650) | def generate_request_records( # pylint: disable=too-many-locals class AzureLLMInferenceDataset (line 711) | class AzureLLMInferenceDataset(Dataset): # pylint: disable=too-few-publ... method __init__ (line 718) | def __init__(self, dataset_path: str, tokenizer: AutoTokenizer) -> None: method generate_request_records (line 741) | def generate_request_records( # pylint: disable=too-many-locals function create_dataset (line 817) | def create_dataset( # pylint: disable=too-many-return-statements,too-ma... FILE: python/mlc_llm/bench/evaluation/gsm8k.py function extract_answer (line 21) | def extract_answer(text: str, regex: re.Pattern, select_index: int) -> str: function extract_ground_truth (line 34) | def extract_ground_truth(text: str) -> str: function strict_extract_answer (line 39) | def strict_extract_answer(text: str) -> str: function flexible_extract_answer (line 44) | def flexible_extract_answer(text: str) -> str: function create_few_shot_prompt (line 49) | def create_few_shot_prompt(n_shot: int, use_cot: bool, random_order=Fals... function create_prompt (line 157) | def create_prompt(question: str, n_shot: int, use_cot: bool, random_orde... function parse_args (line 167) | def parse_args(): function send_request (line 184) | async def send_request( function evaluate (line 209) | async def evaluate( # pylint: disable=too-many-arguments, too-many-locals FILE: python/mlc_llm/bench/evaluation/mmlu.py function parse_args (line 81) | def parse_args(): function send_request (line 97) | async def send_request( function evaluate (line 128) | async def evaluate( # pylint: disable=too-many-arguments, too-many-locals FILE: python/mlc_llm/bench/request_processor.py class RequestProcessor (line 30) | class RequestProcessor: # pylint: disable=too-few-public-methods method __call__ (line 36) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class LogMessage (line 40) | class LogMessage(RequestProcessor): # pylint: disable=too-few-public-me... method __init__ (line 43) | def __init__(self, message: str) -> None: method __call__ (line 46) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class SampleRequests (line 51) | class SampleRequests(RequestProcessor): # pylint: disable=too-few-publi... method __init__ (line 54) | def __init__(self, num_requests: int, take_first_x_requests: bool = Fa... method __call__ (line 60) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... method _sample_from_plain_request_records (line 71) | def _sample_from_plain_request_records( method _sample_from_grouped_request_records (line 93) | def _sample_from_grouped_request_records( class AttachModelName (line 124) | class AttachModelName(RequestProcessor): # pylint: disable=too-few-publ... method __init__ (line 127) | def __init__(self, model: str) -> None: method __call__ (line 130) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class AttachRequestRateTimestamp (line 136) | class AttachRequestRateTimestamp(RequestProcessor): # pylint: disable=t... method __init__ (line 139) | def __init__(self, request_rate: np.float32) -> None: method __call__ (line 142) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class AttachExecutionFeature (line 151) | class AttachExecutionFeature(RequestProcessor): # pylint: disable=too-f... method __init__ (line 154) | def __init__(self, exec_feature: Dict[str, Any]) -> None: method __call__ (line 157) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class AttachStreamFlag (line 164) | class AttachStreamFlag(RequestProcessor): # pylint: disable=too-few-pub... method __init__ (line 167) | def __init__(self, stream: Optional[bool]) -> None: method __call__ (line 170) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class AttachSamplingOptions (line 178) | class AttachSamplingOptions(RequestProcessor): # pylint: disable=too-fe... method __init__ (line 181) | def __init__(self, temperature: float, top_p: float, ignore_eos: bool)... method __call__ (line 186) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class ScaleTimestamp (line 198) | class ScaleTimestamp(RequestProcessor): # pylint: disable=too-few-publi... method __init__ (line 201) | def __init__(self, timestamp_scale: float): method __call__ (line 204) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class MetricAnalyzer (line 214) | class MetricAnalyzer(RequestProcessor): # pylint: disable=too-few-publi... method __init__ (line 217) | def __init__(self, tokenizer: AutoTokenizer) -> None: method __call__ (line 220) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class WarmupAndRun (line 255) | class WarmupAndRun(RequestProcessor): # pylint: disable=too-few-public-... method __init__ (line 258) | def __init__( # pylint: disable=too-many-arguments method generate_fake_warmup_requests (line 272) | def generate_fake_warmup_requests( # pylint: disable=missing-function... method __call__ (line 291) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... method _process_warmup_requests (line 324) | def _process_warmup_requests(self, warmup_requests: List[RequestRecord... class SequentialProcessor (line 341) | class SequentialProcessor(RequestProcessor): # pylint: disable=too-few-... method __init__ (line 346) | def __init__(self, *processors: RequestProcessor) -> None: method __call__ (line 349) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class Executor (line 355) | class Executor(RequestProcessor): # pylint: disable=too-few-public-methods method __init__ (line 358) | def __init__( method __call__ (line 368) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... class FixedConcurrentRequestExecutor (line 372) | class FixedConcurrentRequestExecutor(Executor): # pylint: disable=too-f... method __init__ (line 375) | def __init__( # pylint: disable=too-many-arguments method __call__ (line 391) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... method _process_task (line 422) | def _process_task( class FixTimestampExecutor (line 484) | class FixTimestampExecutor(Executor): # pylint: disable=too-few-public-... method __init__ (line 487) | def __init__( # pylint: disable=too-many-arguments method __call__ (line 503) | def __call__(self, request_records: List[RequestRecord]) -> List[Reque... method _process_task (line 540) | def _process_task( function create_pipelines (line 603) | def create_pipelines( # pylint: disable=too-many-branches FILE: python/mlc_llm/bench/request_record.py class ServerMetrics (line 14) | class ServerMetrics(BaseModel): class Metrics (line 27) | class Metrics(BaseModel): class RequestRecord (line 45) | class RequestRecord(BaseModel): class GroupedRequestRecord (line 57) | class GroupedRequestRecord(RequestRecord): function generate_metrics_summary (line 67) | def generate_metrics_summary( function _compute_metrics_statistics (line 116) | def _compute_metrics_statistics( function convert_reports_to_df (line 161) | def convert_reports_to_df(reports: List[Dict[str, Any]]) -> pd.DataFrame: function pretty_print_report (line 177) | def pretty_print_report(report: Dict[str, Any]) -> None: # pylint: disa... FILE: python/mlc_llm/cli/calibrate.py function main (line 10) | def main(argv): FILE: python/mlc_llm/cli/chat.py function main (line 8) | def main(argv): FILE: python/mlc_llm/cli/check_device.py function _check_device (line 10) | def _check_device(device: Device) -> bool: function main (line 17) | def main(): FILE: python/mlc_llm/cli/compile.py function main (line 27) | def main(argv): FILE: python/mlc_llm/cli/convert_weight.py function main (line 17) | def main(argv): FILE: python/mlc_llm/cli/delivery.py class OverrideConfigs (line 33) | class OverrideConfigs(BaseModel): class ModelDeliveryTask (line 46) | class ModelDeliveryTask(BaseModel): class ModelDeliveryList (line 71) | class ModelDeliveryList(BaseModel): method from_json (line 83) | def from_json(cls: Type[T], json_dict: Dict[str, Any]) -> T: method to_json (line 93) | def to_json(self) -> Dict[str, Any]: function _clone_repo (line 100) | def _clone_repo(model: Union[str, Path], hf_local_dir: Optional[str]) ->... function _run_quantization (line 120) | def _run_quantization( function _get_current_log (line 207) | def _get_current_log(log: str) -> ModelDeliveryList: function _generate_model_delivery_diff (line 219) | def _generate_model_delivery_diff( # pylint: disable=too-many-locals function _main (line 281) | def _main( # pylint: disable=too-many-locals, too-many-arguments function main (line 369) | def main(): FILE: python/mlc_llm/cli/gen_config.py function main (line 14) | def main(argv): FILE: python/mlc_llm/cli/lib_delivery.py class ModelInfo (line 23) | class ModelInfo: # pylint: disable=too-many-instance-attributes class DeferredScope (line 36) | class DeferredScope: method __init__ (line 39) | def __init__(self): method add (line 42) | def add(self, func: Callable[[], None]): method __enter__ (line 46) | def __enter__(self): method __exit__ (line 49) | def __exit__(self, exc_type, exc_value, traceback): method create_temp_dir (line 54) | def create_temp_dir(self) -> Path: function _run_compilation (line 61) | def _run_compilation(model_info: ModelInfo, repo_dir: Path) -> bool: function _main (line 122) | def _main( # pylint: disable=too-many-locals function main (line 175) | def main(): FILE: python/mlc_llm/cli/model_metadata.py function _extract_metadata (line 19) | def _extract_metadata(model_lib: Path) -> Dict[str, Any]: function _report_all (line 29) | def _report_all(metadata: Dict[str, Any]) -> None: function _read_dynamic_shape (line 46) | def _read_dynamic_shape(shape: List[Union[int, str]], config: Union[Dict... function _compute_memory_usage (line 74) | def _compute_memory_usage(metadata: Dict[str, Any], config: Union[Dict, ... function _report_memory_usage (line 91) | def _report_memory_usage(metadata: Dict[str, Any], config: Union[Dict, C... function main (line 145) | def main(): FILE: python/mlc_llm/cli/package.py function main (line 12) | def main(argv): FILE: python/mlc_llm/cli/router.py function main (line 8) | def main(argv): FILE: python/mlc_llm/cli/serve.py class EngineConfigOverride (line 15) | class EngineConfigOverride: # pylint: disable=too-many-instance-attributes method __repr__ (line 36) | def __repr__(self) -> str: method from_str (line 65) | def from_str(source: str) -> "EngineConfigOverride": function main (line 106) | def main(argv): FILE: python/mlc_llm/cli/worker.py function main (line 32) | def main(): FILE: python/mlc_llm/compiler_pass/attach_cuda_graph_alloc_init_func.py class AttachCUDAGraphAllocInitFunc (line 8) | class AttachCUDAGraphAllocInitFunc: # pylint: disable=too-few-public-me... method __init__ (line 11) | def __init__(self): method transform_module (line 14) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... FILE: python/mlc_llm/compiler_pass/attach_embedding_allocator.py class AttachAllocEmbeddingTensorFunc (line 10) | class AttachAllocEmbeddingTensorFunc: # pylint: disable=too-few-public-... method __init__ (line 13) | def __init__(self, metadata: Dict[str, Any]): method transform_module (line 16) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... FILE: python/mlc_llm/compiler_pass/attach_logit_processor.py class AttachLogitProcessFunc (line 14) | class AttachLogitProcessFunc: # pylint: disable=too-few-public-methods method __init__ (line 17) | def __init__(self, target: tvm.target.Target): method transform_module (line 27) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... function _get_apply_logit_bias_inplace_cpu (line 41) | def _get_apply_logit_bias_inplace_cpu(): function _get_apply_logit_bias_inplace (line 72) | def _get_apply_logit_bias_inplace(target: tvm.target.Target): function _get_apply_penalty_inplace_cpu (line 112) | def _get_apply_penalty_inplace_cpu(): function _get_apply_penalty_inplace (line 156) | def _get_apply_penalty_inplace(target: tvm.target.Target): function _get_apply_bitmask_inplace_cpu (line 210) | def _get_apply_bitmask_inplace_cpu(): function _get_apply_bitmask_inplace (line 246) | def _get_apply_bitmask_inplace(target: tvm.target.Target): FILE: python/mlc_llm/compiler_pass/attach_sampler.py class AttachGPUSamplingFunc (line 15) | class AttachGPUSamplingFunc: # pylint: disable=too-few-public-methods method __init__ (line 18) | def __init__(self, target: tvm.target.Target, variable_bounds: Dict[st... method transform_module (line 29) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... function _attach_multinomial_sampling_func (line 68) | def _attach_multinomial_sampling_func(bb: relax.BlockBuilder): function _attach_argsort_func (line 119) | def _attach_argsort_func(bb: relax.BlockBuilder): function full (line 142) | def full(var_result: T.handle, value: T.int32): function _attach_sample_with_top_p (line 152) | def _attach_sample_with_top_p(bb: relax.BlockBuilder): # pylint: disabl... function _attach_renormalize_by_top_p (line 236) | def _attach_renormalize_by_top_p(bb: relax.BlockBuilder, target: tvm.tar... function _attach_take_probs_func (line 267) | def _attach_take_probs_func(bb: relax.BlockBuilder): function _attach_batch_verifier (line 343) | def _attach_batch_verifier(bb: relax.BlockBuilder): FILE: python/mlc_llm/compiler_pass/attach_softmax_with_temperature.py class AttachSoftmaxWithTemperature (line 15) | class AttachSoftmaxWithTemperature: # pylint: disable=too-few-public-me... method __init__ (line 18) | def __init__( method transform_module (line 24) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class _Rewriter (line 30) | class _Rewriter(PyExprMutator): # pylint: disable=abstract-method method __init__ (line 31) | def __init__( method transform (line 44) | def transform(self) -> IRModule: function _get_lse_and_softmax_func (line 99) | def _get_lse_and_softmax_func( # pylint: disable=too-many-locals,too-ma... FILE: python/mlc_llm/compiler_pass/attach_spec_decode_aux_funcs.py class AttachSpecDecodeAuxFuncs (line 10) | class AttachSpecDecodeAuxFuncs: # pylint: disable=too-few-public-methods method __init__ (line 15) | def __init__(self, tensor_parallel_shards: int): method transform_module (line 18) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... function _get_scatter_2d_inplace (line 40) | def _get_scatter_2d_inplace(dtype: str, global_symbol: str): function _get_gather_2d_inplace (line 58) | def _get_gather_2d_inplace(dtype: str, global_symbol: str): function _add_scatter_hidden_states (line 76) | def _add_scatter_hidden_states(bb: BlockBuilder, tensor_parallel_shards:... function _add_gather_hidden_states (line 102) | def _add_gather_hidden_states(bb: BlockBuilder, tensor_parallel_shards: ... FILE: python/mlc_llm/compiler_pass/attach_support_info.py class AttachVariableBounds (line 13) | class AttachVariableBounds: # pylint: disable=too-few-public-methods method __init__ (line 16) | def __init__(self, variable_bounds: Dict[str, int]): method transform_module (line 21) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class AttachAdditionalPrimFuncs (line 32) | class AttachAdditionalPrimFuncs: # pylint: disable=too-few-public-methods method __init__ (line 35) | def __init__(self, functions: Dict[str, tir.PrimFunc]): method transform_module (line 38) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class AttachMemoryPlanAttr (line 46) | class AttachMemoryPlanAttr: # pylint: disable=too-few-public-methods method transform_module (line 49) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class AttachCUDAGraphSymbolicCaptureHints (line 58) | class AttachCUDAGraphSymbolicCaptureHints: # pylint: disable=too-few-pu... method __init__ (line 61) | def __init__(self, hints: Dict[str, List[str]]): method transform_module (line 64) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class AttachPipelineParallelStages (line 79) | class AttachPipelineParallelStages: # pylint: disable=too-few-public-me... method __init__ (line 82) | def __init__(self, pipeline_parallel_shards: int): method transform_module (line 85) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class AttachSequenceLengthPaddingFactor (line 108) | class AttachSequenceLengthPaddingFactor: # pylint: disable=too-few-publ... method __init__ (line 111) | def __init__(self, target: tvm.target.Target, metadata: Dict[str, Any]): method transform_module (line 115) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... FILE: python/mlc_llm/compiler_pass/blas_dispatch.py class BLASDispatch (line 17) | class BLASDispatch: # pylint: disable=too-few-public-methods,broad-exce... method __init__ (line 20) | def __init__(self, target: tvm.target.Target) -> None: method transform_module (line 34) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... FILE: python/mlc_llm/compiler_pass/clean_up_tir_attrs.py class CleanUpTIRAttrs (line 10) | class CleanUpTIRAttrs: # pylint: disable=too-few-public-methods method __init__ (line 13) | def __init__(self, attrs: List[str]): method transform_module (line 16) | def transform_module( FILE: python/mlc_llm/compiler_pass/dispatch_kv_cache_creation.py function extract_creation_args (line 16) | def extract_creation_args(func: relax.Function) -> Dict[str, Any]: class DispatchKVCacheCreation (line 79) | class DispatchKVCacheCreation: # pylint: disable=too-many-instance-attr... method __init__ (line 82) | def __init__( method transform_module (line 104) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... method attach_kv_cache_metadata (line 135) | def attach_kv_cache_metadata(self, kwargs: Dict[str, Any]): method create_tir_paged_kv_cache (line 144) | def create_tir_paged_kv_cache( method create_flashinfer_paged_kv_cache (line 182) | def create_flashinfer_paged_kv_cache( FILE: python/mlc_llm/compiler_pass/dispatch_triton_kernel.py class _Rewriter (line 21) | class _Rewriter(PyExprMutator): # pylint: disable=abstract-method method __init__ (line 22) | def __init__(self, mod: IRModule, target: tvm.target.Target) -> None: method transform (line 28) | def transform(self) -> tvm.IRModule: # pylint: disable=too-many-locals method visit_call_ (line 44) | def visit_call_(self, call: relax.Call) -> relax.Expr: # pylint: disa... method w8a8_block_fp8_matmul (line 62) | def w8a8_block_fp8_matmul( # pylint: disable=too-many-locals method w8a8_block_fp8_group_matmul (line 106) | def w8a8_block_fp8_group_matmul( # pylint: disable=too-many-locals class DispatchTritonKernel (line 158) | class DispatchTritonKernel: # pylint: disable=too-many-instance-attribu... method __init__ (line 161) | def __init__(self, target: tvm.target.Target) -> None: method transform_module (line 169) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... FILE: python/mlc_llm/compiler_pass/estimate_memory_usage.py class AttachMetadataWithMemoryUsage (line 17) | class AttachMetadataWithMemoryUsage: # pylint: disable=too-few-public-m... method __init__ (line 20) | def __init__(self, metadata: Dict[str, Any]): method transform_module (line 23) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class _MemoryEstimator (line 40) | class _MemoryEstimator(PyExprVisitor): method __init__ (line 43) | def __init__(self) -> None: method run (line 49) | def run(self, mod: IRModule) -> Dict[str, int]: method visit_call_ (line 65) | def visit_call_(self, call: relax.Call) -> None: # pylint: disable=ar... method _builtin_tensor_alloc (line 72) | def _builtin_tensor_alloc(self, shape: relax.Expr, dtype_str: str) -> ... method _storage_alloc (line 83) | def _storage_alloc(self, size: relax.Expr) -> None: FILE: python/mlc_llm/compiler_pass/fuse_add_norm.py function _get_add_rms_norm_decode (line 16) | def _get_add_rms_norm_decode(hidden_size: int, eps: float, TX: int, in_d... function _get_add_rms_norm_prefill (line 87) | def _get_add_rms_norm_prefill(hidden_size: int, eps: float, TX: int, in_... class FuseAddRMSNorm (line 156) | class FuseAddRMSNorm: # pylint: disable=too-few-public-methods method __init__ (line 159) | def __init__(self, target: tvm.target.Target) -> None: method transform_module (line 169) | def transform_module(self, mod: tvm.IRModule, _ctx: tvm.transform.Pass... class _FuseAddRMSNormRewriter (line 175) | class _FuseAddRMSNormRewriter(PyExprMutator): # pylint: disable=abstrac... method __init__ (line 176) | def __init__(self, mod: tvm.IRModule, target: tvm.target.Target): method transform (line 183) | def transform(self) -> tvm.IRModule: # pylint: disable=too-many-locals method visit_call_ (line 193) | def visit_call_(self, call: relax.Call) -> relax.Expr: # pylint: disa... FILE: python/mlc_llm/compiler_pass/fuse_dequantize_matmul_ewise.py class FuseDequantizeMatmulEwise (line 9) | class FuseDequantizeMatmulEwise: # pylint: disable=too-few-public-methods method transform_module (line 12) | def transform_module( function _pattern (line 37) | def _pattern(match_ewise: int, n_aux_tensor: int): FILE: python/mlc_llm/compiler_pass/fuse_dequantize_take.py class FuseDequantizeTake (line 15) | class FuseDequantizeTake: # pylint: disable=too-few-public-methods method transform_module (line 18) | def transform_module( # pylint: disable=too-many-locals function _pattern (line 52) | def _pattern(n_aux_tensor: int, match_tir_vars: bool): FILE: python/mlc_llm/compiler_pass/fuse_dequantize_transpose.py class FuseDequantizeTranspose (line 11) | class FuseDequantizeTranspose: # pylint: disable=too-few-public-methods method transform_module (line 14) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class _DequantizeTransposeFuser (line 20) | class _DequantizeTransposeFuser(PyExprMutator): # pylint: disable=abstr... method __init__ (line 21) | def __init__( method transform (line 28) | def transform(self) -> IRModule: method visit_call_ (line 37) | def visit_call_( # pylint: disable=arguments-renamed FILE: python/mlc_llm/compiler_pass/fuse_ft_dequantize_matmul_epilogue.py class FuseFTDequantizeEpilogue (line 13) | class FuseFTDequantizeEpilogue: # pylint: disable=too-few-public-methods method transform_module (line 16) | def transform_module( function fuse_bias (line 32) | def fuse_bias(func: relax.Function) -> relax.Function: function fuse_activation (line 98) | def fuse_activation(func: relax.Function) -> relax.Function: function fuse_residual_binary (line 188) | def fuse_residual_binary(func: relax.Function) -> relax.Function: function fuse_residual_unary (line 267) | def fuse_residual_unary(func: relax.Function) -> relax.Function: FILE: python/mlc_llm/compiler_pass/fuse_transpose_matmul.py class FuseTransposeMatmul (line 10) | class FuseTransposeMatmul: # pylint: disable=too-few-public-methods method transform_module (line 13) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... function _pattern (line 31) | def _pattern(): class _TransposeMatmulFuser (line 59) | class _TransposeMatmulFuser(PyExprMutator): # pylint: disable=abstract-... method __init__ (line 60) | def __init__(self, mod): method visit_call_ (line 63) | def visit_call_( # pylint: disable=arguments-renamed FILE: python/mlc_llm/compiler_pass/lift_global_buffer_alloc.py class LiftTIRGlobalBufferAlloc (line 13) | class LiftTIRGlobalBufferAlloc: # pylint: disable=too-few-public-methods method transform_module (line 16) | def transform_module( class _TIRGlobalAllocRewriter (line 26) | class _TIRGlobalAllocRewriter(PyExprMutator): # pylint: disable=abstrac... method __init__ (line 27) | def __init__(self, mod: IRModule): method transform (line 35) | def transform(self) -> IRModule: method visit_call_ (line 54) | def visit_call_(self, call: relax.Call): # pylint: disable=arguments-... function remove_global_buf_alloc (line 93) | def remove_global_buf_alloc( function _has_symbolic_var (line 148) | def _has_symbolic_var(tensor_sinfo: relax.TensorStructInfo) -> bool: function _resolve_tir_var_mapping (line 156) | def _resolve_tir_var_mapping( # pylint: disable=too-many-locals FILE: python/mlc_llm/compiler_pass/low_batch_specialization.py class LowBatchGemvSpecialize (line 12) | class LowBatchGemvSpecialize: # pylint: disable=too-few-public-methods method transform_module (line 15) | def transform_module( FILE: python/mlc_llm/compiler_pass/pipeline.py class _LogProgress (line 49) | class _LogProgress: # pylint: disable=too-few-public-methods method __init__ (line 52) | def __init__(self, *args): method transform_module (line 55) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class _DebugDump (line 62) | class _DebugDump: # pylint: disable=too-few-public-methods method __init__ (line 66) | def __init__(self, file_name: str, file_path: Optional[Path], show_met... method transform_module (line 71) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... function _mlc_llm_pipeline (line 82) | def _mlc_llm_pipeline( # pylint: disable=too-many-arguments FILE: python/mlc_llm/compiler_pass/pipeline_parallel_rewrite.py class PipelineParallelRewrite (line 12) | class PipelineParallelRewrite: # pylint: disable=too-few-public-methods method transform_module (line 15) | def transform_module( class _PipelineParallelRewriter (line 25) | class _PipelineParallelRewriter(PyExprMutator): # pylint: disable=abstr... method __init__ (line 26) | def __init__(self, mod: IRModule): method transform (line 35) | def transform(self) -> IRModule: # pylint: disable=too-many-locals method _create_stage_func (line 105) | def _create_stage_func( # pylint: disable=too-many-arguments,too-many... method visit_var_binding_ (line 202) | def visit_var_binding_(self, binding: relax.VarBinding) -> None: method visit_call_ (line 240) | def visit_call_(self, call: relax.Call) -> relax.Call: # pylint: disa... method _prepare_stage_func_params_and_args (line 249) | def _prepare_stage_func_params_and_args( method _update_struct_info (line 261) | def _update_struct_info( method _copy_undefined_var (line 291) | def _copy_undefined_var( method _update_shape (line 301) | def _update_shape( function _extract_pipeline_stages (line 311) | def _extract_pipeline_stages( function _analyze_required_func_params (line 363) | def _analyze_required_func_params( class _RequiredFuncParamAnalyzer (line 376) | class _RequiredFuncParamAnalyzer(PyExprVisitor): method __init__ (line 379) | def __init__(self, func_params: List[relax.Var]) -> None: method run (line 383) | def run(self, stage_bindings: List[relax.Binding]) -> List[relax.Var]: method visit_var_ (line 390) | def visit_var_(self, var: relax.Var) -> None: # pylint: disable=argum... FILE: python/mlc_llm/compiler_pass/scatter_tuple_get_item.py class ScatterTupleGetItem (line 14) | class ScatterTupleGetItem: # pylint: disable=too-few-public-methods method transform_module (line 17) | def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassCont... class _Scatter (line 23) | class _Scatter(PyExprMutator): # pylint: disable=abstract-method method __init__ (line 24) | def __init__(self, mod: IRModule) -> None: method transform (line 29) | def transform(self) -> IRModule: method visit_var_binding_ (line 38) | def visit_var_binding_(self, binding: relax.VarBinding): method visit_dataflow_var_ (line 43) | def visit_dataflow_var_( # pylint: disable=arguments-renamed FILE: python/mlc_llm/contrib/embeddings/embeddings.py function _extract_metadata (line 20) | def _extract_metadata(mod: Module): function _load_params (line 24) | def _load_params( function _get_tvm_module (line 37) | def _get_tvm_module( class DefaultDebugInstrument (line 52) | class DefaultDebugInstrument: method __init__ (line 61) | def __init__(self, debug_out: Path): method reset (line 75) | def reset(self, debug_out: Path): method __call__ (line 89) | def __call__(self, func, name, before_run, ret_val, *args): class MLCEmbeddings (line 111) | class MLCEmbeddings: # pylint: disable=too-few-public-methods method __init__ (line 137) | def __init__( # pylint: disable=too-many-arguments method embed (line 153) | def embed(self, queries: List[str]) -> tvm.runtime.Tensor: method _tokenize_queries (line 173) | def _tokenize_queries(self, queries: List[str]) -> Tuple[np.ndarray, n... FILE: python/mlc_llm/contrib/embeddings/openai.py class MLCEmbeddings (line 18) | class MLCEmbeddings(OpenAIEmbeddings): method _chunk_tokens (line 19) | def _chunk_tokens(self, texts: Sequence[str]) -> Tuple[List[List], Lis... method _batch_embed (line 59) | def _batch_embed( method _abatch_embed (line 82) | async def _abatch_embed( method _get_len_safe_embeddings (line 107) | def _get_len_safe_embeddings( # pylint: disable=too-many-locals,unuse... method _aget_len_safe_embeddings (line 142) | async def _aget_len_safe_embeddings( # pylint: disable=too-many-local... method embed_documents (line 178) | def embed_documents( method aembed_documents (line 202) | async def aembed_documents( method embed_query (line 224) | def embed_query(self, text: str) -> List[float]: method aembed_query (line 235) | async def aembed_query(self, text: str) -> List[float]: FILE: python/mlc_llm/conversation_template/registry.py class ConvTemplateRegistry (line 8) | class ConvTemplateRegistry: method register_conv_template (line 14) | def register_conv_template(conv_template: Conversation, override: bool... method get_conv_template (line 30) | def get_conv_template(name: str) -> Optional[Conversation]: FILE: python/mlc_llm/interface/calibrate.py class CalibrationObserver (line 17) | class CalibrationObserver: method get (line 25) | def get(): method callback (line 33) | def callback( method save_params (line 51) | def save_params(self, output: str): function sample_requests (line 63) | def sample_requests( function send_calibration_requests (line 106) | async def send_calibration_requests( function calibrate (line 131) | def calibrate( FILE: python/mlc_llm/interface/chat.py function _print_help_str (line 18) | def _print_help_str(): function _set_up_key_bindings (line 33) | def _set_up_key_bindings(): class ChatCompletionOverride (line 48) | class ChatCompletionOverride(ConfigOverrideBase): # pylint: disable=too... method from_str (line 60) | def from_str(source: str) -> "ChatCompletionOverride": class ModelConfigOverride (line 83) | class ModelConfigOverride(ConfigOverrideBase): # pylint: disable=too-ma... method from_str (line 95) | def from_str(source: str) -> "ModelConfigOverride": class ChatState (line 118) | class ChatState: method __init__ (line 156) | def __init__(self, engine: Union[JSONFFIEngine, MLCEngine]): method slide_history (line 165) | def slide_history(self): method process_system_prompts (line 171) | def process_system_prompts(self): method generate (line 183) | def generate(self, prompt: str): method stats (line 222) | def stats(self): method metrics (line 240) | def metrics(self): method reset (line 244) | def reset(self): method chat (line 249) | def chat(self): function chat (line 285) | def chat( FILE: python/mlc_llm/interface/compile.py class CompileArgs (line 28) | class CompileArgs: # pylint: disable=too-many-instance-attributes method __post_init__ (line 42) | def __post_init__(self) -> None: method display (line 45) | def display(self) -> None: function _apply_preproc_to_params_and_check_pipeline (line 62) | def _apply_preproc_to_params_and_check_pipeline( function _infer_kv_state_kind (line 98) | def _infer_kv_state_kind(model_type) -> str: function _compile (line 106) | def _compile(args: CompileArgs, model_config: ConfigBase): function compile (line 226) | def compile( # pylint: disable=too-many-arguments,redefined-builtin FILE: python/mlc_llm/interface/compiler_flags.py class IPCAllReduceStrategyType (line 14) | class IPCAllReduceStrategyType(enum.IntEnum): class OptimizationFlags (line 24) | class OptimizationFlags: method __repr__ (line 34) | def __repr__(self) -> str: method from_str (line 49) | def from_str(source: str) -> "OptimizationFlags": method update (line 84) | def update(self, target, quantization) -> None: class ModelConfigOverride (line 141) | class ModelConfigOverride(ConfigOverrideBase): # pylint: disable=too-ma... method __repr__ (line 153) | def __repr__(self) -> str: method from_str (line 170) | def from_str(source: str) -> "ModelConfigOverride": FILE: python/mlc_llm/interface/convert_weight.py class ConversionArgs (line 30) | class ConversionArgs: # pylint: disable=too-many-instance-attributes method display (line 42) | def display(self) -> None: function _resolve_base_model_dir (line 62) | def _resolve_base_model_dir(source: Path) -> Path: function _merge_lora_adapter_with_base_model (line 67) | def _merge_lora_adapter_with_base_model(base_source: Path, lora_adapter:... function _convert_args (line 102) | def _convert_args(args: ConversionArgs) -> None: # pylint: disable=too-... function convert_weight (line 215) | def convert_weight( # pylint: disable=too-many-arguments FILE: python/mlc_llm/interface/gen_config.py function apply_system_defaults_for_missing_fields (line 29) | def apply_system_defaults_for_missing_fields(mlc_chat_config: MLCChatCon... function check_string (line 36) | def check_string(s: str) -> bool: function txt2rwkv_tokenizer (line 48) | def txt2rwkv_tokenizer(vocab: Path, out: Path) -> None: function json2rwkv_tokenizer (line 73) | def json2rwkv_tokenizer(vocab: Path, out: Path) -> None: function gen_config (line 90) | def gen_config( # pylint: disable=too-many-locals,too-many-arguments,to... FILE: python/mlc_llm/interface/jit.py class JITResult (line 34) | class JITResult: function log_jit_policy (line 41) | def log_jit_policy(): function jit (line 50) | def jit( # pylint: disable=too-many-locals,too-many-statements FILE: python/mlc_llm/interface/package.py function build_model_library (line 21) | def build_model_library( # pylint: disable=too-many-branches,too-many-l... function validate_model_lib (line 162) | def validate_model_lib( # pylint: disable=too-many-locals,too-many-stat... function build_android_binding (line 264) | def build_android_binding(mlc_llm_source_dir: Path, output: Path) -> None: function build_iphone_binding (line 308) | def build_iphone_binding(mlc_llm_source_dir: Path, output: Path) -> None: function build_macabi_binding (line 325) | def build_macabi_binding(mlc_llm_source_dir: Path, output: Path) -> None: function package (line 349) | def package( FILE: python/mlc_llm/interface/router.py function serve (line 17) | def serve( FILE: python/mlc_llm/interface/serve.py function serve (line 24) | def serve( FILE: python/mlc_llm/json_ffi/engine.py class EngineState (line 24) | class EngineState: method get_request_stream_callback (line 27) | def get_request_stream_callback(self) -> Callable[[str], None]: method _sync_request_stream_callback (line 35) | def _sync_request_stream_callback(self, chat_completion_stream_respons... method handle_chat_completion (line 39) | def handle_chat_completion( class BackgroundLoops (line 76) | class BackgroundLoops: method __init__ (line 79) | def __init__(self, ffi: dict): method __del__ (line 94) | def __del__(self): method terminate (line 97) | def terminate(self): class Completions (line 106) | class Completions: method __init__ (line 113) | def __init__(self, ffi: dict, state: EngineState, background_loops: Ba... method create (line 118) | def create( # pylint: disable=too-many-arguments,too-many-locals class Chat (line 201) | class Chat: method __init__ (line 206) | def __init__(self, ffi: dict, state: EngineState, background_loops: Ba... class JSONFFIEngine (line 210) | class JSONFFIEngine: method __init__ (line 213) | def __init__( # pylint: disable=too-many-arguments,too-many-locals method metrics (line 273) | def metrics(self) -> EngineMetrics: method _raw_chat_completion (line 277) | def _raw_chat_completion( method terminate (line 285) | def terminate(self): method _test_reload (line 289) | def _test_reload(self): method _test_reset (line 292) | def _test_reset(self): method _test_unload (line 295) | def _test_unload(self): FILE: python/mlc_llm/libinfo.py function get_env_paths (line 11) | def get_env_paths(env_var, splitter): function get_dll_directories (line 18) | def get_dll_directories(): function find_lib_path (line 40) | def find_lib_path(name, optional=False): FILE: python/mlc_llm/loader/huggingface_loader.py class HuggingFaceLoader (line 25) | class HuggingFaceLoader: # pylint: disable=too-few-public-methods method __init__ (line 55) | def __init__( method load (line 101) | def load( method _load_mlc_param (line 135) | def _load_mlc_param(self, mlc_name: str, device: Optional[Device]) -> ... method _load_or_quantize (line 160) | def _load_or_quantize(self, mlc_name, param, device: Device): method _load_file (line 184) | def _load_file(self, path: Path) -> None: method _unload_file (line 196) | def _unload_file(self, path: Path) -> None: function _loading_order (line 205) | def _loading_order(param_map: ExternMapping, torch_to_path: Dict[str, Pa... FILE: python/mlc_llm/loader/mapping.py class ExternMapping (line 19) | class ExternMapping: method add_mapping (line 48) | def add_mapping( method add_unused (line 58) | def add_unused(self, name: str): class QuantizeMapping (line 64) | class QuantizeMapping: FILE: python/mlc_llm/loader/standard_loader.py function _default_export_spec (line 18) | def _default_export_spec(model: nn.Module) -> object: function make_standard_hf_loader (line 22) | def make_standard_hf_loader( # pylint: disable=too-many-arguments,too-m... FILE: python/mlc_llm/loader/stats.py class Stats (line 14) | class Stats: method timer (line 51) | def timer(self, attr): method mem_add (line 63) | def mem_add(self, nbytes: int): method mem_rm (line 70) | def mem_rm(self, nbytes: int): method log_time_info (line 75) | def log_time_info(self, weight_format: str): method log_mem_usage (line 89) | def log_mem_usage(self): FILE: python/mlc_llm/loader/utils.py function check_parameter_usage (line 20) | def check_parameter_usage(param_map: "ExternMapping", extern_weights: Se... function load_torch_shard (line 39) | def load_torch_shard(path: Path) -> Iterator[Tuple[str, np.ndarray]]: function load_safetensor_shard (line 55) | def load_safetensor_shard(path: Path) -> Iterator[Tuple[str, np.ndarray]]: FILE: python/mlc_llm/model/baichuan/baichuan_model.py class BaichuanConfig (line 23) | class BaichuanConfig(ConfigBase): # pylint: disable=too-many-instance-a... method __post_init__ (line 45) | def __post_init__(self): class BaichuanAttention (line 86) | class BaichuanAttention(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 87) | def __init__(self, config: BaichuanConfig): method forward (line 99) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class BaichuanMLP (line 114) | class BaichuanMLP(nn.Module): method __init__ (line 115) | def __init__(self, config: BaichuanConfig): method forward (line 129) | def forward(self, x): class BaichuanDecoderLayer (line 135) | class BaichuanDecoderLayer(nn.Module): method __init__ (line 136) | def __init__(self, config: BaichuanConfig): method forward (line 169) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 176) | def _apply_residual(self, out, residual): class BaichuanModel (line 182) | class BaichuanModel(nn.Module): method __init__ (line 183) | def __init__(self, config: BaichuanConfig): method forward (line 191) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class BaichuanForCausalLM (line 199) | class BaichuanForCausalLM(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 200) | def __init__(self, config: BaichuanConfig): method to (line 213) | def to(self, dtype: Optional[str] = None): method batch_forward (line 218) | def batch_forward( method embed (line 234) | def embed(self, input_ids: Tensor): method prefill (line 239) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 253) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 262) | def batch_prefill( method batch_decode (line 273) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 277) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 281) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 307) | def get_default_spec(self): FILE: python/mlc_llm/model/bert/bert_loader.py function huggingface (line 17) | def huggingface( function huggingface_bge (line 107) | def huggingface_bge(model_config: BertConfig, quantization: Quantization... FILE: python/mlc_llm/model/bert/bert_model.py class BertConfig (line 22) | class BertConfig(ConfigBase): # pylint: disable=too-many-instance-attri... method __post_init__ (line 42) | def __post_init__(self): class BertSelfAttention (line 87) | class BertSelfAttention(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 88) | def __init__(self, config: BertConfig): method forward (line 103) | def forward(self, hidden_states: Tensor, attention_mask: Tensor): class BertSelfOutput (line 116) | class BertSelfOutput(nn.Module): method __init__ (line 117) | def __init__(self, config: BertConfig): method forward (line 121) | def forward(self, hidden_states: Tensor, input_tensor: Tensor): class BertAttention (line 127) | class BertAttention(nn.Module): method __init__ (line 128) | def __init__(self, config: BertConfig): method forward (line 132) | def forward(self, hidden_states: Tensor, attention_mask: Tensor): class BertIntermediate (line 147) | class BertIntermediate(nn.Module): method __init__ (line 148) | def __init__(self, config: BertConfig): method forward (line 152) | def forward(self, hidden_states: Tensor): class BertOutput (line 158) | class BertOutput(nn.Module): method __init__ (line 159) | def __init__(self, config: BertConfig): method forward (line 163) | def forward(self, hidden_states: Tensor, input_tensor: Tensor): class BertLayer (line 169) | class BertLayer(nn.Module): method __init__ (line 170) | def __init__(self, config: BertConfig): method forward (line 175) | def forward(self, hidden_states: Tensor, attention_mask: Tensor): class BertEncoder (line 182) | class BertEncoder(nn.Module): method __init__ (line 183) | def __init__(self, config: BertConfig): method forward (line 186) | def forward(self, hidden_states: Tensor, attention_mask: Tensor): class BertEmbeddings (line 192) | class BertEmbeddings(nn.Module): method __init__ (line 193) | def __init__(self, config: BertConfig): method forward (line 203) | def forward(self, input_ids: Tensor, token_type_ids: Tensor, position_... class BertModel (line 213) | class BertModel(nn.Module): method __init__ (line 214) | def __init__(self, config: BertConfig): method to (line 219) | def to(self, dtype: Optional[str] = None): method forward (line 224) | def forward(self, inputs: Tensor, attention_mask: Tensor): method prefill (line 245) | def prefill(self, inputs: Tensor, attention_mask: Tensor): method get_default_spec (line 265) | def get_default_spec(self): FILE: python/mlc_llm/model/chatglm3/chatglm3_loader.py function huggingface (line 14) | def huggingface(model_config: GLMConfig, quantization: Quantization) -> ... FILE: python/mlc_llm/model/chatglm3/chatglm3_model.py class GLMConfig (line 23) | class GLMConfig(ConfigBase): # pylint: disable=too-many-instance-attrib... method __post_init__ (line 47) | def __post_init__(self): class GLMAttention (line 92) | class GLMAttention(nn.Module): # pylint: disable=too-many-instance-attr... method __init__ (line 93) | def __init__(self, config: GLMConfig): method forward (line 119) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class GLMMLP (line 134) | class GLMMLP(nn.Module): method __init__ (line 135) | def __init__(self, config: GLMConfig): method forward (line 160) | def forward(self, x): class GLMBlock (line 167) | class GLMBlock(nn.Module): method __init__ (line 168) | def __init__(self, config: GLMConfig): method forward (line 226) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 233) | def _apply_residual(self, out, residual): class GLMTransformer (line 239) | class GLMTransformer(nn.Module): method __init__ (line 242) | def __init__(self, config: GLMConfig): method forward (line 259) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class ChatGLMModel (line 267) | class ChatGLMModel(nn.Module): method __init__ (line 268) | def __init__(self, config: GLMConfig): method forward (line 273) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class ChatGLMForCausalLM (line 279) | class ChatGLMForCausalLM(nn.Module): # pylint: disable=too-many-instanc... method __init__ (line 280) | def __init__(self, config: GLMConfig): method to (line 296) | def to(self, dtype: Optional[str] = None): method batch_forward (line 301) | def batch_forward( method embed (line 317) | def embed(self, input_ids: Tensor): method prefill (line 322) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 336) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 345) | def batch_prefill( method batch_decode (line 356) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 360) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 364) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 390) | def get_default_spec(self): FILE: python/mlc_llm/model/cohere/cohere_loader.py function _cohere_name_transform (line 19) | def _cohere_name_transform(name: str) -> str: function awq (line 33) | def awq(model_config: CohereConfig, quantization: Quantization) -> Exter... FILE: python/mlc_llm/model/cohere/cohere_model.py class CohereConfig (line 23) | class CohereConfig(ConfigBase): # pylint: disable=too-many-instance-att... method __post_init__ (line 42) | def __post_init__(self): class CohereMLP (line 92) | class CohereMLP(nn.Module): method __init__ (line 93) | def __init__(self, config: CohereConfig): method forward (line 106) | def forward(self, x): class CohereAttention (line 114) | class CohereAttention(nn.Module): method __init__ (line 115) | def __init__(self, config: CohereConfig): method forward (line 135) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class CohereDecoderLayer (line 151) | class CohereDecoderLayer(nn.Module): method __init__ (line 152) | def __init__(self, config: CohereConfig): method forward (line 182) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_parallel_residual (line 190) | def _apply_parallel_residual(self, mlp_out, residual): class CohereNorm (line 196) | class CohereNorm(nn.Module): method __init__ (line 197) | def __init__( method forward (line 205) | def forward(self, x: Tensor) -> Tensor: class CohereEmbedding (line 215) | class CohereEmbedding(nn.Embedding): method lm_head_forward (line 216) | def lm_head_forward(self, x: nn.Tensor): class CohereModel (line 224) | class CohereModel(nn.Module): method __init__ (line 225) | def __init__(self, config: CohereConfig): method forward (line 233) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class CohereForCausalLM (line 241) | class CohereForCausalLM(nn.Module): method __init__ (line 243) | def __init__(self, config: CohereConfig) -> None: method to (line 256) | def to(self, dtype: Optional[str] = None): method batch_forward (line 261) | def batch_forward( method prefill (line 277) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 294) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 303) | def batch_prefill( method batch_decode (line 314) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 318) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method embed (line 322) | def embed(self, input_ids: Tensor): method create_paged_kv_cache (line 328) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 354) | def get_default_spec(self): FILE: python/mlc_llm/model/deepseek/deepseek_loader.py function huggingface (line 16) | def huggingface(model_config: DeepseekConfig, quantization: Quantization... FILE: python/mlc_llm/model/deepseek/deepseek_model.py class DeepseekConfig (line 25) | class DeepseekConfig(ConfigBase): # pylint: disable=too-many-instance-a... method __post_init__ (line 56) | def __post_init__(self): class DeepseekAttention (line 97) | class DeepseekAttention(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 98) | def __init__(self, config: DeepseekConfig): method forward (line 125) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class DeepseekMLP (line 149) | class DeepseekMLP(nn.Module): method __init__ (line 150) | def __init__(self, config: DeepseekConfig, intermediate_size=None): method forward (line 165) | def forward(self, x: Tensor): class DeepseekMoE (line 171) | class DeepseekMoE(nn.Module): # pylint: disable=too-many-instance-attri... method __init__ (line 172) | def __init__(self, config: DeepseekConfig): method forward (line 196) | def forward(self, x: Tensor): # pylint: disable=too-many-locals class DeepseekDecoderLayer (line 245) | class DeepseekDecoderLayer(nn.Module): # pylint: disable=too-many-insta... method __init__ (line 246) | def __init__(self, config: DeepseekConfig, layer_idx: int): method forward (line 315) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 324) | def _apply_residual(self, out, residual): class DeepseekModel (line 330) | class DeepseekModel(nn.Module): method __init__ (line 331) | def __init__(self, config: DeepseekConfig): method forward (line 342) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class DeepseekForCausalLM (line 350) | class DeepseekForCausalLM(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 351) | def __init__(self, config: DeepseekConfig): method to (line 365) | def to(self, dtype: Optional[str] = None): method batch_forward (line 370) | def batch_forward( method embed (line 386) | def embed(self, input_ids: Tensor): method prefill (line 391) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 406) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 415) | def batch_prefill( method batch_decode (line 426) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 430) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 434) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 460) | def get_default_spec(self): FILE: python/mlc_llm/model/deepseek_v2/deepseek_v2_loader.py function huggingface (line 17) | def huggingface( # pylint: disable=too-many-locals,too-many-statements FILE: python/mlc_llm/model/deepseek_v2/deepseek_v2_model.py class DeepseekV2Config (line 27) | class DeepseekV2Config(ConfigBase): # pylint: disable=too-many-instance... method __post_init__ (line 65) | def __post_init__(self): class DeepseekV2MLP (line 128) | class DeepseekV2MLP(nn.Module): method __init__ (line 129) | def __init__(self, config: DeepseekV2Config, hidden_size=None, interme... method forward (line 145) | def forward(self, x: Tensor) -> Tensor: function yarn_get_mscale (line 151) | def yarn_get_mscale(scale=1, mscale=1): class DeepseekV2YarnRotaryEmbedding (line 157) | class DeepseekV2YarnRotaryEmbedding(nn.Module): method __init__ (line 158) | def __init__(self, config: DeepseekV2Config): method forward (line 163) | def forward( class DeepseekV2Attention (line 212) | class DeepseekV2Attention(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 213) | def __init__(self, config: DeepseekV2Config): method forward (line 272) | def forward( # pylint: disable=too-many-arguments method self_attn (line 318) | def self_attn( # pylint: disable=too-many-arguments method cross_attn (line 341) | def cross_attn( class DeepseekV2MoE (line 390) | class DeepseekV2MoE(nn.Module): # pylint: disable=too-many-instance-att... method __init__ (line 391) | def __init__(self, config: DeepseekV2Config): method forward (line 434) | def forward(self, x: Tensor): method to (line 519) | def to(self, dtype: Optional[str] = None): class DeepseekV2DecoderLayer (line 526) | class DeepseekV2DecoderLayer(nn.Module): method __init__ (line 527) | def __init__(self, config: DeepseekV2Config, layer_idx: int): method forward (line 607) | def forward( # pylint: disable=too-many-arguments method _apply_residual (line 625) | def _apply_residual(self, out, residual): class DeepseekV2Model (line 631) | class DeepseekV2Model(nn.Module): method __init__ (line 632) | def __init__(self, config: DeepseekV2Config): method forward (line 642) | def forward( class DeepseekV2ForCausalLM (line 658) | class DeepseekV2ForCausalLM(nn.Module): # pylint: disable=too-many-inst... method __init__ (line 659) | def __init__(self, config: DeepseekV2Config): method to (line 678) | def to(self, dtype: Optional[str] = None): method batch_forward (line 683) | def batch_forward( method embed (line 700) | def embed(self, input_ids: Tensor): method prefill (line 705) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method extend (line 719) | def extend(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 733) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 742) | def batch_prefill( method batch_extend (line 755) | def batch_extend( method batch_decode (line 768) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 772) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 776) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 804) | def get_default_spec(self): FILE: python/mlc_llm/model/eagle/eagle_loader.py function awq (line 26) | def awq(model_config: EagleConfig, quantization: Quantization) -> Extern... FILE: python/mlc_llm/model/eagle/eagle_model.py class EagleConfig (line 22) | class EagleConfig(LlamaConfig): class EagleDecoderLayer (line 31) | class EagleDecoderLayer(nn.Module): method __init__ (line 32) | def __init__(self, config: EagleConfig, index: int): method forward (line 64) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 73) | def _apply_residual(self, out, residual): class EagleForCausalLM (line 79) | class EagleForCausalLM(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 80) | def __init__(self, config: EagleConfig): method fuse_embed_hidden_states (line 103) | def fuse_embed_hidden_states(self, input_embed: Tensor, hidden_states:... method forward_to_last_hidden_states (line 108) | def forward_to_last_hidden_states(self, hidden_states: Tensor, paged_k... method forward (line 113) | def forward(self, input_embed: Tensor, hidden_states: Tensor, paged_kv... method to (line 118) | def to(self, dtype: Optional[str] = None): method batch_forward (line 123) | def batch_forward( method embed (line 136) | def embed(self, input_ids: Tensor): method prefill_to_last_hidden_states (line 141) | def prefill_to_last_hidden_states(self, hidden_states: Tensor, paged_k... method decode_to_last_hidden_states (line 147) | def decode_to_last_hidden_states(self, hidden_states: Tensor, paged_kv... method batch_prefill_to_last_hidden_states (line 153) | def batch_prefill_to_last_hidden_states( method batch_decode_to_last_hidden_states (line 161) | def batch_decode_to_last_hidden_states( method create_paged_kv_cache (line 167) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 193) | def get_default_spec(self): FILE: python/mlc_llm/model/gemma/gemma_loader.py function huggingface (line 15) | def huggingface(model_config: GemmaConfig, quantization: Quantization) -... FILE: python/mlc_llm/model/gemma/gemma_model.py class GemmaConfig (line 21) | class GemmaConfig(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 41) | def __post_init__(self): class GemmaEmbedding (line 91) | class GemmaEmbedding(nn.Embedding): method lm_head_forward (line 96) | def lm_head_forward(self, x: nn.Tensor): class GemmaMLP (line 104) | class GemmaMLP(nn.Module): method __init__ (line 105) | def __init__(self, config: GemmaConfig): method forward (line 120) | def forward(self, x: Tensor): class GemmaAttention (line 126) | class GemmaAttention(nn.Module): # pylint: disable=too-many-instance-at... method __init__ (line 127) | def __init__(self, config: GemmaConfig): method forward (line 148) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class GemmaDecoderLayer (line 164) | class GemmaDecoderLayer(nn.Module): method __init__ (line 165) | def __init__(self, config: GemmaConfig): method forward (line 196) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 203) | def _apply_residual(self, out, residual): class GemmaModel (line 209) | class GemmaModel(nn.Module): method __init__ (line 210) | def __init__(self, config: GemmaConfig): method forward (line 219) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class GemmaForCausalLM (line 228) | class GemmaForCausalLM(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 229) | def __init__(self, config: GemmaConfig): method to (line 241) | def to(self, dtype: Optional[str] = None): method get_logits (line 246) | def get_logits(self, hidden_states: Tensor): method batch_forward (line 252) | def batch_forward( method embed (line 266) | def embed(self, input_ids: Tensor): method prefill (line 271) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 283) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 290) | def batch_prefill( method batch_decode (line 301) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 305) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 309) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 335) | def get_default_spec(self): FILE: python/mlc_llm/model/gemma2/gemma2_loader.py function huggingface (line 15) | def huggingface(model_config: Gemma2Config, quantization: Quantization) ... FILE: python/mlc_llm/model/gemma2/gemma2_model.py class Gemma2Config (line 23) | class Gemma2Config(GemmaConfig): method __post_init__ (line 35) | def __post_init__(self): class Gemma2Attention (line 45) | class Gemma2Attention(GemmaAttention): method __init__ (line 46) | def __init__(self, config: Gemma2Config): class Gemma2DecoderLayer (line 51) | class Gemma2DecoderLayer(nn.Module): method __init__ (line 52) | def __init__(self, config: Gemma2Config): method forward (line 89) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_post_matmul_norm (line 101) | def _apply_post_matmul_norm(self, out: Tensor, norm: nn.Tensor): class Gemma2Model (line 107) | class Gemma2Model(GemmaModel): method __init__ (line 108) | def __init__(self, config: Gemma2Config): class Gemma2ForCausalLM (line 115) | class Gemma2ForCausalLM(GemmaForCausalLM): # pylint: disable=too-many-i... method __init__ (line 116) | def __init__(self, config: Gemma2Config): method get_logits (line 121) | def get_logits(self, hidden_states: Tensor): FILE: python/mlc_llm/model/gemma3/gemma3_loader.py function huggingface (line 15) | def huggingface(model_config: Gemma3Config, quantization: Quantization) ... FILE: python/mlc_llm/model/gemma3/gemma3_model.py class Gemma3TextConfig (line 22) | class Gemma3TextConfig(ConfigBase): # pylint: disable=too-many-instance... method __post_init__ (line 46) | def __post_init__(self): class Gemma3Config (line 96) | class Gemma3Config(ConfigBase): # pylint: disable=too-many-instance-att... method __post_init__ (line 109) | def __post_init__(self): class Gemma3MLP (line 134) | class Gemma3MLP(nn.Module): method __init__ (line 135) | def __init__(self, config: Gemma3Config): method forward (line 154) | def forward(self, x: Tensor): class Gemma3Attention (line 160) | class Gemma3Attention(nn.Module): # pylint: disable=too-many-instance-a... method __init__ (line 161) | def __init__(self, config: Gemma3Config): method forward (line 201) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class Gemma3DecoderLayer (line 224) | class Gemma3DecoderLayer(nn.Module): method __init__ (line 225) | def __init__(self, config: Gemma3Config): method forward (line 263) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_post_matmul_norm (line 275) | def _apply_post_matmul_norm(self, out: Tensor, norm: nn.Tensor): class Gemma3TextModel (line 281) | class Gemma3TextModel(nn.Module): method __init__ (line 282) | def __init__(self, config: Gemma3Config): method forward (line 296) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class Gemma3LanguageModel (line 305) | class Gemma3LanguageModel(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 306) | def __init__(self, config: Gemma3Config): method to (line 320) | def to(self, dtype: Optional[str] = None): method get_logits (line 325) | def get_logits(self, hidden_states: Tensor): method batch_forward (line 331) | def batch_forward( method embed (line 345) | def embed(self, input_ids: Tensor): method prefill (line 350) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 362) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 369) | def batch_prefill( method batch_decode (line 380) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 384) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 388) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 425) | def get_default_spec(self): class Gemma3ForCausalLM (line 490) | class Gemma3ForCausalLM(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 491) | def __init__(self, config: Gemma3Config): method to (line 499) | def to(self, dtype: Optional[str] = None): method get_logits (line 505) | def get_logits(self, hidden_states: Tensor): method batch_forward (line 511) | def batch_forward( method embed (line 525) | def embed(self, input_ids: Tensor): method prefill (line 530) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 542) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 549) | def batch_prefill( method batch_decode (line 560) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 564) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 568) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 607) | def get_default_spec(self): FILE: python/mlc_llm/model/gpt2/gpt2_loader.py function huggingface (line 14) | def huggingface(model_config: GPT2Config, quantization: Quantization) ->... FILE: python/mlc_llm/model/gpt2/gpt2_model.py class GPT2Config (line 23) | class GPT2Config(ConfigBase): # pylint: disable=too-many-instance-attri... method __post_init__ (line 40) | def __post_init__(self): class GPT2Attention (line 83) | class GPT2Attention(nn.Module): # pylint: disable=too-many-instance-att... method __init__ (line 84) | def __init__(self, config: GPT2Config): method forward (line 102) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class GPT2MLP (line 127) | class GPT2MLP(nn.Module): method __init__ (line 128) | def __init__(self, config: GPT2Config): method forward (line 139) | def forward(self, hidden_states: Tensor): class GPT2Block (line 146) | class GPT2Block(nn.Module): method __init__ (line 147) | def __init__(self, config: GPT2Config): method forward (line 179) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 192) | def _apply_residual(self, out, residual): class GPT2Model (line 198) | class GPT2Model(nn.Module): method __init__ (line 199) | def __init__(self, config: GPT2Config): method forward (line 206) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class GPT2LMHeadModel (line 221) | class GPT2LMHeadModel(nn.Module): # pylint: disable=too-many-instance-a... method __init__ (line 222) | def __init__(self, config: GPT2Config): method to (line 232) | def to(self, dtype: Optional[str] = None): method batch_forward (line 237) | def batch_forward( method embed (line 253) | def embed(self, input_ids: Tensor): method prefill (line 258) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 272) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 281) | def batch_prefill( method batch_decode (line 292) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 296) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 300) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 326) | def get_default_spec(self): FILE: python/mlc_llm/model/gpt_bigcode/gpt_bigcode_model.py class GPTBigCodeConfig (line 23) | class GPTBigCodeConfig(ConfigBase): # pylint: disable=too-many-instance... method __post_init__ (line 39) | def __post_init__(self): class GPTBigCodeMLP (line 75) | class GPTBigCodeMLP(nn.Module): method __init__ (line 76) | def __init__(self, config: GPTBigCodeConfig): method forward (line 82) | def forward(self, x: Tensor): class GPTBigCodeAttention (line 89) | class GPTBigCodeAttention(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 90) | def __init__(self, config: GPTBigCodeConfig): method forward (line 109) | def forward( class GPTBigCodeBlock (line 131) | class GPTBigCodeBlock(nn.Module): method __init__ (line 132) | def __init__(self, config: GPTBigCodeConfig): method forward (line 157) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class GPTBigCodeModel (line 165) | class GPTBigCodeModel(nn.Module): method __init__ (line 166) | def __init__(self, config: GPTBigCodeConfig): method forward (line 173) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class GPTBigCodeForCausalLM (line 188) | class GPTBigCodeForCausalLM(nn.Module): # pylint: disable=too-many-inst... method __init__ (line 189) | def __init__(self, config: GPTBigCodeConfig): method to (line 200) | def to(self, dtype: Optional[str] = None): method batch_forward (line 205) | def batch_forward( method embed (line 221) | def embed(self, input_ids: Tensor): method prefill (line 226) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 240) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 249) | def batch_prefill( method batch_decode (line 260) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 264) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 268) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 294) | def get_default_spec(self): FILE: python/mlc_llm/model/gpt_j/gpt_j_model.py class GPTJConfig (line 25) | class GPTJConfig(ConfigBase): # pylint: disable=too-many-instance-attri... method __post_init__ (line 44) | def __post_init__(self): class GPTJAttention (line 85) | class GPTJAttention(nn.Module): # pylint: disable=too-many-instance-att... method __init__ (line 86) | def __init__(self, config: GPTJConfig): method forward (line 100) | def forward( # pylint: disable=too-many-locals class GPTJMLP (line 129) | class GPTJMLP(nn.Module): method __init__ (line 130) | def __init__(self, config: GPTJConfig): # in MLP: intermediate_size= ... method forward (line 137) | def forward(self, hidden_states: Tensor): class GPTJBlock (line 144) | class GPTJBlock(nn.Module): method __init__ (line 145) | def __init__(self, config: GPTJConfig): method forward (line 172) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 180) | def _apply_residual(self, out, residual): class GPTJModel (line 186) | class GPTJModel(nn.Module): method __init__ (line 187) | def __init__(self, config: GPTJConfig): method forward (line 194) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class GPTJForCausalLM (line 202) | class GPTJForCausalLM(nn.Module): # pylint: disable=too-many-instance-a... method __init__ (line 203) | def __init__(self, config: GPTJConfig): method to (line 218) | def to(self, dtype: Optional[str] = None): method batch_forward (line 223) | def batch_forward( method embed (line 239) | def embed(self, input_ids: Tensor): method prefill (line 244) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 258) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 267) | def batch_prefill( method batch_decode (line 278) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 282) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 286) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 314) | def get_default_spec(self): FILE: python/mlc_llm/model/gpt_neox/gpt_neox_loader.py function huggingface (line 16) | def huggingface(model_config: GPTNeoXConfig, quantization: Quantization)... FILE: python/mlc_llm/model/gpt_neox/gpt_neox_model.py class GPTNeoXConfig (line 23) | class GPTNeoXConfig(ConfigBase): # pylint: disable=too-many-instance-at... method __post_init__ (line 43) | def __post_init__(self): class GPTNeoXAttention (line 90) | class GPTNeoXAttention(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 93) | def __init__(self, config: GPTNeoXConfig): method forward (line 112) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class GPTNeoXMLP (line 131) | class GPTNeoXMLP(nn.Module): method __init__ (line 132) | def __init__(self, config: GPTNeoXConfig): method forward (line 152) | def forward(self, hidden_states: Tensor): class GPTNeoXLayer (line 166) | class GPTNeoXLayer(nn.Module): method __init__ (line 167) | def __init__(self, config: GPTNeoXConfig): method forward (line 205) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 226) | def _apply_residual(self, out, residual): class GPTNeoXModel (line 232) | class GPTNeoXModel(nn.Module): method __init__ (line 233) | def __init__(self, config: GPTNeoXConfig): method forward (line 238) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class GPTNeoXForCausalLM (line 247) | class GPTNeoXForCausalLM(nn.Module): # pylint: disable=too-many-instanc... method __init__ (line 248) | def __init__(self, config: GPTNeoXConfig): method to (line 266) | def to(self, dtype: Optional[str] = None): method batch_forward (line 271) | def batch_forward( method embed (line 287) | def embed(self, input_ids: Tensor): method prefill (line 292) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 306) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 315) | def batch_prefill( method batch_decode (line 326) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 330) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 334) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 361) | def get_default_spec(self): FILE: python/mlc_llm/model/internlm/internlm_model.py class InternLMConfig (line 23) | class InternLMConfig(ConfigBase): # pylint: disable=too-many-instance-a... method __post_init__ (line 44) | def __post_init__(self): class InternLMAttention (line 85) | class InternLMAttention(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 86) | def __init__(self, config: InternLMConfig): method forward (line 102) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class InternLMMLP (line 117) | class InternLMMLP(nn.Module): method __init__ (line 118) | def __init__(self, config: InternLMConfig): method forward (line 133) | def forward(self, x): class InternLMDecoderLayer (line 139) | class InternLMDecoderLayer(nn.Module): method __init__ (line 140) | def __init__(self, config: InternLMConfig): method forward (line 187) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 194) | def _apply_residual(self, out, residual): class InternLMModel (line 200) | class InternLMModel(nn.Module): method __init__ (line 201) | def __init__(self, config: InternLMConfig): method forward (line 208) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class InternLMForCausalLM (line 216) | class InternLMForCausalLM(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 217) | def __init__(self, config: InternLMConfig): method to (line 230) | def to(self, dtype: Optional[str] = None): method batch_forward (line 235) | def batch_forward( method embed (line 251) | def embed(self, input_ids: Tensor): method prefill (line 256) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 270) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 279) | def batch_prefill( method batch_decode (line 290) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 294) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 298) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 324) | def get_default_spec(self): FILE: python/mlc_llm/model/internlm2/internlm2_loader.py function huggingface (line 17) | def huggingface(model_config: InternLM2ForCausalLM, quantization: Quanti... FILE: python/mlc_llm/model/internlm2/internlm2_model.py class InternLM2Config (line 23) | class InternLM2Config(ConfigBase): # pylint: disable=too-many-instance-... method __post_init__ (line 46) | def __post_init__(self): class InternLM2Attention (line 87) | class InternLM2Attention(nn.Module): # pylint: disable=too-many-instanc... method __init__ (line 88) | def __init__(self, config: InternLM2Config): method forward (line 108) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class InternLM2MLP (line 123) | class InternLM2MLP(nn.Module): method __init__ (line 124) | def __init__(self, config: InternLM2Config): method forward (line 138) | def forward(self, x: Tensor): class InternLM2DecoderLayer (line 144) | class InternLM2DecoderLayer(nn.Module): method __init__ (line 145) | def __init__(self, config: InternLM2Config): method forward (line 179) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 190) | def _apply_residual(self, out, residual): class InternLM2Model (line 196) | class InternLM2Model(nn.Module): method __init__ (line 197) | def __init__(self, config: InternLM2Config): method forward (line 205) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class InternLM2ForCausalLM (line 213) | class InternLM2ForCausalLM(nn.Module): # pylint: disable=R0902 method __init__ (line 214) | def __init__(self, config: InternLM2Config): method to (line 227) | def to(self, dtype: Optional[str] = None): method batch_forward (line 232) | def batch_forward( method embed (line 248) | def embed(self, input_ids: Tensor): method prefill (line 253) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 267) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 276) | def batch_prefill( method batch_decode (line 287) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 291) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 295) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 321) | def get_default_spec(self): FILE: python/mlc_llm/model/llama/llama_loader.py function awq (line 25) | def awq(model_config: LlamaConfig, quantization: Quantization) -> Extern... FILE: python/mlc_llm/model/llama/llama_model.py class LlamaConfig (line 23) | class LlamaConfig(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 45) | def __post_init__(self): # pylint: disable=too-many-branches class LlamaFFN (line 108) | class LlamaFFN(nn.Module): method __init__ (line 109) | def __init__(self, config: LlamaConfig): method forward (line 124) | def forward(self, x: Tensor): class LlamaEmbedding (line 130) | class LlamaEmbedding(nn.Embedding): method lm_head_forward (line 133) | def lm_head_forward(self, x: nn.Tensor): class LlamaAttention (line 141) | class LlamaAttention(nn.Module): # pylint: disable=too-many-instance-at... method __init__ (line 142) | def __init__(self, config: LlamaConfig): method forward (line 159) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class LlamaDecoderLayer (line 175) | class LlamaDecoderLayer(nn.Module): method __init__ (line 176) | def __init__(self, config: LlamaConfig): method forward (line 206) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 213) | def _apply_residual(self, out, residual): class LlamaModel (line 219) | class LlamaModel(nn.Module): method __init__ (line 220) | def __init__(self, config: LlamaConfig): method forward (line 239) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class LlamaForCausalLM (line 249) | class LlamaForCausalLM(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 250) | def __init__(self, config: LlamaConfig): method to (line 284) | def to(self, dtype: Optional[str] = None): method batch_forward (line 289) | def batch_forward( method batch_forward_to_last_hidden_states (line 304) | def batch_forward_to_last_hidden_states( method embed (line 314) | def embed(self, input_ids: Tensor): method get_logits (line 319) | def get_logits(self, hidden_states: Tensor): method batch_select_last_hidden_states (line 329) | def batch_select_last_hidden_states(self, hidden_states: Tensor, logit... method prefill (line 336) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 348) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method prefill_to_last_hidden_states (line 355) | def prefill_to_last_hidden_states(self, input_embed: Tensor, paged_kv_... method decode_to_last_hidden_states (line 361) | def decode_to_last_hidden_states(self, input_embed: Tensor, paged_kv_c... method batch_prefill (line 367) | def batch_prefill( method batch_decode (line 376) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 380) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_prefill_to_last_hidden_states (line 384) | def batch_prefill_to_last_hidden_states( method batch_decode_to_last_hidden_states (line 390) | def batch_decode_to_last_hidden_states( method batch_verify_to_last_hidden_states (line 396) | def batch_verify_to_last_hidden_states( method create_paged_kv_cache (line 402) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 431) | def get_default_spec(self): FILE: python/mlc_llm/model/llama4/llama4_loader.py function huggingface (line 16) | def huggingface(model_config: Llama4Config, quantization: Quantization) ... FILE: python/mlc_llm/model/llama4/llama4_model.py class Llama4TextConfig (line 26) | class Llama4TextConfig(ConfigBase): # pylint: disable=too-many-instance... method __post_init__ (line 56) | def __post_init__(self): # pylint: disable=too-many-branches class Llama4Config (line 96) | class Llama4Config(ConfigBase): # pylint: disable=too-many-instance-att... method __post_init__ (line 111) | def __post_init__(self) -> None: class Llama4TextMLP (line 161) | class Llama4TextMLP(nn.Module): method __init__ (line 162) | def __init__(self, config: Llama4Config): method forward (line 181) | def forward(self, x: Tensor): class LlamaEmbedding (line 189) | class LlamaEmbedding(nn.Embedding): method lm_head_forward (line 192) | def lm_head_forward(self, x: nn.Tensor): class Llama4TextL2Norm (line 200) | class Llama4TextL2Norm(nn.Module): method __init__ (line 201) | def __init__(self, eps, hidden_size): method forward (line 205) | def forward(self, x): class Llama4TextAttention (line 210) | class Llama4TextAttention(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 211) | def __init__(self, config: Llama4Config, layer_idx): method forward (line 264) | def forward( # pylint: disable=too-many-locals class Llama4TextExperts (line 338) | class Llama4TextExperts(nn.Module): method __init__ (line 339) | def __init__(self, config: Llama4Config): method forward (line 353) | def forward(self, hidden_states): class Llama4Router (line 362) | class Llama4Router(nn.Module): method __init__ (line 363) | def __init__(self, config: Llama4Config): method forward (line 373) | def forward(self, hidden_states): class Llama4TextMoe (line 390) | class Llama4TextMoe(nn.Module): method __init__ (line 391) | def __init__(self, config: Llama4Config): method forward (line 399) | def forward(self, hidden_states): class Llama4TextDecoderLayer (line 419) | class Llama4TextDecoderLayer(nn.Module): method __init__ (line 420) | def __init__(self, config: Llama4Config, layer_idx): method forward (line 488) | def forward( method _apply_residual (line 510) | def _apply_residual(self, out, residual): class Llama4TextModel (line 516) | class Llama4TextModel(nn.Module): method __init__ (line 517) | def __init__(self, config: Llama4Config): method forward (line 533) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class Llama4ForCausalLM (line 545) | class Llama4ForCausalLM(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 546) | def __init__(self, config: Llama4Config): method to (line 564) | def to(self, dtype: Optional[str] = None): method batch_forward (line 569) | def batch_forward( method batch_forward_to_last_hidden_states (line 584) | def batch_forward_to_last_hidden_states( method embed (line 594) | def embed(self, input_ids: Tensor): method get_logits (line 599) | def get_logits(self, hidden_states: Tensor): method batch_select_last_hidden_states (line 609) | def batch_select_last_hidden_states(self, hidden_states: Tensor, logit... method prefill (line 616) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 628) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method prefill_to_last_hidden_states (line 635) | def prefill_to_last_hidden_states(self, input_embed: Tensor, paged_kv_... method decode_to_last_hidden_states (line 641) | def decode_to_last_hidden_states(self, input_embed: Tensor, paged_kv_c... method batch_prefill (line 647) | def batch_prefill( method batch_decode (line 656) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 660) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_prefill_to_last_hidden_states (line 664) | def batch_prefill_to_last_hidden_states( method batch_decode_to_last_hidden_states (line 670) | def batch_decode_to_last_hidden_states( method batch_verify_to_last_hidden_states (line 676) | def batch_verify_to_last_hidden_states( method create_paged_kv_cache (line 682) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 710) | def get_default_spec(self): FILE: python/mlc_llm/model/llava/llava_loader.py function _num_layers (line 19) | def _num_layers(config: object) -> int: function awq (line 31) | def awq(model_config: LlavaConfig, quantization: Quantization) -> Extern... FILE: python/mlc_llm/model/llava/llava_model.py class LlavaConfig (line 36) | class LlavaConfig(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 53) | def __post_init__(self) -> None: method get_hf_config (line 90) | def get_hf_config(self, text_config_dict: Dict[str, Any]) -> Dict[str,... class LlavaMultiModalProjector (line 121) | class LlavaMultiModalProjector(nn.Module): method __init__ (line 122) | def __init__(self, config: LlavaConfig): method forward (line 133) | def forward(self, image_features: Tensor) -> Tensor: class LlavaForCausalLM (line 140) | class LlavaForCausalLM(Module): method __init__ (line 141) | def __init__(self, config: LlavaConfig): method to (line 151) | def to(self, dtype: Optional[str] = None): method embed (line 157) | def embed(self, input_ids: Tensor) -> Tensor: method image_preprocess (line 160) | def image_preprocess(self, pixel_values: Tensor) -> Tensor: method image_embed (line 179) | def image_embed(self, pixel_values: Tensor) -> Tensor: method batch_forward (line 196) | def batch_forward( method prefill (line 206) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 211) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 216) | def batch_prefill( method batch_decode (line 224) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 227) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 230) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 258) | def get_default_spec(self): FILE: python/mlc_llm/model/medusa/medusa_model.py class MedusaConfig (line 15) | class MedusaConfig(ConfigBase): # pylint: disable=too-many-instance-att... class ResBlock (line 35) | class ResBlock(nn.Module): method __init__ (line 38) | def __init__(self, hidden_size): method forward (line 43) | def forward(self, x): class MedusaModel (line 47) | class MedusaModel(nn.Module): method __init__ (line 50) | def __init__(self, config: MedusaConfig): method get_default_spec (line 63) | def get_default_spec(self): method get_logits (line 75) | def get_logits(self, hidden_states: nn.Tensor): method to (line 81) | def to(self, dtype: Optional[str] = None): FILE: python/mlc_llm/model/minicpm/minicpm_loader.py function huggingface (line 16) | def huggingface(model_config: MiniCPMConfig, quantization: Quantization)... FILE: python/mlc_llm/model/minicpm/minicpm_model.py class MiniCPMConfig (line 26) | class MiniCPMConfig(ConfigBase): # pylint: disable=too-many-instance-at... method __post_init__ (line 54) | def __post_init__(self): class MiniCPMAttention (line 95) | class MiniCPMAttention(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 96) | def __init__(self, config: MiniCPMConfig): method forward (line 120) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class MiniCPMEmbedding (line 144) | class MiniCPMEmbedding(nn.Embedding): method lm_head_forward (line 149) | def lm_head_forward(self, x: nn.Tensor): class MiniCPMMLP (line 157) | class MiniCPMMLP(nn.Module): method __init__ (line 158) | def __init__(self, config: MiniCPMConfig): method forward (line 171) | def forward(self, x: Tensor): class MiniCPMMoE (line 177) | class MiniCPMMoE(nn.Module): method __init__ (line 178) | def __init__(self, config: MiniCPMConfig): method forward (line 197) | def forward(self, x: Tensor): # pylint: disable=too-many-locals class MiniCPMDecoderLayer (line 255) | class MiniCPMDecoderLayer(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 256) | def __init__(self, config: MiniCPMConfig): method forward (line 304) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 321) | def _apply_residual(self, out, residual): class MiniCPMModel (line 327) | class MiniCPMModel(nn.Module): method __init__ (line 328) | def __init__(self, config: MiniCPMConfig): method forward (line 336) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class MiniCPMForCausalLM (line 344) | class MiniCPMForCausalLM(nn.Module): # pylint: disable=too-many-instanc... method __init__ (line 345) | def __init__(self, config: MiniCPMConfig): method to (line 363) | def to(self, dtype: Optional[str] = None): method batch_forward (line 368) | def batch_forward( method embed (line 387) | def embed(self, input_ids: Tensor): method prefill (line 392) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 409) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 421) | def batch_prefill( method batch_decode (line 432) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 436) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 440) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 466) | def get_default_spec(self): FILE: python/mlc_llm/model/ministral3/ministral3_loader.py function _dequantize_block_scale_weight (line 17) | def _dequantize_block_scale_weight( # pylint: disable=too-many-locals function huggingface (line 44) | def huggingface( # pylint: disable=too-many-locals,too-many-statements FILE: python/mlc_llm/model/ministral3/ministral3_model.py class Ministral3Config (line 25) | class Ministral3Config(ConfigBase): # pylint: disable=too-many-instance... method from_dict (line 52) | def from_dict( # type: ignore[override] method __post_init__ (line 68) | def __post_init__(self): # pylint: disable=too-many-branches,too-many... class Ministral3Embedding (line 178) | class Ministral3Embedding(nn.Embedding): method lm_head_forward (line 183) | def lm_head_forward(self, x: nn.Tensor): class Ministral3MLP (line 194) | class Ministral3MLP(nn.Module): method __init__ (line 197) | def __init__(self, config: Ministral3Config): method forward (line 213) | def forward(self, x: Tensor): function yarn_get_sm_scale (line 219) | def yarn_get_sm_scale(scale=1, mscale=1): class Ministral3Attention (line 225) | class Ministral3Attention(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 228) | def __init__(self, config: Ministral3Config): method forward (line 252) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class Ministral3DecoderLayer (line 268) | class Ministral3DecoderLayer(nn.Module): method __init__ (line 271) | def __init__(self, config: Ministral3Config): method forward (line 301) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 308) | def _apply_residual(self, out, residual): class Ministral3Model (line 314) | class Ministral3Model(nn.Module): method __init__ (line 317) | def __init__(self, config: Ministral3Config): method forward (line 327) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class Mistral3ForConditionalGeneration (line 335) | class Mistral3ForConditionalGeneration(nn.Module): # pylint: disable=to... method __init__ (line 336) | def __init__(self, config: Ministral3Config): method _mark_modules_no_quant (line 357) | def _mark_modules_no_quant(self, modules: Tuple[str, ...]): method to (line 371) | def to(self, dtype: Optional[str] = None): method batch_forward (line 376) | def batch_forward( method embed (line 396) | def embed(self, input_ids: Tensor): method prefill (line 401) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 419) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 432) | def batch_prefill( method batch_decode (line 443) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 447) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 451) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 478) | def get_default_spec(self): FILE: python/mlc_llm/model/mistral/mistral_loader.py function awq (line 25) | def awq(model_config: MistralConfig, quantization: Quantization) -> Exte... FILE: python/mlc_llm/model/mistral/mistral_model.py class MistralConfig (line 23) | class MistralConfig(ConfigBase): # pylint: disable=too-many-instance-at... method __post_init__ (line 43) | def __post_init__(self): # pylint: disable=too-many-branches class MistralMLP (line 98) | class MistralMLP(nn.Module): method __init__ (line 101) | def __init__(self, config: MistralConfig): method forward (line 116) | def forward(self, x: Tensor): class MistralAttention (line 122) | class MistralAttention(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 125) | def __init__(self, config: MistralConfig): method forward (line 141) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class MistralDecoderLayer (line 157) | class MistralDecoderLayer(nn.Module): method __init__ (line 160) | def __init__(self, config: MistralConfig): method forward (line 190) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 197) | def _apply_residual(self, out, residual): class MistralModel (line 203) | class MistralModel(nn.Module): method __init__ (line 206) | def __init__(self, config: MistralConfig): method forward (line 215) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class MistralForCausalLM (line 223) | class MistralForCausalLM(nn.Module): # pylint: disable=too-many-instanc... method __init__ (line 226) | def __init__(self, config: MistralConfig): method to (line 240) | def to(self, dtype: Optional[str] = None): method batch_forward (line 245) | def batch_forward( method embed (line 261) | def embed(self, input_ids: Tensor): method prefill (line 266) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 280) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 289) | def batch_prefill( method batch_decode (line 300) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 304) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 308) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 334) | def get_default_spec(self): FILE: python/mlc_llm/model/mixtral/mixtral_loader.py function huggingface (line 16) | def huggingface(model_config: MixtralConfig, quantization: Quantization)... FILE: python/mlc_llm/model/mixtral/mixtral_model.py class MixtralConfig (line 25) | class MixtralConfig(LlamaConfig): # pylint: disable=too-many-instance-a... class MixtralMoE (line 35) | class MixtralMoE(nn.Module): method __init__ (line 38) | def __init__(self, config: MixtralConfig): method forward (line 67) | def forward(self, x: Tensor): class MixtralDecoderLayer (line 125) | class MixtralDecoderLayer(nn.Module): method __init__ (line 128) | def __init__(self, config: MixtralConfig): method forward (line 155) | def forward(self, hidden_states: Tensor, attention_mask: Tensor, total... method batch_forward (line 163) | def batch_forward(self, hidden_states: Tensor, paged_kv_cache: PagedKV... method _apply_residual (line 170) | def _apply_residual(self, out, residual): class MixtralModel (line 176) | class MixtralModel(LlamaModel): method __init__ (line 179) | def __init__(self, config: MixtralConfig): class MixtralForCausalLM (line 186) | class MixtralForCausalLM(LlamaForCausalLM): method __init__ (line 189) | def __init__(self, config: MixtralConfig): FILE: python/mlc_llm/model/model.py class EmbeddingMetadata (line 65) | class EmbeddingMetadata: class Model (line 86) | class Model: method __post_init__ (line 123) | def __post_init__(self): FILE: python/mlc_llm/model/nemotron/nemotron_model.py class NemotronConfig (line 23) | class NemotronConfig(ConfigBase): # pylint: disable=too-many-instance-a... method __post_init__ (line 48) | def __post_init__(self): # pylint: disable=too-many-branches class NemotronMLP (line 75) | class NemotronMLP(nn.Module): method __init__ (line 78) | def __init__(self, config: NemotronConfig): method forward (line 88) | def forward(self, x: Tensor) -> Tensor: class NemotronEmbedding (line 96) | class NemotronEmbedding(nn.Embedding): method lm_head_forward (line 99) | def lm_head_forward(self, x: Tensor): class NemotronLayerNorm1P (line 107) | class NemotronLayerNorm1P(nn.LayerNorm): method __init__ (line 110) | def __init__(self, normalized_shape: int, eps: float = 1e-5, elementwi... method forward (line 113) | def forward(self, x: Tensor) -> Tensor: class NemotronAttention (line 124) | class NemotronAttention(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 125) | def __init__(self, config: NemotronConfig): method forward (line 142) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class NemotronDecoderLayer (line 158) | class NemotronDecoderLayer(nn.Module): method __init__ (line 159) | def __init__(self, config: NemotronConfig): method forward (line 184) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 191) | def _apply_residual(self, out, residual): class NemotronModel (line 197) | class NemotronModel(nn.Module): method __init__ (line 198) | def __init__(self, config: NemotronConfig): method forward (line 217) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class NemotronForCausalLM (line 227) | class NemotronForCausalLM(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 228) | def __init__(self, config: NemotronConfig): method to (line 263) | def to(self, dtype: Optional[str] = None): method batch_forward (line 268) | def batch_forward( method batch_forward_to_last_hidden_states (line 283) | def batch_forward_to_last_hidden_states( method embed (line 293) | def embed(self, input_ids: Tensor): method get_logits (line 298) | def get_logits(self, hidden_states: Tensor): method batch_select_last_hidden_states (line 308) | def batch_select_last_hidden_states(self, hidden_states: Tensor, logit... method prefill (line 315) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 327) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method prefill_to_last_hidden_states (line 334) | def prefill_to_last_hidden_states(self, input_embed: Tensor, paged_kv_... method decode_to_last_hidden_states (line 340) | def decode_to_last_hidden_states(self, input_embed: Tensor, paged_kv_c... method batch_prefill (line 346) | def batch_prefill( method batch_decode (line 355) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 359) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_prefill_to_last_hidden_states (line 363) | def batch_prefill_to_last_hidden_states( method batch_decode_to_last_hidden_states (line 369) | def batch_decode_to_last_hidden_states( method batch_verify_to_last_hidden_states (line 375) | def batch_verify_to_last_hidden_states( method create_paged_kv_cache (line 381) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 411) | def get_default_spec(self): FILE: python/mlc_llm/model/olmo/olmo_loader.py function awq (line 25) | def awq(model_config: OLMoConfig, quantization: Quantization) -> ExternM... FILE: python/mlc_llm/model/olmo/olmo_model.py class OLMoConfig (line 25) | class OLMoConfig(ConfigBase): # pylint: disable=too-many-instance-attri... method __post_init__ (line 47) | def __post_init__(self): # pylint: disable=too-many-branches class OLMoEmbedding (line 107) | class OLMoEmbedding(nn.Embedding): method lm_head_forward (line 110) | def lm_head_forward(self, x: nn.Tensor): class OLMoAttention (line 118) | class OLMoAttention(nn.Module): # pylint: disable=missing-class-docstring method __init__ (line 119) | def __init__(self, config: OLMoConfig): method forward (line 141) | def forward( # pylint: disable=missing-function-docstring class OLMoFFN (line 175) | class OLMoFFN(nn.Module): # pylint: disable=missing-class-docstring method __init__ (line 176) | def __init__(self, config: OLMoConfig): method forward (line 196) | def forward(self, x: Tensor): # pylint: disable=missing-function-docs... class OLMoDecoderLayer (line 205) | class OLMoDecoderLayer(nn.Module): # pylint: disable=missing-class-docs... method __init__ (line 206) | def __init__(self, config: OLMoConfig): method _apply_residual (line 243) | def _apply_residual(self, out, residual): method forward (line 248) | def forward( # pylint: disable=missing-function-docstring class OLMoModel (line 258) | class OLMoModel(nn.Module): # pylint: disable=missing-class-docstring method __init__ (line 259) | def __init__(self, config: OLMoConfig): method forward (line 282) | def forward( # pylint: disable=missing-function-docstring class OLMoForCausalLM (line 294) | class OLMoForCausalLM( # pylint: disable=missing-class-docstring,too-ma... method __init__ (line 297) | def __init__(self, config: OLMoConfig): method to (line 329) | def to(self, dtype: Optional[str] = None): method batch_forward (line 334) | def batch_forward( # pylint: disable=missing-function-docstring method batch_forward_to_last_hidden_states (line 348) | def batch_forward_to_last_hidden_states( # pylint: disable=missing-fu... method embed (line 357) | def embed(self, input_ids: Tensor): # pylint: disable=missing-functio... method get_logits (line 362) | def get_logits(self, hidden_states: Tensor): # pylint: disable=missin... method batch_select_last_hidden_states (line 372) | def batch_select_last_hidden_states( # pylint: disable=missing-functi... method prefill (line 381) | def prefill( # pylint: disable=missing-function-docstring method decode (line 397) | def decode( # pylint: disable=missing-function-docstring method prefill_to_last_hidden_states (line 405) | def prefill_to_last_hidden_states( # pylint: disable=missing-function... method decode_to_last_hidden_states (line 412) | def decode_to_last_hidden_states( # pylint: disable=missing-function-... method batch_prefill (line 419) | def batch_prefill( # pylint: disable=missing-function-docstring method batch_decode (line 428) | def batch_decode( # pylint: disable=missing-function-docstring method batch_verify (line 434) | def batch_verify( # pylint: disable=missing-function-docstring method batch_prefill_to_last_hidden_states (line 440) | def batch_prefill_to_last_hidden_states( # pylint: disable=missing-fu... method batch_decode_to_last_hidden_states (line 446) | def batch_decode_to_last_hidden_states( # pylint: disable=missing-fun... method batch_verify_to_last_hidden_states (line 452) | def batch_verify_to_last_hidden_states( # pylint: disable=missing-fun... method create_paged_kv_cache (line 458) | def create_paged_kv_cache( # pylint: disable=missing-function-docstri... method get_default_spec (line 486) | def get_default_spec(self): # pylint: disable=missing-function-docstring FILE: python/mlc_llm/model/orion/orion_model.py class OrionConfig (line 23) | class OrionConfig(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 41) | def __post_init__(self): class OrionFFN (line 90) | class OrionFFN(nn.Module): method __init__ (line 91) | def __init__(self, config: OrionConfig): method forward (line 106) | def forward(self, x: Tensor): class OrionAttention (line 112) | class OrionAttention(nn.Module): # pylint: disable=too-many-instance-at... method __init__ (line 113) | def __init__(self, config: OrionConfig): method forward (line 130) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class OrionDecoderLayer (line 146) | class OrionDecoderLayer(nn.Module): method __init__ (line 147) | def __init__(self, config: OrionConfig): method forward (line 177) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 184) | def _apply_residual(self, out, residual): class OrionModel (line 190) | class OrionModel(nn.Module): method __init__ (line 191) | def __init__(self, config: OrionConfig): method forward (line 200) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class OrionForCausalLM (line 208) | class OrionForCausalLM(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 209) | def __init__(self, config: OrionConfig): method to (line 222) | def to(self, dtype: Optional[str] = None): method batch_forward (line 227) | def batch_forward( method embed (line 243) | def embed(self, input_ids: Tensor): method prefill (line 248) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 262) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 271) | def batch_prefill( method batch_decode (line 282) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 286) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 290) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 316) | def get_default_spec(self): FILE: python/mlc_llm/model/phi/phi_loader.py function huggingface (line 16) | def huggingface(model_config: PhiConfig, quantization: Quantization) -> ... function phi1_huggingface (line 87) | def phi1_huggingface(model_config: Phi1Config, quantization: Quantizatio... FILE: python/mlc_llm/model/phi/phi_model.py class Phi1Config (line 23) | class Phi1Config(ConfigBase): # pylint: disable=too-many-instance-attri... method __post_init__ (line 42) | def __post_init__(self): class PhiConfig (line 91) | class PhiConfig(ConfigBase): # pylint: disable=too-many-instance-attrib... method __post_init__ (line 111) | def __post_init__(self): method from_phi1 (line 149) | def from_phi1(config: Phi1Config) -> "PhiConfig": class PhiMLP (line 174) | class PhiMLP(nn.Module): method __init__ (line 175) | def __init__(self, config: PhiConfig): method forward (line 186) | def forward(self, hidden_states: Tensor): class PhiMHA (line 194) | class PhiMHA(nn.Module): # pylint: disable=too-many-instance-attributes method __init__ (line 195) | def __init__(self, config: PhiConfig): method forward (line 211) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class PhiParallelBlock (line 227) | class PhiParallelBlock(nn.Module): method __init__ (line 228) | def __init__(self, config: PhiConfig): method forward (line 259) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_parallel_residual (line 276) | def _apply_parallel_residual(self, attn_out, mlp_out, residual): class PhiCausalLMHead (line 284) | class PhiCausalLMHead(nn.Module): method __init__ (line 285) | def __init__(self, config: PhiConfig) -> None: method forward (line 291) | def forward(self, hidden_states: Tensor): class PhiModel (line 300) | class PhiModel(nn.Module): method __init__ (line 301) | def __init__(self, config: PhiConfig) -> None: method forward (line 306) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class PhiForCausalLM (line 314) | class PhiForCausalLM(nn.Module): method __init__ (line 316) | def __init__(self, config: Union[PhiConfig, Phi1Config]) -> None: method to (line 335) | def to(self, dtype: Optional[str] = None): method batch_forward (line 340) | def batch_forward( method prefill (line 356) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 372) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 381) | def batch_prefill( method batch_decode (line 392) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 396) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method embed (line 400) | def embed(self, input_ids: Tensor): method create_paged_kv_cache (line 406) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 433) | def get_default_spec(self): FILE: python/mlc_llm/model/phi3/phi3_loader.py function phi3_huggingface (line 14) | def phi3_huggingface(model_config: Phi3Config, quantization: Quantizatio... FILE: python/mlc_llm/model/phi3/phi3_model.py class Phi3Config (line 23) | class Phi3Config(ConfigBase): # pylint: disable=too-many-instance-attri... method __post_init__ (line 47) | def __post_init__(self): class Phi3Embedding (line 102) | class Phi3Embedding(nn.Embedding): method lm_head_forward (line 105) | def lm_head_forward(self, x: nn.Tensor): class Phi3MLP (line 113) | class Phi3MLP(nn.Module): method __init__ (line 114) | def __init__(self, config: Phi3Config): method forward (line 125) | def forward(self, hidden_states: Tensor): class PhiMHA (line 132) | class PhiMHA(nn.Module): # pylint: disable=too-many-instance-attributes method __init__ (line 133) | def __init__(self, config: Phi3Config): method forward (line 153) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class Phi3ParallelBlock (line 169) | class Phi3ParallelBlock(nn.Module): method __init__ (line 170) | def __init__(self, config: Phi3Config): method forward (line 204) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_parallel_residual (line 211) | def _apply_parallel_residual(self, mlp_out, residual): class Phi3Model (line 217) | class Phi3Model(nn.Module): method __init__ (line 218) | def __init__(self, config: Phi3Config) -> None: method forward (line 224) | def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): class Phi3ForCausalLM (line 232) | class Phi3ForCausalLM(nn.Module): method __init__ (line 234) | def __init__(self, config: Phi3Config) -> None: method to (line 258) | def to(self, dtype: Optional[str] = None): method get_logits (line 263) | def get_logits(self, hidden_states: Tensor): method batch_forward (line 273) | def batch_forward( method prefill (line 286) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 298) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 305) | def batch_prefill( method batch_decode (line 316) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 320) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method embed (line 324) | def embed(self, input_ids: Tensor): method create_paged_kv_cache (line 330) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 359) | def get_default_spec(self): FILE: python/mlc_llm/model/phi3v/phi3v_image.py class ImageProjection (line 16) | class ImageProjection(Module): # pylint: disable=too-many-instance-attr... method __init__ (line 17) | def __init__(self, config: ConfigBase): method forward (line 25) | def forward(self, image_features: Tensor) -> Tensor: class Phi3ImageEmbedding (line 55) | class Phi3ImageEmbedding(Module): method __init__ (line 56) | def __init__(self, config: ConfigBase): method apply_schedule (line 69) | def apply_schedule(self, sch, block, bdx=32, tile=[32, 32]): method dyn_repeat_4d_tensor (line 80) | def dyn_repeat_4d_tensor(self, input_tensor, r0, r1, r2, r3) -> Tensor: method dyn_concate_dim_2 (line 119) | def dyn_concate_dim_2(self, input_1, input_2) -> Tensor: method dyn_concate_dim_1 (line 158) | def dyn_concate_dim_1(self, input_1, input_2) -> Tensor: method get_img_features (line 192) | def get_img_features(self, img_embeds: Tensor) -> Tensor: method reshape_hd_patches_2x2merge (line 197) | def reshape_hd_patches_2x2merge(self, image_features, h_crop, w_crop): method add_image_newline (line 267) | def add_image_newline(self, image_features_hd): method forward (line 283) | def forward(self, pixel_values: Tensor, h_crop, w_crop) -> Tensor: FILE: python/mlc_llm/model/phi3v/phi3v_loader.py function huggingface (line 15) | def huggingface(model_config: Phi3VConfig, quantization: Quantization) -... FILE: python/mlc_llm/model/phi3v/phi3v_model.py class Phi3VConfig (line 38) | class Phi3VConfig(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 63) | def __post_init__(self): class Phi3VForCausalLM (line 130) | class Phi3VForCausalLM(nn.Module): method __init__ (line 132) | def __init__(self, config: Phi3VConfig) -> None: method to (line 161) | def to(self, dtype: Optional[str] = None): method batch_forward (line 166) | def batch_forward( method prefill (line 182) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 198) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 207) | def batch_prefill( method batch_decode (line 218) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 222) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method embed (line 226) | def embed(self, input_ids: Tensor): method image_preprocess (line 233) | def image_preprocess( method image_embed (line 283) | def image_embed( # pylint: disable=too-many-arguments method create_paged_kv_cache (line 296) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 324) | def get_default_spec(self): FILE: python/mlc_llm/model/qwen/qwen_model.py class QWenConfig (line 23) | class QWenConfig(ConfigBase): # pylint: disable=too-many-instance-attri... method __post_init__ (line 42) | def __post_init__(self): class QWenAttention (line 83) | class QWenAttention(nn.Module): # pylint: disable=too-many-instance-att... method __init__ (line 84) | def __init__(self, config: QWenConfig): method forward (line 98) | def forward( # pylint: disable=too-many-locals class QWenMLP (line 118) | class QWenMLP(nn.Module): method __init__ (line 119) | def __init__(self, config: QWenConfig): method forward (line 133) | def forward(self, x: Tensor): class QWenBlock (line 139) | class QWenBlock(nn.Module): method __init__ (line 140) | def __init__(self, config: QWenConfig): method forward (line 174) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 181) | def _apply_residual(self, out, residual): class QWenModel (line 187) | class QWenModel(nn.Module): method __init__ (line 188) | def __init__(self, config: QWenConfig): method forward (line 194) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class QWenLMHeadModel (line 202) | class QWenLMHeadModel(nn.Module): # pylint: disable=too-many-instance-a... method __init__ (line 203) | def __init__(self, config: QWenConfig): method to (line 215) | def to(self, dtype: Optional[str] = None): method batch_forward (line 220) | def batch_forward( method embed (line 235) | def embed(self, input_ids: Tensor): method prefill (line 240) | def prefill(self, inputs: Tensor, paged_kv_cache: PagedKVCache): method decode (line 258) | def decode(self, inputs: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 267) | def batch_prefill(self, inputs: Tensor, logit_positions: Tensor, paged... method batch_decode (line 273) | def batch_decode(self, inputs: Tensor, paged_kv_cache: PagedKVCache): method batch_verify (line 277) | def batch_verify(self, inputs: Tensor, paged_kv_cache: PagedKVCache): method create_paged_kv_cache (line 281) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 307) | def get_default_spec(self): FILE: python/mlc_llm/model/qwen2/qwen2_model.py class QWen2Config (line 24) | class QWen2Config(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 45) | def __post_init__(self): class QWen2Attention (line 86) | class QWen2Attention(nn.Module): # pylint: disable=too-many-instance-at... method __init__ (line 87) | def __init__(self, config: QWen2Config): method forward (line 107) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class Qwen2Embedding (line 131) | class Qwen2Embedding(nn.Embedding): method lm_head_forward (line 136) | def lm_head_forward(self, x: nn.Tensor): class QWen2MLP (line 144) | class QWen2MLP(nn.Module): method __init__ (line 145) | def __init__(self, config: QWen2Config): method forward (line 156) | def forward(self, x: Tensor): class QWen2DecoderLayer (line 162) | class QWen2DecoderLayer(nn.Module): method __init__ (line 163) | def __init__(self, config: QWen2Config): method forward (line 198) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 207) | def _apply_residual(self, out, residual): class QWen2Model (line 213) | class QWen2Model(nn.Module): method __init__ (line 214) | def __init__(self, config: QWen2Config): method forward (line 221) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class QWen2LMHeadModel (line 229) | class QWen2LMHeadModel(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 230) | def __init__(self, config: QWen2Config): method to (line 247) | def to(self, dtype: Optional[str] = None): method batch_forward (line 252) | def batch_forward( method embed (line 272) | def embed(self, input_ids: Tensor): method prefill (line 277) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 294) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 306) | def batch_prefill( method batch_decode (line 317) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 321) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 325) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 351) | def get_default_spec(self): FILE: python/mlc_llm/model/qwen2_5_vl/qwen2_5_vl_model.py class Qwen25VLVisionTokenConfig (line 47) | class Qwen25VLVisionTokenConfig: class Qwen25VLVisionGridConfig (line 57) | class Qwen25VLVisionGridConfig: class Qwen25VLAttentionState (line 66) | class Qwen25VLAttentionState: class Qwen25VLConfig (line 77) | class Qwen25VLConfig(ConfigBase): # pylint: disable=too-many-instance-a... method __post_init__ (line 106) | def __post_init__(self): # pylint: disable=too-many-branches method image_token_id (line 160) | def image_token_id(self) -> int: method video_token_id (line 164) | def video_token_id(self) -> int: method vision_start_token_id (line 168) | def vision_start_token_id(self) -> int: method vision_end_token_id (line 172) | def vision_end_token_id(self) -> int: method spatial_merge_size (line 176) | def spatial_merge_size(self) -> int: method temporal_patch_size (line 180) | def temporal_patch_size(self) -> int: method tokens_per_second (line 184) | def tokens_per_second(self) -> float: method vision_metadata (line 188) | def vision_metadata(self) -> VisionPositionMetadata: class Qwen25VLEmbedding (line 198) | class Qwen25VLEmbedding(nn.Embedding): method lm_head_forward (line 201) | def lm_head_forward(self, x: Tensor): class Qwen25VLAttention (line 206) | class Qwen25VLAttention(nn.Module): method __init__ (line 207) | def __init__(self, config: Qwen25VLConfig): method head_dim (line 240) | def head_dim(self) -> int: method num_attention_heads (line 244) | def num_attention_heads(self) -> int: method num_key_value_heads (line 248) | def num_key_value_heads(self) -> int: method forward (line 251) | def forward( # pylint: disable=too-many-locals class Qwen25VLMLP (line 274) | class Qwen25VLMLP(nn.Module): method __init__ (line 275) | def __init__(self, config: Qwen25VLConfig): method forward (line 286) | def forward(self, x: Tensor): class Qwen25VLDecoderLayer (line 292) | class Qwen25VLDecoderLayer(nn.Module): method __init__ (line 293) | def __init__(self, config: Qwen25VLConfig): method _set_tp (line 304) | def _set_tp(self, config: Qwen25VLConfig): method forward (line 328) | def forward( method _apply_residual (line 343) | def _apply_residual(self, out: Tensor, residual: Tensor) -> Tensor: class Qwen25VLModel (line 349) | class Qwen25VLModel(nn.Module): method __init__ (line 350) | def __init__(self, config: Qwen25VLConfig): method forward (line 364) | def forward( class Qwen25VLLMHeadModel (line 377) | class Qwen25VLLMHeadModel(nn.Module): method __init__ (line 378) | def __init__(self, config: Qwen25VLConfig): method to (line 386) | def to(self, dtype: Optional[str] = None): method _apply_lm_head (line 391) | def _apply_lm_head(self, hidden_states: Tensor): method _set_mrope_delta (line 400) | def _set_mrope_delta(self, paged_kv_cache: PagedKVCache, deltas: Tensor): method _get_mrope_delta (line 404) | def _get_mrope_delta(self, paged_kv_cache: PagedKVCache, batch: int) -... method _build_decode_position_ids (line 411) | def _build_decode_position_ids( method prefill (line 425) | def prefill( method decode (line 444) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 452) | def batch_prefill( # pylint: disable=too-many-arguments method batch_forward (line 467) | def batch_forward( # pylint: disable=too-many-arguments method batch_decode (line 482) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 490) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method embed (line 493) | def embed(self, input_ids: Tensor): method create_paged_kv_cache (line 498) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 526) | def get_default_spec(self): FILE: python/mlc_llm/model/qwen2_moe/qwen2_moe_loader.py function huggingface (line 16) | def huggingface(model_config: Qwen2MoeConfig, quantization: Quantization... FILE: python/mlc_llm/model/qwen2_moe/qwen2_moe_model.py class Qwen2MoeConfig (line 23) | class Qwen2MoeConfig(QWen2Config): # pylint: disable=too-many-instance-... class Qwen2MoeMLP (line 37) | class Qwen2MoeMLP(nn.Module): method __init__ (line 38) | def __init__(self, config: Qwen2MoeConfig, intermediate_size: Optional... method forward (line 50) | def forward(self, x: Tensor): class Qwen2MoeSparseMoeBlock (line 56) | class Qwen2MoeSparseMoeBlock(nn.Module): # pylint: disable=too-many-ins... method __init__ (line 59) | def __init__(self, config: Qwen2MoeConfig): method forward (line 90) | def forward(self, x: Tensor): class Qwen2MoeDecoderLayer (line 141) | class Qwen2MoeDecoderLayer(nn.Module): method __init__ (line 142) | def __init__(self, config: Qwen2MoeConfig): method forward (line 193) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 202) | def _apply_residual(self, out, residual): class Qwen2MoeModel (line 208) | class Qwen2MoeModel(nn.Module): method __init__ (line 209) | def __init__(self, config: Qwen2MoeConfig): method forward (line 216) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class Qwen2MoeForCausalLM (line 224) | class Qwen2MoeForCausalLM(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 225) | def __init__(self, config: Qwen2MoeConfig): method to (line 240) | def to(self, dtype: Optional[str] = None): method batch_forward (line 245) | def batch_forward( method embed (line 261) | def embed(self, input_ids: Tensor): method prefill (line 266) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 280) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 289) | def batch_prefill( method batch_decode (line 300) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 304) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 308) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 334) | def get_default_spec(self): FILE: python/mlc_llm/model/qwen3/qwen3_loader.py function huggingface (line 17) | def huggingface( function huggingface_embedding (line 150) | def huggingface_embedding(model_config: Qwen3Config, quantization: Quant... FILE: python/mlc_llm/model/qwen3/qwen3_model.py class Qwen3Config (line 24) | class Qwen3Config(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 47) | def __post_init__(self): class Qwen3Attention (line 109) | class Qwen3Attention(nn.Module): # pylint: disable=too-many-instance-at... method __init__ (line 110) | def __init__(self, config: Qwen3Config): method forward (line 134) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class Qwen3Embedding (line 162) | class Qwen3Embedding(nn.Embedding): method lm_head_forward (line 167) | def lm_head_forward(self, x: nn.Tensor): class Qwen3MLP (line 175) | class Qwen3MLP(nn.Module): method __init__ (line 176) | def __init__(self, config: Qwen3Config): method forward (line 187) | def forward(self, x: Tensor): class Qwen3DecoderLayer (line 193) | class Qwen3DecoderLayer(nn.Module): method __init__ (line 194) | def __init__(self, config: Qwen3Config): method forward (line 230) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 239) | def _apply_residual(self, out, residual): class Qwen3Model (line 245) | class Qwen3Model(nn.Module): method __init__ (line 246) | def __init__(self, config: Qwen3Config): method forward (line 253) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class Qwen3LMHeadModel (line 261) | class Qwen3LMHeadModel(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 262) | def __init__(self, config: Qwen3Config): method to (line 280) | def to(self, dtype: Optional[str] = None): method batch_forward (line 285) | def batch_forward( method embed (line 305) | def embed(self, input_ids: Tensor): method prefill (line 310) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 327) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 339) | def batch_prefill( method batch_decode (line 350) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 354) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 358) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 384) | def get_default_spec(self): class Qwen3EmbeddingModel (line 449) | class Qwen3EmbeddingModel(Qwen3LMHeadModel): method prefill_to_last_hidden_states (line 457) | def prefill_to_last_hidden_states(self, input_embed: Tensor, paged_kv_... method decode_to_last_hidden_states (line 462) | def decode_to_last_hidden_states(self, input_embed: Tensor, paged_kv_c... method batch_prefill_to_last_hidden_states (line 467) | def batch_prefill_to_last_hidden_states( method batch_decode_to_last_hidden_states (line 474) | def batch_decode_to_last_hidden_states( method get_default_spec (line 481) | def get_default_spec(self): FILE: python/mlc_llm/model/qwen3_moe/qwen3_moe_loader.py function huggingface (line 17) | def huggingface(model_config: Qwen3MoeConfig, quantization: Quantization... FILE: python/mlc_llm/model/qwen3_moe/qwen3_moe_model.py class Qwen3MoeConfig (line 23) | class Qwen3MoeConfig(Qwen3Config): # pylint: disable=too-many-instance-... class Qwen3MoeMLP (line 36) | class Qwen3MoeMLP(nn.Module): method __init__ (line 37) | def __init__(self, config: Qwen3MoeConfig, intermediate_size: Optional... method forward (line 49) | def forward(self, x: Tensor): class Qwen3MoeSparseMoeBlock (line 55) | class Qwen3MoeSparseMoeBlock(nn.Module): # pylint: disable=too-many-ins... method __init__ (line 58) | def __init__(self, config: Qwen3MoeConfig): method forward (line 88) | def forward(self, x: Tensor): class Qwen3MoeDecoderLayer (line 146) | class Qwen3MoeDecoderLayer(nn.Module): method __init__ (line 147) | def __init__(self, config: Qwen3MoeConfig): method forward (line 190) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 199) | def _apply_residual(self, out, residual): class Qwen3MoeModel (line 205) | class Qwen3MoeModel(nn.Module): method __init__ (line 206) | def __init__(self, config: Qwen3MoeConfig): method forward (line 213) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class Qwen3MoeForCausalLM (line 221) | class Qwen3MoeForCausalLM(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 222) | def __init__(self, config: Qwen3MoeConfig): method to (line 238) | def to(self, dtype: Optional[str] = None): method batch_forward (line 243) | def batch_forward( method embed (line 259) | def embed(self, input_ids: Tensor): method prefill (line 264) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 278) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 287) | def batch_prefill( method batch_decode (line 298) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 302) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 306) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 332) | def get_default_spec(self): FILE: python/mlc_llm/model/rwkv5/rwkv5_loader.py function huggingface (line 15) | def huggingface(model_config: RWKV5Config, quantization: Quantization) -... FILE: python/mlc_llm/model/rwkv5/rwkv5_model.py class StateID (line 19) | class StateID: class RWKV5Config (line 28) | class RWKV5Config(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 46) | def __post_init__(self): function create_wkv5_func (line 64) | def create_wkv5_func( function token_shift (line 132) | def token_shift(state: Tensor, x: Tensor): function last_token (line 142) | def last_token(x: Tensor): class RWKV5_FNN (line 152) | class RWKV5_FNN(nn.Module): method __init__ (line 153) | def __init__(self, config: RWKV5Config, layer_id: int): method forward (line 162) | def forward(self, x: Tensor, state: RNNState): class RWKV5_Attention (line 175) | class RWKV5_Attention(nn.Module): # pylint: disable=too-many-instance-a... method __init__ (line 178) | def __init__(self, config: RWKV5Config, layer_id: int): method forward (line 203) | def forward(self, x: Tensor, state: RNNState): # pylint: disable=too-... method to (line 253) | def to(self, dtype: Optional[str] = None): class RWKV5_Layer (line 274) | class RWKV5_Layer(nn.Module): method __init__ (line 275) | def __init__(self, config: RWKV5Config, layer_id: int): method forward (line 295) | def forward(self, x: Tensor, state: RNNState) -> Tensor: class RWKV5_Model (line 307) | class RWKV5_Model(nn.Module): method __init__ (line 310) | def __init__(self, config: RWKV5Config): method forward (line 321) | def forward(self, input_embed: Tensor, state: RNNState): class RWKV5_ForCausalLM (line 329) | class RWKV5_ForCausalLM(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 332) | def __init__(self, config: RWKV5Config): method to (line 341) | def to(self, dtype: Optional[str] = None): method embed (line 346) | def embed(self, input_ids: Tensor): method forward (line 349) | def forward( method prefill (line 365) | def prefill(self, input_embed: Tensor, state: RNNState): method decode (line 369) | def decode(self, input_embed: Tensor, state: RNNState): method batch_prefill (line 373) | def batch_prefill(self, input_embeds: Tensor, logit_positions: Tensor,... method batch_decode (line 377) | def batch_decode(self, input_embeds: Tensor, state: RNNState): method batch_verify (line 381) | def batch_verify(self, input_embeds: Tensor, state: RNNState): method create_rnn_state (line 385) | def create_rnn_state( method get_default_spec (line 403) | def get_default_spec(self): FILE: python/mlc_llm/model/rwkv6/rwkv6_loader.py function huggingface (line 13) | def huggingface(model_config: RWKV6Config, quantization: Quantization) -... FILE: python/mlc_llm/model/rwkv6/rwkv6_model.py class StateID (line 19) | class StateID: class RWKV6Config (line 28) | class RWKV6Config(ConfigBase): # pylint: disable=too-many-instance-attr... method __post_init__ (line 46) | def __post_init__(self): function create_wkv6_func (line 64) | def create_wkv6_func( function token_shift (line 129) | def token_shift(state: Tensor, x: Tensor): function last_token (line 139) | def last_token(x: Tensor): function unbind_to_five (line 148) | def unbind_to_five(x: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor, T... class RWKV6_FNN (line 163) | class RWKV6_FNN(nn.Module): method __init__ (line 164) | def __init__(self, config: RWKV6Config, layer_id: int): method forward (line 173) | def forward(self, x: Tensor, state: RNNState): class RWKV6_Attention (line 190) | class RWKV6_Attention(nn.Module): # pylint: disable=too-many-instance-a... method __init__ (line 193) | def __init__(self, config: RWKV6Config, layer_id: int): method forward (line 229) | def forward(self, x: Tensor, state: RNNState): # pylint: disable=too-... method to (line 298) | def to(self, dtype: Optional[str] = None): class RWKV6_Layer (line 325) | class RWKV6_Layer(nn.Module): method __init__ (line 326) | def __init__(self, config: RWKV6Config, layer_id: int): method forward (line 346) | def forward(self, x: Tensor, state: RNNState) -> Tensor: class RWKV6_Model (line 358) | class RWKV6_Model(nn.Module): method __init__ (line 361) | def __init__(self, config: RWKV6Config): method forward (line 372) | def forward(self, input_embed: Tensor, state: RNNState): class RWKV6_ForCausalLM (line 380) | class RWKV6_ForCausalLM(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 383) | def __init__(self, config: RWKV6Config): method to (line 393) | def to(self, dtype: Optional[str] = None): method embed (line 398) | def embed(self, input_ids: Tensor): method forward (line 401) | def forward( method prefill (line 417) | def prefill(self, input_embed: Tensor, state: RNNState): method decode (line 421) | def decode(self, input_embed: Tensor, state: RNNState): method batch_prefill (line 425) | def batch_prefill(self, input_embeds: Tensor, logit_positions: Tensor,... method batch_decode (line 429) | def batch_decode(self, input_embeds: Tensor, state: RNNState): method batch_verify (line 433) | def batch_verify(self, input_embeds: Tensor, state: RNNState): method create_rnn_state (line 437) | def create_rnn_state( method get_default_spec (line 455) | def get_default_spec(self): FILE: python/mlc_llm/model/stable_lm/stablelm_model.py class StableLmConfig (line 23) | class StableLmConfig(ConfigBase): # pylint: disable=too-many-instance-a... method __post_init__ (line 43) | def __post_init__(self): class StableLmAttention (line 84) | class StableLmAttention(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 85) | def __init__(self, config: StableLmConfig): method forward (line 107) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class StableLmMLP (line 122) | class StableLmMLP(nn.Module): method __init__ (line 123) | def __init__(self, config: StableLmConfig): method forward (line 137) | def forward(self, x: Tensor): class StableLmDecoderLayer (line 143) | class StableLmDecoderLayer(nn.Module): method __init__ (line 144) | def __init__(self, config: StableLmConfig): method forward (line 179) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 186) | def _apply_residual(self, out, residual): class StableLmModel (line 192) | class StableLmModel(nn.Module): method __init__ (line 193) | def __init__(self, config: StableLmConfig): method forward (line 201) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class StableLmForCausalLM (line 209) | class StableLmForCausalLM(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 210) | def __init__(self, config: StableLmConfig): method to (line 225) | def to(self, dtype: Optional[str] = None): method batch_forward (line 230) | def batch_forward( method embed (line 246) | def embed(self, input_ids: Tensor): method prefill (line 251) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 265) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 274) | def batch_prefill( method batch_decode (line 285) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 289) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 293) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 320) | def get_default_spec(self): FILE: python/mlc_llm/model/starcoder2/starcoder2_loader.py function huggingface (line 16) | def huggingface(model_config: Starcoder2Config, quantization: Quantizati... FILE: python/mlc_llm/model/starcoder2/starcoder2_model.py class Starcoder2Config (line 23) | class Starcoder2Config(ConfigBase): # pylint: disable=too-many-instance... method __post_init__ (line 46) | def __post_init__(self): class Starcoder2Attention (line 87) | class Starcoder2Attention(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 88) | def __init__(self, config: Starcoder2Config): method forward (line 115) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... class Starcoder2MLP (line 130) | class Starcoder2MLP(nn.Module): method __init__ (line 131) | def __init__(self, config: Starcoder2Config): method forward (line 147) | def forward(self, hidden_states: Tensor): class Starcoder2DecoderLayer (line 154) | class Starcoder2DecoderLayer(nn.Module): method __init__ (line 155) | def __init__(self, config: Starcoder2Config): method forward (line 200) | def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache,... method _apply_residual (line 207) | def _apply_residual(self, out, residual): class Starcoder2Model (line 213) | class Starcoder2Model(nn.Module): method __init__ (line 214) | def __init__(self, config: Starcoder2Config): method forward (line 222) | def forward(self, inputs: Tensor, paged_kv_cache: PagedKVCache): class Starcoder2ForCausalLM (line 230) | class Starcoder2ForCausalLM(nn.Module): # pylint: disable=too-many-inst... method __init__ (line 231) | def __init__(self, config: Starcoder2Config): method to (line 245) | def to(self, dtype: Optional[str] = None): method batch_forward (line 250) | def batch_forward( method embed (line 266) | def embed(self, input_ids: Tensor): method prefill (line 271) | def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method decode (line 285) | def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): method batch_prefill (line 294) | def batch_prefill( method batch_decode (line 305) | def batch_decode(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method batch_verify (line 309) | def batch_verify(self, input_embeds: Tensor, paged_kv_cache: PagedKVCa... method create_paged_kv_cache (line 313) | def create_paged_kv_cache( # pylint: disable=too-many-arguments method get_default_spec (line 339) | def get_default_spec(self): FILE: python/mlc_llm/model/vision/clip_vision.py class CLIPVisionConfig (line 30) | class CLIPVisionConfig(ConfigBase): # pylint: disable=too-many-instance... class CLIPVisionEmbeddings (line 49) | class CLIPVisionEmbeddings(Module): # pylint: disable=too-many-instance... method __init__ (line 50) | def __init__(self, config: CLIPVisionConfig): method forward (line 69) | def forward(self, pixel_values: Tensor) -> Tensor: function sigmoid (line 94) | def sigmoid(x: Tensor, name: str = "sigmoid") -> Tensor: class QuickGELU (line 112) | class QuickGELU(Module): method forward (line 113) | def forward(self, input_tensor: Tensor) -> Tensor: class CLIPMLP (line 117) | class CLIPMLP(Module): method __init__ (line 118) | def __init__(self, config: CLIPVisionConfig): method forward (line 124) | def forward(self, hidden_states: Tensor) -> Tensor: class CLIPAttention (line 131) | class CLIPAttention(Module): # pylint: disable=too-many-instance-attrib... method __init__ (line 132) | def __init__(self, config: CLIPVisionConfig): method forward (line 148) | def forward( class CLIPEncoderLayer (line 164) | class CLIPEncoderLayer(Module): method __init__ (line 165) | def __init__(self, config: CLIPVisionConfig): method forward (line 173) | def forward(self, hidden_states: Tensor) -> Tensor: class CLIPEncoder (line 187) | class CLIPEncoder(Module): method __init__ (line 188) | def __init__(self, config: CLIPVisionConfig): method forward (line 194) | def forward(self, inputs_embeds: Tensor) -> Tensor: class CLIPVisionTransformer (line 205) | class CLIPVisionTransformer(Module): method __init__ (line 206) | def __init__(self, config: CLIPVisionConfig): method forward (line 214) | def forward(self, pixel_values: Tensor) -> Tensor: class CLIPVisionModel (line 221) | class CLIPVisionModel(Module): method __init__ (line 224) | def __init__(self, config: CLIPVisionConfig): method forward (line 228) | def forward(self, pixel_values: Tensor) -> Tensor: FILE: python/mlc_llm/model/vision/image_processing.py function _var (line 10) | def _var(dtype, size=1): class ImageProcessor (line 15) | class ImageProcessor(Module): method __init__ (line 16) | def __init__(self): method apply_schedule (line 20) | def apply_schedule(self, sch, block, bdx=32, tile=[32, 32]): method resize (line 30) | def resize(self, image: Tensor, params): # image layout:NCHW method crop (line 95) | def crop(self, image: Tensor, crop_size): method rescale (line 148) | def rescale(self, image: Tensor, rescale_factor=1 / 255.0, o_dtype="fl... method normalize (line 187) | def normalize(self, image: Tensor, o_dtype="float32"): method pad (line 238) | def pad(self, image: Tensor, dtype="uint8"): method preprocess (line 285) | def preprocess(self, pixel_values): FILE: python/mlc_llm/nn/expert.py class MixtralExperts (line 9) | class MixtralExperts(nn.Module): method __init__ (line 12) | def __init__(self, num_local_experts, in_features, out_features, tenso... method forward (line 20) | def forward(self, x: Tensor, indptr: Tensor): # pylint: disable=inval... FILE: python/mlc_llm/nn/kv_cache.py class PagedKVCache (line 14) | class PagedKVCache(TVMPagedKVCache): # pylint: disable=too-few-public-m... method create_generic (line 18) | def create_generic( # pylint: disable=too-many-locals FILE: python/mlc_llm/nn/rnn_state.py class RNNState (line 11) | class RNNState(Object): method create (line 15) | def create( method get (line 69) | def get( method set (line 111) | def set(self, layer_id: int, state_id: int, value: Tensor) -> "RNNState": method create_get_func (line 139) | def create_get_func( method create_set_func (line 236) | def create_set_func( FILE: python/mlc_llm/op/attention.py function attention (line 18) | def attention( # pylint: disable=invalid-name,too-many-locals,too-many-... FILE: python/mlc_llm/op/batch_matmul.py function quantized_bmm (line 11) | def quantized_bmm( FILE: python/mlc_llm/op/batch_spec_verify.py function batch_spec_verify (line 10) | def batch_spec_verify(vocab_size): FILE: python/mlc_llm/op/cutlass.py function group_gemm (line 9) | def group_gemm( function fp8_gemm (line 82) | def fp8_gemm( function fp8_groupwise_scaled_gemm (line 140) | def fp8_groupwise_scaled_gemm( # pylint: disable=too-many-arguments function fp8_groupwise_scaled_bmm (line 211) | def fp8_groupwise_scaled_bmm( # pylint: disable=too-many-arguments function fp8_groupwise_scaled_group_gemm (line 283) | def fp8_groupwise_scaled_group_gemm( # pylint: disable=too-many-argumen... FILE: python/mlc_llm/op/extern.py class ExternModuleStore (line 25) | class ExternModuleStore: function enable (line 40) | def enable(target: Target, flashinfer: bool, faster_transformer: bool, c... function get_store (line 59) | def get_store() -> ExternModuleStore: function configure (line 64) | def configure() -> None: FILE: python/mlc_llm/op/ft_gemm.py function faster_transformer_dequantize_gemm (line 11) | def faster_transformer_dequantize_gemm( # pylint: disable=too-many-argu... function faster_transformer_moe_gemm (line 96) | def faster_transformer_moe_gemm( # pylint: disable=too-many-arguments FILE: python/mlc_llm/op/moe_matmul.py function gemv (line 13) | def gemv(x: Tensor, w: Tensor, indptr: Tensor) -> Tensor: function dequantize_gemv (line 78) | def dequantize_gemv( # pylint: disable=too-many-arguments function dequantize_float8_gemv (line 180) | def dequantize_float8_gemv( function dequantize_block_scale_float8_gemv (line 299) | def dequantize_block_scale_float8_gemv( function group_gemm (line 386) | def group_gemm(x: Tensor, w: Tensor, indptr: Tensor): # pylint: disable... function dequantize_group_gemm (line 565) | def dequantize_group_gemm( FILE: python/mlc_llm/op/moe_misc.py function moe_sum (line 15) | def moe_sum(x: Tensor, dim: int) -> Tensor: function _gating_topk_init_local_top_k (line 36) | def _gating_topk_init_local_top_k(k_val, dtype, local_top_k, local_top_k... function _gating_topk_process_value (line 43) | def _gating_topk_process_value( # pylint: disable=too-many-arguments function gating_topk (line 66) | def gating_topk(scores: Tensor, k: int) -> Tuple[Tensor, Tensor]: function gating_softmax_topk (line 138) | def gating_softmax_topk( # pylint: disable=too-many-statements function group_limited_greedy_topk (line 243) | def group_limited_greedy_topk( # pylint: disable=too-many-arguments function moe_cumsum (line 376) | def moe_cumsum(expert_indices: Tensor, num_local_experts: int) -> Tensor: function get_indices (line 458) | def get_indices(cumsum: Tensor, expert_indices: Tensor) -> Tuple[Tensor,... function get_indptr (line 533) | def get_indptr( function scatter_output (line 613) | def scatter_output(x: Tensor, indices: Tensor) -> Tensor: FILE: python/mlc_llm/op/mrope.py function _rotate_half (line 14) | def _rotate_half(x: Tensor) -> Tensor: function _repeat_mrope_section (line 21) | def _repeat_mrope_section(section: Sequence[int]) -> Tuple[int, ...]: function _split_indices_from_sizes (line 29) | def _split_indices_from_sizes(sizes: Sequence[int]) -> List[int]: function _reorder_cos_sin (line 39) | def _reorder_cos_sin( class MultimodalRotaryEmbedding (line 58) | class MultimodalRotaryEmbedding(nn.Module): method __init__ (line 61) | def __init__( method forward (line 78) | def forward(self, reference: Tensor, position_ids: Tensor) -> Tuple[Te... function apply_multimodal_rotary_pos_emb (line 122) | def apply_multimodal_rotary_pos_emb( # pylint: disable=too-many-arguments class VisionPositionMetadata (line 145) | class VisionPositionMetadata: method merged_hw (line 154) | def merged_hw(self, height: int, width: int) -> Tuple[int, int]: function _text_chunk (line 165) | def _text_chunk(length: int, offset: int) -> np.ndarray: function _grid_chunk (line 175) | def _grid_chunk( # pylint: disable=too-many-arguments function _find_token_index (line 197) | def _find_token_index(tokens: Sequence[int], token_id: int, start: int) ... function _next_chunk_offset (line 204) | def _next_chunk_offset(chunks: Sequence[np.ndarray]) -> int: function _count_vision_items (line 210) | def _count_vision_items( function _next_vision_block (line 224) | def _next_vision_block( function _load_grid_for_block (line 239) | def _load_grid_for_block( # pylint: disable=too-many-arguments function _build_sequence_position_ids (line 262) | def _build_sequence_position_ids( # pylint: disable=too-many-arguments,... function _text_only_position_ids (line 345) | def _text_only_position_ids( function get_mrope_position_ids (line 363) | def get_mrope_position_ids( # pylint: disable=too-many-arguments,too-ma... FILE: python/mlc_llm/op/pipeline_parallel.py function pipeline_stage_boundary (line 9) | def pipeline_stage_boundary(*tensors: Tensor) -> List[Tensor]: FILE: python/mlc_llm/op/top_p_pivot.py function top_p_pivot (line 13) | def top_p_pivot(pN, target: tvm.target.Target): function top_p_renorm (line 270) | def top_p_renorm(target: tvm.target.Target = None): FILE: python/mlc_llm/op/triton.py function _get_triton_w8a8_block_fp8_gemm (line 22) | def _get_triton_w8a8_block_fp8_gemm(): function _get_triton_w8a8_block_fp8_group_gemm (line 113) | def _get_triton_w8a8_block_fp8_group_gemm(): function get_tir_w8a8_block_fp8_matmul (line 273) | def get_tir_w8a8_block_fp8_matmul( # pylint: disable=too-many-arguments... function get_tir_w8a8_block_fp8_group_matmul (line 373) | def get_tir_w8a8_block_fp8_group_matmul( # pylint: disable=too-many-arg... function _compute_expert_id_per_block (line 497) | def _compute_expert_id_per_block( function fp8_groupwise_scaled_gemm (line 568) | def fp8_groupwise_scaled_gemm( # pylint: disable=too-many-arguments,too... function fp8_groupwise_scaled_group_gemm (line 668) | def fp8_groupwise_scaled_group_gemm( # pylint: disable=too-many-argumen... FILE: python/mlc_llm/protocol/conversation_protocol.py class MessagePlaceholders (line 10) | class MessagePlaceholders(Enum): class Conversation (line 23) | class Conversation(BaseModel): method __init__ (line 85) | def __init__(self, role_templates: Optional[Dict[str, str]] = None, **... method check_message_seps (line 98) | def check_message_seps(cls, seps: List[str]) -> List[str]: method to_json_dict (line 104) | def to_json_dict(self) -> Dict[str, Any]: method from_json_dict (line 109) | def from_json_dict(cls: Type[T], json_dict: Dict[str, Any]) -> T: method as_prompt (line 114) | def as_prompt(self, config=None) -> List[Any]: function _get_url_from_item (line 199) | def _get_url_from_item(item: Dict) -> str: function _combine_consecutive_messages (line 217) | def _combine_consecutive_messages(messages: List[Any]) -> List[Any]: FILE: python/mlc_llm/protocol/debug_protocol.py class DisaggConfig (line 8) | class DisaggConfig(BaseModel): class DebugConfig (line 29) | class DebugConfig(BaseModel): FILE: python/mlc_llm/protocol/error_protocol.py class BadRequestError (line 10) | class BadRequestError(ValueError): method __init__ (line 13) | def __init__(self, *args: object) -> None: class ErrorResponse (line 17) | class ErrorResponse(BaseModel): function create_error_response (line 25) | def create_error_response(status_code: HTTPStatus, message: str) -> fast... function bad_request_error_handler (line 33) | async def bad_request_error_handler(_request: fastapi.Request, e: BadReq... FILE: python/mlc_llm/protocol/generation_config.py class GenerationConfig (line 12) | class GenerationConfig(BaseModel): # pylint: FILE: python/mlc_llm/protocol/microserving_protocol.py class PrepRecvRequest (line 8) | class PrepRecvRequest(CompletionRequest): class PrepRecvResponse (line 22) | class PrepRecvResponse(BaseModel): class RemoteSendRequest (line 39) | class RemoteSendRequest(CompletionRequest): class StartGenerateRequest (line 63) | class StartGenerateRequest(CompletionRequest): FILE: python/mlc_llm/protocol/mlc_chat_config.py class MLCChatConfig (line 25) | class MLCChatConfig(BaseModel): method get_system_defaults_for_missing_fields (line 66) | def get_system_defaults_for_missing_fields(self) -> Dict[str, Any]: FILE: python/mlc_llm/protocol/openai_api_protocol.py class ListResponse (line 27) | class ListResponse(BaseModel): class TopLogProbs (line 32) | class TopLogProbs(BaseModel): class LogProbsContent (line 38) | class LogProbsContent(BaseModel): class LogProbs (line 45) | class LogProbs(BaseModel): class CompletionLogProbs (line 49) | class CompletionLogProbs(BaseModel): class CompletionUsage (line 58) | class CompletionUsage(BaseModel): class StreamOptions (line 67) | class StreamOptions(BaseModel): class EmbeddingRequest (line 74) | class EmbeddingRequest(BaseModel): method validate_input (line 87) | def validate_input(cls, v): class EmbeddingObject (line 98) | class EmbeddingObject(BaseModel): class EmbeddingUsage (line 104) | class EmbeddingUsage(BaseModel): class EmbeddingResponse (line 109) | class EmbeddingResponse(BaseModel): class ModelResponse (line 123) | class ModelResponse(BaseModel): class RequestResponseFormat (line 137) | class RequestResponseFormat(BaseModel): class CompletionRequest (line 146) | class CompletionRequest(BaseModel): method check_penalty_range (line 174) | def check_penalty_range(cls, penalty_value: Optional[float]) -> Option... method check_logit_bias (line 182) | def check_logit_bias( method check_logprobs (line 197) | def check_logprobs(self) -> "CompletionRequest": class CompletionResponseChoice (line 206) | class CompletionResponseChoice(BaseModel): class CompletionResponse (line 213) | class CompletionResponse(BaseModel): class ChatFunction (line 229) | class ChatFunction(BaseModel): class ChatTool (line 235) | class ChatTool(BaseModel): class ChatFunctionCall (line 240) | class ChatFunctionCall(BaseModel): class ChatToolCall (line 245) | class ChatToolCall(BaseModel): class ChatCompletionMessage (line 251) | class ChatCompletionMessage(BaseModel): class ChatCompletionRequest (line 259) | class ChatCompletionRequest(BaseModel): method check_penalty_range (line 289) | def check_penalty_range(cls, penalty_value: Optional[float]) -> Option... method check_logit_bias (line 297) | def check_logit_bias( method check_logprobs (line 312) | def check_logprobs(self) -> "ChatCompletionRequest": method check_stream_options (line 323) | def check_stream_options(self) -> "ChatCompletionRequest": method check_debug_config (line 332) | def check_debug_config(self) -> "ChatCompletionRequest": method check_message_validity (line 348) | def check_message_validity(self) -> None: method check_function_call_usage (line 366) | def check_function_call_usage(self, conv_template: Conversation) -> None: class ChatCompletionResponseChoice (line 415) | class ChatCompletionResponseChoice(BaseModel): class ChatCompletionStreamResponseChoice (line 422) | class ChatCompletionStreamResponseChoice(BaseModel): class ChatCompletionResponse (line 429) | class ChatCompletionResponse(BaseModel): class ChatCompletionStreamResponse (line 443) | class ChatCompletionStreamResponse(BaseModel): function openai_api_get_unsupported_fields (line 460) | def openai_api_get_unsupported_fields( FILE: python/mlc_llm/quantization/awq_quantization.py function _make_divisible (line 15) | def _make_divisible(c, divisor): # pylint: disable=invalid-name function _calculate_zeros_width (line 19) | def _calculate_zeros_width(in_features, group_size=128, pack_num=8): class AWQQuantize (line 35) | class AWQQuantize: # pylint: disable=too-many-instance-attributes method __post_init__ (line 53) | def __post_init__(self): method quantize_model (line 70) | def quantize_model( method _dequantize (line 133) | def _dequantize( class AWQQuantizeLinear (line 175) | class AWQQuantizeLinear(nn.Module): # pylint: disable=too-many-instance... method __init__ (line 178) | def __init__( # pylint: disable=too-many-arguments method from_linear (line 213) | def from_linear(linear: nn.Linear, config: AWQQuantize) -> "AWQQuantiz... method forward (line 238) | def forward(self, x: nn.Tensor) -> nn.Tensor: # pylint: disable=inval... method to (line 271) | def to(self, dtype: Optional[str] = None) -> None: FILE: python/mlc_llm/quantization/block_scale_quantization.py class BlockScaleQuantize (line 23) | class BlockScaleQuantize: # pylint: disable=too-many-instance-attributes method __post_init__ (line 34) | def __post_init__(self): method quantize_model (line 47) | def quantize_model( class BlockScaleQuantizeLinear (line 181) | class BlockScaleQuantizeLinear(nn.Module): # pylint: disable=too-many-i... method __init__ (line 184) | def __init__( # pylint: disable=too-many-arguments method from_linear (line 214) | def from_linear( method forward (line 259) | def forward(self, x: nn.Tensor) -> nn.Tensor: method to (line 323) | def to(self, dtype: Optional[str] = None) -> None: class BlockScaleQuantizeLinearStaticActivation (line 334) | class BlockScaleQuantizeLinearStaticActivation(BlockScaleQuantizeLinear): method __init__ (line 337) | def __init__( # pylint: disable=too-many-arguments method from_linear (line 360) | def from_linear( method forward (line 410) | def forward(self, x: nn.Tensor) -> nn.Tensor: class BlockScaleQuantizeMixtralExperts (line 460) | class BlockScaleQuantizeMixtralExperts(nn.Module): # pylint: disable=to... method __init__ (line 463) | def __init__( # pylint: disable=too-many-arguments method from_mixtral_experts (line 488) | def from_mixtral_experts( method forward (line 533) | def forward(self, x: nn.Tensor, indptr: nn.Tensor) -> nn.Tensor: method to (line 594) | def to(self, dtype: Optional[str] = None) -> None: function rowwise_group_quant_fp8 (line 603) | def rowwise_group_quant_fp8( # pylint: disable=too-many-arguments function static_activation_group_quant_fp8 (line 701) | def static_activation_group_quant_fp8( function broadcast_activation_scale (line 735) | def broadcast_activation_scale( function dequantize_float8_groupwise_scaled_gemv (line 752) | def dequantize_float8_groupwise_scaled_gemv( FILE: python/mlc_llm/quantization/fp8_quantization.py class FP8PerTensorQuantizeMixtralExperts (line 14) | class FP8PerTensorQuantizeMixtralExperts( method __init__ (line 19) | def __init__( method from_mixtral_experts (line 32) | def from_mixtral_experts( method forward (line 72) | def forward(self, x: nn.Tensor, indptr: nn.Tensor) -> nn.Tensor: # py... FILE: python/mlc_llm/quantization/ft_quantization.py class FTQuantize (line 29) | class FTQuantize: # pylint: disable=too-many-instance-attributes method fallback_group_quantize (line 42) | def fallback_group_quantize(self) -> GroupQuantize: method __post_init__ (line 61) | def __post_init__(self): method quantize_model (line 76) | def quantize_model( method quantize_weight (line 171) | def quantize_weight(self, weight: Tensor) -> List[Tensor]: method _quantize (line 256) | def _quantize( # pylint: disable=too-many-locals class FTQuantizeLinear (line 325) | class FTQuantizeLinear(nn.Module): # pylint: disable=too-many-instance-... method __init__ (line 328) | def __init__( # pylint: disable=too-many-arguments method from_linear (line 357) | def from_linear(src: nn.Linear, config: FTQuantize) -> "FTQuantizeLine... method forward (line 385) | def forward(self, x: nn.Tensor) -> nn.Tensor: # pylint: disable=inval... method to (line 403) | def to(self, dtype: Optional[str] = None) -> None: FILE: python/mlc_llm/quantization/group_quantization.py class GroupQuantize (line 28) | class GroupQuantize: # pylint: disable=too-many-instance-attributes method __post_init__ (line 46) | def __post_init__(self): method quantize_model (line 65) | def quantize_model( method _dequantize (line 155) | def _dequantize( method quantize_weight (line 188) | def quantize_weight( method _quantize (line 237) | def _quantize( # pylint: disable=too-many-locals class GroupQuantizeLinear (line 311) | class GroupQuantizeLinear(nn.Module): # pylint: disable=too-many-instan... method __init__ (line 314) | def __init__( # pylint: disable=too-many-arguments method from_linear (line 358) | def from_linear(src: nn.Linear, config: GroupQuantize) -> "GroupQuanti... method forward (line 392) | def forward(self, x: nn.Tensor) -> nn.Tensor: # pylint: disable=inval... method to (line 441) | def to(self, dtype: Optional[str] = None) -> None: class GroupQuantizeEmbedding (line 454) | class GroupQuantizeEmbedding(nn.Module): method __init__ (line 457) | def __init__(self, num: Union[int, tir.Var], dim: int, config: GroupQu... method from_embedding (line 468) | def from_embedding(embedding: nn.Embedding, config: GroupQuantize) -> ... method forward (line 488) | def forward(self, x: nn.Tensor): # pylint: disable=invalid-name method lm_head_forward (line 526) | def lm_head_forward(self, x: nn.Tensor): class GroupQuantizeMixtralExperts (line 561) | class GroupQuantizeMixtralExperts(nn.Module): # pylint: disable=too-man... method __init__ (line 564) | def __init__( method from_mixtral_experts (line 590) | def from_mixtral_experts( method forward (line 621) | def forward(self, x: nn.Tensor, indptr: nn.Tensor) -> nn.Tensor: # py... FILE: python/mlc_llm/quantization/model_quantization.py function make_quantization_functions (line 20) | def make_quantization_functions( # pylint: disable=too-many-arguments, ... function make_awq_quant (line 139) | def make_awq_quant( FILE: python/mlc_llm/quantization/no_quantization.py class NoQuantize (line 7) | class NoQuantize: # pylint: disable=too-many-instance-attributes method __post_init__ (line 14) | def __post_init__(self): FILE: python/mlc_llm/quantization/per_tensor_quantization.py class PerTensorQuantize (line 30) | class PerTensorQuantize: # pylint: disable=too-many-instance-attributes method __post_init__ (line 53) | def __post_init__(self): method quantize_model (line 61) | def quantize_model( method quantize_weight (line 169) | def quantize_weight(self, weight) -> List[Tensor]: method quantize_float8 (line 221) | def quantize_float8( # pylint: disable=too-many-locals method _dequantize (line 286) | def _dequantize( method dequantize_float8 (line 301) | def dequantize_float8( class PerTensorQuantizeLinear (line 326) | class PerTensorQuantizeLinear(nn.Module): # pylint: disable=too-many-in... method __init__ (line 329) | def __init__( # pylint: disable=too-many-arguments method from_linear (line 363) | def from_linear( method forward (line 402) | def forward(self, x: nn.Tensor) -> nn.Tensor: # pylint: disable=inval... method to (line 491) | def to(self, dtype: Optional[str] = None) -> None: class PerTensorQuantizeEmbedding (line 505) | class PerTensorQuantizeEmbedding(nn.Module): method __init__ (line 508) | def __init__(self, num: Union[int, tir.Var], dim: int, config: PerTens... method from_embedding (line 521) | def from_embedding( method forward (line 543) | def forward(self, x: nn.Tensor): # pylint: disable=invalid-name method lm_head_forward (line 576) | def lm_head_forward(self, x: nn.Tensor): class PerTensorQuantizeMixtralExperts (line 606) | class PerTensorQuantizeMixtralExperts(nn.Module): # pylint: disable=too... method __init__ (line 611) | def __init__( method from_mixtral_experts (line 641) | def from_mixtral_experts( method forward (line 675) | def forward(self, x: nn.Tensor, indptr: nn.Tensor) -> nn.Tensor: # py... FILE: python/mlc_llm/quantization/utils.py function convert_uint_to_float (line 14) | def convert_uint_to_float( # pylint: disable=too-many-arguments function is_final_fc (line 50) | def is_final_fc(name: str) -> bool: function is_moe_gate (line 56) | def is_moe_gate(name: str, node: nn.Linear) -> bool: function compile_quantize_func (line 61) | def compile_quantize_func(mod: IRModule, device) -> Callable: function apply_sharding (line 86) | def apply_sharding(shard_strategy, name: str, weight: nn.Parameter): function convert_uint_packed_fp8_to_float (line 98) | def convert_uint_packed_fp8_to_float( # pylint: disable=too-many-arguments function pack_weight (line 137) | def pack_weight( FILE: python/mlc_llm/router/router.py class Router (line 17) | class Router: # pylint: disable=too-many-instance-attributes method __init__ (line 20) | def __init__( method terminate (line 106) | def terminate(self): method handle_completion (line 111) | async def handle_completion( method translate_request (line 133) | async def translate_request( method _pick_endpoint (line 150) | def _pick_endpoint(self, endpoint_ids: Iterable[int]) -> int: method _handle_completion_round_robin (line 161) | async def _handle_completion_round_robin( method _handle_completion_disagg (line 221) | async def _handle_completion_disagg( # pylint: disable=too-many-locals method send_prepare_receive (line 312) | async def send_prepare_receive( method send_remote_send (line 341) | async def send_remote_send( method send_start_generate (line 360) | async def send_start_generate( FILE: python/mlc_llm/serve/config.py class EngineConfig (line 9) | class EngineConfig: # pylint: disable=too-many-instance-attributes method asjson (line 162) | def asjson(self) -> str: method from_json (line 167) | def from_json(json_str: str) -> "EngineConfig": FILE: python/mlc_llm/serve/data.py class Data (line 14) | class Data(Object): # pylint: disable=too-few-public-methods method __init__ (line 17) | def __init__(self): # pylint: disable=super-init-not-called class TextData (line 22) | class TextData(Data): method __init__ (line 31) | def __init__(self, text: str): method text (line 35) | def text(self) -> str: method __str__ (line 39) | def __str__(self) -> str: class TokenData (line 44) | class TokenData(Data): # pylint: disable=too-few-public-methods method __init__ (line 53) | def __init__(self, token_ids: List[int]): method token_ids (line 57) | def token_ids(self) -> List[int]: class ImageData (line 64) | class ImageData(Data): method __init__ (line 73) | def __init__(self, image: Tensor, embed_size: int): method image (line 78) | def image(self) -> Tensor: method __len__ (line 82) | def __len__(self): method from_url (line 87) | def from_url(url: str, config: Dict) -> "ImageData": method get_embed_size (line 120) | def get_embed_size(config: Dict) -> int: method get_input_size (line 128) | def get_input_size(config: Dict) -> int: class SingleRequestStreamOutput (line 135) | class SingleRequestStreamOutput: class RequestStreamOutput (line 161) | class RequestStreamOutput(Object): # pylint: disable=too-few-public-met... method unpack (line 178) | def unpack(self) -> Tuple[str, List[SingleRequestStreamOutput]]: FILE: python/mlc_llm/serve/embedding_engine.py class AsyncEmbeddingEngine (line 19) | class AsyncEmbeddingEngine: # pylint: disable=too-many-instance-attributes method __init__ (line 43) | def __init__( # pylint: disable=too-many-branches method _init_encoder (line 89) | def _init_encoder(self, model: str) -> None: method _init_decoder (line 115) | def _init_decoder(self, model: str) -> None: method embed (line 164) | def embed(self, inputs: List[str]) -> Tuple[List[List[float]], int]: method async_embed (line 183) | async def async_embed(self, inputs: List[str]) -> Tuple[List[List[floa... method _embed_encoder (line 203) | def _embed_encoder( # pylint: disable=too-many-locals method _embed_decoder (line 275) | def _embed_decoder(self, inputs: List[str]) -> Tuple[List[List[float]]... method _build_sub_batches (line 332) | def _build_sub_batches( method _batch_embed_decoder (line 362) | def _batch_embed_decoder( # pylint: disable=too-many-arguments,too-ma... method _sequential_embed_decoder (line 422) | def _sequential_embed_decoder( # pylint: disable=too-many-arguments,t... method terminate (line 479) | def terminate(self) -> None: method __del__ (line 486) | def __del__(self): FILE: python/mlc_llm/serve/engine.py class AsyncChat (line 37) | class AsyncChat: # pylint: disable=too-few-public-methods method __init__ (line 40) | def __init__(self, engine: weakref.ReferenceType) -> None: class Chat (line 45) | class Chat: # pylint: disable=too-few-public-methods method __init__ (line 48) | def __init__(self, engine: weakref.ReferenceType) -> None: class AsyncChatCompletion (line 53) | class AsyncChatCompletion: # pylint: disable=too-few-public-methods method __init__ (line 61) | def __init__(self, engine: weakref.ReferenceType) -> None: method create (line 65) | async def create( # pylint: disable=too-many-arguments,too-many-locals method create (line 120) | async def create( # pylint: disable=too-many-arguments,too-many-locals method create (line 174) | async def create( # pylint: disable=too-many-arguments,too-many-locals class ChatCompletion (line 250) | class ChatCompletion: # pylint: disable=too-few-public-methods method __init__ (line 258) | def __init__(self, engine: weakref.ReferenceType) -> None: method create (line 262) | def create( # pylint: disable=too-many-arguments,too-many-locals method create (line 317) | def create( # pylint: disable=too-many-arguments,too-many-locals method create (line 369) | def create( # pylint: disable=too-many-arguments,too-many-locals class AsyncCompletion (line 445) | class AsyncCompletion: # pylint: disable=too-few-public-methods method __init__ (line 453) | def __init__(self, engine: weakref.ReferenceType) -> None: method create (line 457) | async def create( # pylint: disable=too-many-arguments,too-many-locals method create (line 512) | async def create( # pylint: disable=too-many-arguments,too-many-locals method create (line 564) | async def create( # pylint: disable=too-many-arguments,too-many-locals class Completion (line 640) | class Completion: # pylint: disable=too-few-public-methods method __init__ (line 648) | def __init__(self, engine: weakref.ReferenceType) -> None: method create (line 652) | def create( # pylint: disable=too-many-arguments,too-many-locals method create (line 707) | def create( # pylint: disable=too-many-arguments,too-many-locals method create (line 759) | def create( # pylint: disable=too-many-arguments,too-many-locals class AsyncMLCEngine (line 835) | class AsyncMLCEngine(engine_base.MLCEngineBase): method __init__ (line 885) | def __init__( # pylint: disable=too-many-arguments,too-many-locals method abort (line 907) | async def abort(self, request_id: str) -> None: method metrics (line 917) | async def metrics(self) -> engine_base.EngineMetrics: method _chat_completion (line 928) | async def _chat_completion( # pylint: disable=too-many-arguments,too-... method _completion (line 1072) | async def _completion( # pylint: disable=too-many-arguments,too-many-... method _handle_chat_completion (line 1190) | async def _handle_chat_completion( method _handle_completion (line 1256) | async def _handle_completion( method _generate (line 1328) | async def _generate( method _abort (line 1404) | def _abort(self, request_id: str): class MLCEngine (line 1410) | class MLCEngine(engine_base.MLCEngineBase): method __init__ (line 1460) | def __init__( # pylint: disable=too-many-arguments,too-many-locals method abort (line 1482) | def abort(self, request_id: str) -> None: method metrics (line 1492) | def metrics(self) -> engine_base.EngineMetrics: method _chat_completion (line 1503) | def _chat_completion( # pylint: disable=too-many-arguments,too-many-l... method _completion (line 1637) | def _completion( # pylint: disable=too-many-arguments,too-many-locals method _handle_chat_completion (line 1754) | def _handle_chat_completion( method _handle_completion (line 1802) | def _handle_completion( method _generate (line 1856) | def _generate( # pylint: disable=too-many-locals method _request_stream_callback_impl (line 1928) | def _request_stream_callback_impl( FILE: python/mlc_llm/serve/engine_base.py class ModelInfo (line 35) | class ModelInfo: function _check_engine_config (line 55) | def _check_engine_config( function _parse_models (line 95) | def _parse_models( function _process_model_args (line 112) | def _process_model_args( function _print_engine_mode_logging_msg (line 177) | def _print_engine_mode_logging_msg( class EngineMetrics (line 218) | class EngineMetrics: method __init__ (line 223) | def __init__(self, metrics): method __str__ (line 226) | def __str__(self): method __repr__ (line 229) | def __repr__(self): method __getitem__ (line 232) | def __getitem__(self, key): method prometheus_text (line 235) | def prometheus_text(self) -> str: function _query_engine_metrics (line 266) | def _query_engine_metrics(engine): function _async_query_engine_metrics (line 281) | async def _async_query_engine_metrics(engine): class CallbackStreamOutput (line 302) | class CallbackStreamOutput: class AsyncRequestStream (line 328) | class AsyncRequestStream: method __init__ (line 351) | def __init__(self) -> None: method push (line 355) | def push(self, item_or_exception: Union[List[CallbackStreamOutput], Ex... method finish (line 368) | def finish(self) -> None: method __aiter__ (line 373) | def __aiter__(self): method __anext__ (line 376) | async def __anext__(self) -> List[CallbackStreamOutput]: class EngineState (line 385) | class EngineState: method __init__ (line 413) | def __init__(self, enable_tracing: bool) -> None: method record_event (line 418) | def record_event(self, request_id: str, event: str) -> None: method get_request_stream_callback (line 439) | def get_request_stream_callback( method async_lazy_init_event_loop (line 462) | def async_lazy_init_event_loop(self) -> None: method _async_request_stream_callback (line 469) | def _async_request_stream_callback(self, delta_outputs: List[data.Requ... method _async_request_stream_callback_impl (line 487) | def _async_request_stream_callback_impl( method _sync_request_stream_callback (line 543) | def _sync_request_stream_callback(self, delta_outputs: List[data.Reque... class MLCEngineBase (line 551) | class MLCEngineBase: # pylint: disable=too-many-instance-attributes,too... method __init__ (line 568) | def __init__( # pylint: disable=too-many-arguments,too-many-locals method __del__ (line 654) | def __del__(self): method terminate (line 658) | def terminate(self): method _debug_call_func_on_all_worker (line 671) | def _debug_call_func_on_all_worker( method reset (line 677) | def reset(self): function process_chat_completion_request (line 682) | def process_chat_completion_request( # pylint: disable=too-many-arguments function process_chat_completion_stream_output (line 778) | def process_chat_completion_stream_output( # pylint: disable=too-many-a... function process_completion_request (line 887) | def process_completion_request( # pylint: disable=too-many-arguments function get_logprobs_from_delta (line 969) | def get_logprobs_from_delta( function process_completion_stream_output (line 1006) | def process_completion_stream_output( # pylint: disable=too-many-arguments function create_completion_suffix_response (line 1104) | def create_completion_suffix_response( function convert_function_str_to_json (line 1151) | def convert_function_str_to_json(stringified_calls: str) -> List[Union[D... function process_function_call_output (line 1176) | def process_function_call_output( function wrap_chat_completion_response (line 1212) | def wrap_chat_completion_response( # pylint: disable=too-many-arguments function wrap_completion_response (line 1252) | def wrap_completion_response( # pylint: disable=too-many-arguments FILE: python/mlc_llm/serve/engine_utils.py function get_unsupported_fields (line 15) | def get_unsupported_fields(request: RequestProtocol) -> List[str]: function openai_api_get_generation_config (line 30) | def openai_api_get_generation_config(request: RequestProtocol) -> Dict[s... function get_generation_config (line 63) | def get_generation_config( function random_uuid (line 96) | def random_uuid() -> str: function check_unsupported_fields (line 101) | def check_unsupported_fields(request: RequestProtocol) -> None: function check_and_get_prompts_length (line 111) | def check_and_get_prompts_length( function process_prompts (line 129) | def process_prompts( function convert_prompts_to_data (line 170) | def convert_prompts_to_data( class ErrorCleanupScope (line 185) | class ErrorCleanupScope: method __init__ (line 253) | def __init__(self, cleanup: Callable): method __enter__ (line 256) | def __enter__(self): method __exit__ (line 259) | def __exit__(self, exc_type, exc_value, traceback) -> None: function load_embedding_params (line 268) | def load_embedding_params(model_weight_path, device, model_metadata) -> ... function get_embedding_metadata (line 293) | def get_embedding_metadata(config: Dict[str, Any]) -> Optional[Dict[str,... function detect_embedding_model_type (line 311) | def detect_embedding_model_type(mod) -> Literal["encoder", "decoder"]: FILE: python/mlc_llm/serve/entrypoints/debug_entrypoints.py function debug_dump_event_trace (line 17) | async def debug_dump_event_trace(request: fastapi.Request): function debug_cuda_profiler_start (line 61) | async def debug_cuda_profiler_start(_request: fastapi.Request): function debug_cuda_profiler_stop (line 74) | async def debug_cuda_profiler_stop(_request: fastapi.Request): function debug_dump_engine_metrics (line 87) | async def debug_dump_engine_metrics(request: fastapi.Request): function debug_reset_engine_stats (line 110) | async def debug_reset_engine_stats(request: fastapi.Request): FILE: python/mlc_llm/serve/entrypoints/metrics_entrypoints.py function metrics (line 14) | async def metrics(_request: fastapi.Request): FILE: python/mlc_llm/serve/entrypoints/microserving_entrypoints.py function prep_recv (line 23) | async def prep_recv(request: PrepRecvRequest, raw_request: fastapi.Reque... function remote_send (line 49) | async def remote_send(request: RemoteSendRequest, raw_request: fastapi.R... function start_generate (line 67) | async def start_generate(request: StartGenerateRequest, raw_request: fas... FILE: python/mlc_llm/serve/entrypoints/openai_entrypoints.py function verify_api_key (line 29) | def verify_api_key(request: fastapi.Request): function request_embedding (line 46) | async def request_embedding(request: EmbeddingRequest): function request_models (line 125) | async def request_models() -> ListResponse: function request_completion (line 137) | async def request_completion(request: CompletionRequest, raw_request: fa... function request_chat_completion (line 241) | async def request_chat_completion( FILE: python/mlc_llm/serve/event_trace_recorder.py class EventTraceRecorder (line 10) | class EventTraceRecorder(Object): method __init__ (line 13) | def __init__(self) -> None: # pylint: disable=super-init-not-called method add_event (line 19) | def add_event(self, request_id: str, event: str) -> None: method dump_json (line 39) | def dump_json(self) -> str: FILE: python/mlc_llm/serve/radix_tree.py class PagedRadixTree (line 12) | class PagedRadixTree(Object): method __init__ (line 15) | def __init__(self): # pylint: disable=super-init-not-called method match (line 21) | def match(self, tokens: Union[ShapeTuple, List, Tuple]) -> Tuple[int, ... method add (line 44) | def add(self, seq_id: int) -> None: method remove (line 55) | def remove(self, seq_id: int) -> None: method extend (line 66) | def extend(self, seq_id: int, tokens: Union[ShapeTuple, List, Tuple]) ... method rollback (line 81) | def rollback(self, seq_id: int, num_tokens: int) -> None: method fork (line 94) | def fork(self, seq_id: int, parent_seq_id: int, forked_offset: int) ->... method get (line 112) | def get(self, seq_id: int) -> ShapeTuple: method get_length (line 128) | def get_length(self, seq_id: int) -> int: method free_capacity (line 144) | def free_capacity(self) -> int: FILE: python/mlc_llm/serve/request.py class Request (line 15) | class Request(Object): method inputs (line 27) | def inputs(self) -> List[Data]: method generation_config (line 32) | def generation_config(self) -> GenerationConfig: FILE: python/mlc_llm/serve/server/popen_server.py class PopenServer (line 18) | class PopenServer: # pylint: disable=too-many-instance-attributes method __init__ (line 25) | def __init__( # pylint: disable=too-many-arguments method start (line 59) | def start( # pylint: disable=too-many-branches,too-many-statements method terminate (line 163) | def terminate(self) -> None: method __enter__ (line 197) | def __enter__(self): method __exit__ (line 202) | def __exit__(self, exc_type, exc_val, exc_tb): FILE: python/mlc_llm/serve/server/server_context.py class ServerContext (line 11) | class ServerContext: method __init__ (line 19) | def __init__(self) -> None: method __enter__ (line 24) | def __enter__(self): method __exit__ (line 30) | def __exit__(self, exc_type, exc_value, traceback): method current (line 40) | def current(): method add_model (line 44) | def add_model(self, hosted_model: str, engine: AsyncMLCEngine) -> None: method get_engine (line 50) | def get_engine(self, model: Optional[str]) -> Optional[AsyncMLCEngine]: method get_model_list (line 57) | def get_model_list(self) -> List[str]: method add_embedding_engine (line 61) | def add_embedding_engine(self, hosted_model: str, engine: "AsyncEmbedd... method get_embedding_engine (line 67) | def get_embedding_engine(self, model: Optional[str]) -> Optional["Asyn... FILE: python/mlc_llm/serve/sync_engine.py function _create_tvm_module (line 35) | def _create_tvm_module( class SyncMLCEngine (line 45) | class SyncMLCEngine: method __init__ (line 85) | def __init__( # pylint: disable=too-many-arguments,too-many-locals method generate (line 156) | def generate( # pylint: disable=too-many-locals method create_request (line 290) | def create_request( method add_request (line 320) | def add_request(self, request: Request) -> None: method abort_request (line 330) | def abort_request(self, request_id: str) -> None: method step (line 340) | def step(self) -> None: method reset (line 354) | def reset(self) -> None: method metrics (line 358) | def metrics(self) -> EngineMetrics: FILE: python/mlc_llm/support/argparse.py class ArgumentParser (line 7) | class ArgumentParser(argparse.ArgumentParser): method error (line 10) | def error(self, message): FILE: python/mlc_llm/support/auto_config.py function detect_mlc_chat_config (line 21) | def detect_mlc_chat_config(mlc_chat_config: str) -> Path: function detect_config (line 74) | def detect_config(config: str) -> Path: function detect_model_type (line 120) | def detect_model_type(model_type: str, config: Path) -> "Model": function detect_quantization (line 160) | def detect_quantization(quantization_arg: str, config: Path) -> "Quantiz... FILE: python/mlc_llm/support/auto_device.py function detect_device (line 24) | def detect_device(device_hint: str) -> Optional[Device]: function device2str (line 47) | def device2str(device: Device) -> str: function _device_exists (line 52) | def _device_exists(device: Device) -> bool: FILE: python/mlc_llm/support/auto_target.py function detect_target_and_host (line 31) | def detect_target_and_host(target_hint: str, host_hint: str = "auto") ->... function _detect_target_gpu (line 64) | def _detect_target_gpu(hint: str) -> Tuple[Target, BuildFunc]: function _detect_target_host (line 105) | def _detect_target_host(hint: str) -> Target: function _is_device (line 118) | def _is_device(device: str): function _add_system_lib_prefix (line 126) | def _add_system_lib_prefix(mod: IRModule, prefix: str, is_system_lib: bo... function _build_metal_x86_64 (line 142) | def _build_metal_x86_64(): function _build_iphone (line 161) | def _build_iphone(): function _build_android (line 186) | def _build_android(): function _build_android_so (line 209) | def _build_android_so(): function _build_webgpu (line 232) | def _build_webgpu(): function _build_mali (line 272) | def _build_mali(): function _build_default (line 291) | def _build_default(): function detect_cuda_arch_list (line 314) | def detect_cuda_arch_list(target: Target) -> List[int]: function _register_cuda_hook (line 332) | def _register_cuda_hook(target: Target): function detect_system_lib_prefix (line 365) | def detect_system_lib_prefix( FILE: python/mlc_llm/support/auto_weight.py function detect_weight (line 16) | def detect_weight( function _guess_weight_format (line 93) | def _guess_weight_format(weight_path: Path) -> Tuple[Path, str]: function _check_pytorch (line 118) | def _check_pytorch(weight_path: Path) -> Optional[Path]: function _check_safetensor (line 141) | def _check_safetensor(weight_path: Path) -> Optional[Path]: FILE: python/mlc_llm/support/config.py class ConfigBase (line 28) | class ConfigBase: method from_dict (line 35) | def from_dict(cls: Type[ConfigClass], source: Dict[str, Any]) -> Confi... method from_file (line 54) | def from_file(cls: Type[ConfigClass], source: Path) -> ConfigClass: method asdict (line 73) | def asdict(self): class ConfigOverrideBase (line 86) | class ConfigOverrideBase: method apply (line 91) | def apply(self, config): FILE: python/mlc_llm/support/constants.py function _check (line 11) | def _check(): function _get_cache_dir (line 26) | def _get_cache_dir() -> Path: function _get_dso_suffix (line 49) | def _get_dso_suffix() -> str: function _get_test_model_path (line 59) | def _get_test_model_path() -> List[Path]: function _get_read_only_weight_caches (line 74) | def _get_read_only_weight_caches() -> List[Path]: FILE: python/mlc_llm/support/convert_tiktoken.py function bpe (line 13) | def bpe( function generate_vocab_and_merges (line 33) | def generate_vocab_and_merges(encoder, mergeable_ranks): function convert_tiktoken (line 64) | def convert_tiktoken(model_path, output_dir, context_window_size=None): FILE: python/mlc_llm/support/download_cache.py function log_download_cache_policy (line 27) | def log_download_cache_policy(): function _ensure_directory_not_exist (line 36) | def _ensure_directory_not_exist(path: Path, force_redo: bool) -> None: function git_clone (line 47) | def git_clone(url: str, destination: Path, ignore_lfs: bool) -> None: function git_lfs_pull (line 76) | def git_lfs_pull(repo_dir: Path, ignore_extensions: Optional[List[str]] ... function download_file (line 102) | def download_file( function download_and_cache_mlc_weights (line 127) | def download_and_cache_mlc_weights( # pylint: disable=too-many-locals function get_or_download_model (line 202) | def get_or_download_model(model: str) -> Path: FILE: python/mlc_llm/support/logging.py function enable_logging (line 10) | def enable_logging(): function getLogger (line 22) | def getLogger(name: str): # pylint: disable=invalid-name FILE: python/mlc_llm/support/max_thread_check.py function get_max_num_threads_per_block (line 6) | def get_max_num_threads_per_block(target: Target) -> int: function check_thread_limits (line 18) | def check_thread_limits(target: Target, bdx: int, bdy: int, bdz: int, gd... FILE: python/mlc_llm/support/preshard.py function _sharded_param_name (line 15) | def _sharded_param_name(param_name, worker_id): function _create_shard_func (line 19) | def _create_shard_func( function _compile_shard_funcs (line 55) | def _compile_shard_funcs(mod: IRModule, device: Device): function apply_preshard (line 71) | def apply_preshard( FILE: python/mlc_llm/support/random.py function set_global_random_seed (line 6) | def set_global_random_seed(seed): FILE: python/mlc_llm/support/style.py class Styles (line 6) | class Styles(Enum): function red (line 25) | def red(text: str) -> str: function green (line 30) | def green(text: str) -> str: function yellow (line 35) | def yellow(text: str) -> str: function blue (line 40) | def blue(text: str) -> str: function purple (line 45) | def purple(text: str) -> str: function cyan (line 50) | def cyan(text: str) -> str: function bold (line 55) | def bold(text: str) -> str: function underline (line 60) | def underline(text: str) -> str: FILE: python/mlc_llm/support/tensor_parallel.py class ShardSingleDim (line 12) | class ShardSingleDim: method gen_tir (line 36) | def gen_tir(self, shards: int, weight: nn.Tensor) -> tir.PrimFunc: method gen_shard_info (line 83) | def gen_shard_info(self, shards: int, weight: nn.Tensor) -> Dict[str, ... method _compute_in_shape (line 92) | def _compute_in_shape(self, shards: int, weight: nn.Tensor) -> List[int]: function shard_bias (line 99) | def shard_bias(linear: nn.Linear, tensor_parallel_shards: int): FILE: python/mlc_llm/support/tqdm.py function _redirect_print (line 12) | def _redirect_print(): function redirect (line 31) | def redirect(): FILE: python/mlc_llm/testing/debug_chat.py function _extract_metadata (line 27) | def _extract_metadata(mod: Module): function _load_params (line 31) | def _load_params( function _get_tvm_module (line 44) | def _get_tvm_module( class DefaultDebugInstrument (line 59) | class DefaultDebugInstrument: method __init__ (line 68) | def __init__(self, debug_out: Path): method reset (line 82) | def reset(self, debug_out: Path): method __call__ (line 96) | def __call__(self, func, name, before_run, ret_val, *args): class DebugChat (line 145) | class DebugChat: # pylint: disable=too-many-instance-attributes, too-fe... method __init__ (line 165) | def __init__( # pylint: disable=too-many-arguments method _preprocess_prompts (line 290) | def _preprocess_prompts( method _embed (line 323) | def _embed( method _prefill (line 358) | def _prefill(self, embedding: tvm.runtime.Tensor, input_len: int): method _decode (line 396) | def _decode(self, token: int, kv_caches: Object): method _softmax_with_temperature (line 403) | def _softmax_with_temperature(self, logits: np.ndarray, temperature: f... method _apply_presence_and_freq_penalty (line 412) | def _apply_presence_and_freq_penalty( method _sample_token_from_logits (line 418) | def _sample_token_from_logits( method generate (line 440) | def generate( function main (line 490) | def main(): FILE: python/mlc_llm/testing/debug_compare.py function _print_as_table (line 16) | def _print_as_table(sorted_list): class LibCompare (line 40) | class LibCompare(LibCompareVMInstrument): method __init__ (line 66) | def __init__( # pylint: disable=too-many-arguments, unused-argument method reset (line 85) | def reset(self, debug_out: Path): # pylint: disable=unused-argument method skip_instrument (line 109) | def skip_instrument(self, func, name, before_run, ret_val, *args): method compare (line 124) | def compare( function get_instrument (line 146) | def get_instrument(args): function main (line 182) | def main(): FILE: python/mlc_llm/testing/pytest_utils.py function require_test_model (line 13) | def require_test_model(*models: str): function require_test_tokenizers (line 83) | def require_test_tokenizers(*models: str): FILE: python/mlc_llm/tokenizers/streamer.py class TextStreamer (line 13) | class TextStreamer(Object): method __init__ (line 18) | def __init__(self, tokenizer: Tokenizer) -> None: # pylint: disable=s... method put (line 25) | def put(self, delta_tokens: Union[List[int], ShapeTuple]) -> str: method finish (line 47) | def finish(self) -> str: class StopStrHandler (line 53) | class StopStrHandler(Object): method __init__ (line 58) | def __init__( # pylint: disable=super-init-not-called method put (line 67) | def put(self, token_id: int) -> List[int]: method finish (line 77) | def finish(self) -> List[int]: method stop_triggered (line 84) | def stop_triggered(self) -> bool: FILE: python/mlc_llm/tokenizers/tokenizers.py class TokenizerInfo (line 19) | class TokenizerInfo: # pylint: disable=too-many-instance-attributes method asjson (line 48) | def asjson(self) -> str: method from_json (line 53) | def from_json(json_str: str) -> "TokenizerInfo": class Tokenizer (line 59) | class Tokenizer(Object): method __init__ (line 62) | def __init__(self, tokenizer_path: str) -> None: # pylint: disable=su... method encode (line 69) | def encode(self, text: str) -> List[int]: method encode_batch (line 84) | def encode_batch(self, texts: List[str]) -> List[List[int]]: method decode (line 99) | def decode(self, token_ids: List[int]) -> str: method detect_tokenizer_info (line 117) | def detect_tokenizer_info(tokenizer_path: str) -> TokenizerInfo: FILE: python/setup.py function get_lib_path (line 14) | def get_lib_path(): function git_describe_version (line 35) | def git_describe_version(original_version): function parse_requirements (line 50) | def parse_requirements(filename: os.PathLike): class BinaryDistribution (line 76) | class BinaryDistribution(Distribution): method has_ext_modules (line 79) | def has_ext_modules(self): method is_pure (line 83) | def is_pure(self): function main (line 88) | def main(): FILE: scripts/check_url_validity.py function find_urls_in_file (line 8) | def find_urls_in_file(file_path): function main (line 22) | def main(): FILE: tests/cpp/conv_template_unittest.cc type mlc (line 5) | namespace mlc { type llm (line 6) | namespace llm { type json_ffi (line 7) | namespace json_ffi { function _TestConvTemplateLoadJSONTextContent (line 9) | void _TestConvTemplateLoadJSONTextContent() { function _TestConvTemplateLoadJSONPartsContent (line 64) | void _TestConvTemplateLoadJSONPartsContent() { function TEST (line 124) | TEST(JsonFFIConvTest, LoadJSONTextContentTest) { _TestConvTemplate... function TEST (line 125) | TEST(JsonFFIConvTest, LoadJSONPartsContentTest) { _TestConvTemplat... FILE: tests/python/compiler_pass/test_fuse_ft_dequantize_matmul_epilogue.py function test_fuse_bias (line 12) | def test_fuse_bias(): function test_fuse_activation (line 75) | def test_fuse_activation(): function test_fuse_bias_activation (line 134) | def test_fuse_bias_activation(): function test_fuse_residual_binary (line 198) | def test_fuse_residual_binary(): function test_fuse_residual_unary (line 267) | def test_fuse_residual_unary(): FILE: tests/python/conftest.py function pytest_configure (line 21) | def pytest_configure(config): FILE: tests/python/conversation_template/test_conversation_protocol.py function get_conv_templates (line 7) | def get_conv_templates(): function test_json (line 23) | def test_json(conv_template_name): function test_prompt (line 31) | def test_prompt(conv_template_name): FILE: tests/python/conversation_template/test_llama_template.py function test_llama3_prompt (line 10) | def test_llama3_prompt(): FILE: tests/python/integration/test_model_compile.py function run_command (line 78) | def run_command(log_file, cmd): function test_model_compile (line 87) | def test_model_compile(): # pylint: disable=too-many-locals FILE: tests/python/json_ffi/test_json_ffi_engine.py function run_chat_completion (line 55) | def run_chat_completion( function run_json_schema_function_calling (line 92) | def run_json_schema_function_calling( function test_chat_completion (line 152) | def test_chat_completion(model): function test_reload_reset_unload (line 169) | def test_reload_reset_unload(model): function test_json_schema_with_system_prompt (line 185) | def test_json_schema_with_system_prompt(model): FILE: tests/python/json_ffi/test_json_ffi_engine_image.py function base64_encode_image (line 10) | def base64_encode_image(url: str) -> str: function run_chat_completion (line 35) | def run_chat_completion( function test_chat_completion (line 74) | def test_chat_completion(): FILE: tests/python/json_ffi/test_json_ffi_engine_mock.py function check_error_handling (line 13) | def check_error_handling(engine, expect_str, **params): function test_chat_completion_misuse (line 38) | def test_chat_completion_misuse(model: str): function check_normal_param_passing (line 52) | def check_normal_param_passing(engine): function check_n_generation (line 82) | def check_n_generation(engine): function test_chat_completion_api (line 97) | def test_chat_completion_api(model: str): FILE: tests/python/loader/test_awq.py function test_load_llama (line 23) | def test_load_llama(param_path: Union[str, Path]): FILE: tests/python/loader/test_huggingface.py function test_load_torch_llama (line 23) | def test_load_torch_llama(base_path: Union[str, Path]): function test_load_safetensor_llama (line 47) | def test_load_safetensor_llama(base_path: Union[str, Path]): FILE: tests/python/model/test_gemma3.py function test_gemma3_model_registered (line 9) | def test_gemma3_model_registered(): function test_gemma3_creation (line 21) | def test_gemma3_creation(model_name: str): function test_gemma3_config_validation (line 49) | def test_gemma3_config_validation(): FILE: tests/python/model/test_gpt2.py function test_gpt2_creation (line 8) | def test_gpt2_creation(model_name: str): FILE: tests/python/model/test_gptNeox.py function test_mistral_creation (line 8) | def test_mistral_creation(model_name: str): FILE: tests/python/model/test_kv_cache.py function test_nn_module_paged_kv_cache (line 15) | def test_nn_module_paged_kv_cache(): FILE: tests/python/model/test_llama.py function test_llama2_creation (line 10) | def test_llama2_creation(model_name: str): FILE: tests/python/model/test_llama_quantization.py function test_llama2_group_quantization (line 20) | def test_llama2_group_quantization(model_name: str, quant_name: str): function test_llama2_no_quantization (line 62) | def test_llama2_no_quantization(model_name: str, quant_name: str): FILE: tests/python/model/test_mistral.py function test_mistral_creation (line 8) | def test_mistral_creation(model_name: str): FILE: tests/python/model/test_phi.py function test_phi_creation (line 8) | def test_phi_creation(model_name: str): FILE: tests/python/model/test_qwen3_embedding.py function _load_embed_weight (line 51) | def _load_embed_weight(hf_dir): function _hf_logits (line 60) | def _hf_logits(text, tokenizer, hf_model, embed_weight): function _mlc_logits (line 68) | def _mlc_logits(text, tokenizer, mlc_module, params, metadata, dev, embe... function test_mlc_hf_logit_match (line 116) | def test_mlc_hf_logit_match(): FILE: tests/python/op/test_batch_spec_verify.py function test_batch_spec_verify (line 15) | def test_batch_spec_verify(nbatch, vocab, plist): FILE: tests/python/op/test_fp8_block_matmul.py function test_fp8_block_matmul_cutlass (line 30) | def test_fp8_block_matmul_cutlass(M: int, N: int, K: int, dtype: str): function test_fp8_block_matmul_triton (line 117) | def test_fp8_block_matmul_triton(M: int, N: int, K: int, dtype: str): function test_fp8_block_group_matmul_cutlass (line 210) | def test_fp8_block_group_matmul_cutlass(M: int, N: int, K: int, dtype: s... function test_fp8_block_group_matmul_triton (line 356) | def test_fp8_block_group_matmul_triton(M: int, N: int, K: int, dtype: str): function test_fp8_block_bmm_cutlass (line 489) | def test_fp8_block_bmm_cutlass(M: int, N: int, K: int, H: int, dtype: str): function test_fp8_block_gemv_tir (line 562) | def test_fp8_block_gemv_tir(N: int, K: int, up: bool, dtype: str): function blockwise_matmul (line 672) | def blockwise_matmul( function blockwise_group_matmul (line 704) | def blockwise_group_matmul( function blockwise_group_matmul_unquantized (line 742) | def blockwise_group_matmul_unquantized( function blockwise_bmm (line 772) | def blockwise_bmm( function blockwise_quant_fp8 (line 809) | def blockwise_quant_fp8( function rowwise_quant_fp8 (line 879) | def rowwise_quant_fp8( function test_cutlass_gemm (line 936) | def test_cutlass_gemm(): function test_triton_gemm (line 957) | def test_triton_gemm(): function test_cutlass_group_gemm (line 973) | def test_cutlass_group_gemm(): function test_triton_group_gemm (line 988) | def test_triton_group_gemm(): function test_cutlass_bmm (line 1003) | def test_cutlass_bmm(): function test_tir_moe_gemv (line 1019) | def test_tir_moe_gemv(): FILE: tests/python/op/test_mrope.py function _numpy_rotate_half (line 18) | def _numpy_rotate_half(x: np.ndarray) -> np.ndarray: function _numpy_apply_mrope (line 23) | def _numpy_apply_mrope( function _evaluate_tensor (line 64) | def _evaluate_tensor(expr): function _run_mlc_mrope (line 72) | def _run_mlc_mrope( function test_apply_mrope_matches_numpy_reference (line 116) | def test_apply_mrope_matches_numpy_reference(): function test_get_mrope_position_ids_text_only (line 135) | def test_get_mrope_position_ids_text_only(): function test_get_mrope_position_ids_single_image_block (line 160) | def test_get_mrope_position_ids_single_image_block(): function test_apply_mrope_accepts_3_batch_seq_layout (line 194) | def test_apply_mrope_accepts_3_batch_seq_layout(): function test_get_mrope_position_ids_output_is_directly_usable (line 218) | def test_get_mrope_position_ids_output_is_directly_usable(): FILE: tests/python/op/test_top_p_pivot.py function test_top_p_renorm (line 16) | def test_top_p_renorm(batch_size, vocab): FILE: tests/python/op/test_tree_attn.py function test_tree_attn (line 18) | def test_tree_attn(nbatch, h_q, h_kv, d, rotary_mode): FILE: tests/python/op/test_two_stage_softmax.py function test_two_stage_softmax (line 11) | def test_two_stage_softmax(): FILE: tests/python/quantization/test_awq_quantization.py function dequantize_np (line 16) | def dequantize_np( function test_dequantize_weight (line 52) | def test_dequantize_weight(quant_name: str, shape: List[int], dtype: str): FILE: tests/python/quantization/test_group_quantization.py function quantize_np (line 21) | def quantize_np(config: GroupQuantize, weight: np.ndarray): function dequantize_np (line 55) | def dequantize_np( function test_quantize_weight (line 96) | def test_quantize_weight(quant_name: str, shape: List[int], dtype: str, ... function test_dequantize_weight (line 123) | def test_dequantize_weight(quant_name: str, shape: List[int], dtype: str): function test_quantize_model (line 160) | def test_quantize_model(quant_name: str, shape: List[int], dtype: str): FILE: tests/python/router/test_router.py function get_router_1tp1 (line 15) | def get_router_1tp1(): function get_router_2tp1 (line 27) | def get_router_2tp1(): function get_router_1tp2 (line 41) | def get_router_1tp2(): function get_router_2tp2 (line 54) | def get_router_2tp2(): function test_router (line 76) | async def test_router(schedule: str = "round_robin", endpoints_config: s... FILE: tests/python/serve/evaluate_engine.py function _parse_args (line 11) | def _parse_args(): function generate_requests (line 25) | def generate_requests( function benchmark (line 40) | def benchmark(args: argparse.Namespace): FILE: tests/python/serve/server/conftest.py function served_model (line 11) | def served_model() -> Tuple[str, str]: function launch_server (line 24) | def launch_server(served_model): # pylint: disable=redefined-outer-name FILE: tests/python/serve/server/test_embedding_server.py function _skip_if_no_model (line 56) | def _skip_if_no_model(): function check_embedding_response (line 73) | def check_embedding_response( function expect_error (line 114) | def expect_error(response_str: str, msg_prefix: Optional[str] = None): function launch_embedding_server (line 129) | def launch_embedding_server(): function client (line 199) | def client(launch_embedding_server): function test_models_endpoint (line 211) | def test_models_endpoint(): function test_single_string_input (line 224) | def test_single_string_input(client): function test_batch_string_input (line 242) | def test_batch_string_input(client): function test_batch_index_ordering (line 249) | def test_batch_index_ordering(client): function test_cosine_similarity_via_endpoint (line 261) | def test_cosine_similarity_via_endpoint(client): function test_dimension_truncation (line 284) | def test_dimension_truncation(client): function test_base64_encoding (line 305) | def test_base64_encoding(): function test_any_model_name_works_with_single_engine (line 330) | def test_any_model_name_works_with_single_engine(): FILE: tests/python/serve/server/test_server.py function is_json (line 53) | def is_json(s: str) -> bool: function is_json_prefix (line 61) | def is_json_prefix(s: str) -> bool: function check_openai_nonstream_response (line 76) | def check_openai_nonstream_response( function check_openai_stream_response (line 140) | def check_openai_stream_response( function expect_error (line 222) | def expect_error(response_str: str, msg_prefix: Optional[str] = None): function test_openai_v1_models (line 230) | def test_openai_v1_models( function test_openai_v1_completions (line 251) | def test_openai_v1_completions( function test_openai_v1_completions_openai_package (line 298) | def test_openai_v1_completions_openai_package( function test_openai_v1_completions_echo (line 341) | def test_openai_v1_completions_echo( function test_openai_v1_completions_suffix (line 391) | def test_openai_v1_completions_suffix( function test_openai_v1_completions_stop_str (line 442) | def test_openai_v1_completions_stop_str( function test_openai_v1_completions_temperature (line 492) | def test_openai_v1_completions_temperature( function test_openai_v1_completions_json (line 538) | def test_openai_v1_completions_json( function test_openai_v1_completions_json_schema (line 585) | def test_openai_v1_completions_json_schema( function test_openai_v1_completions_logit_bias (line 643) | def test_openai_v1_completions_logit_bias( function test_openai_v1_completions_presence_frequency_penalty (line 692) | def test_openai_v1_completions_presence_frequency_penalty( function test_openai_v1_completions_seed (line 738) | def test_openai_v1_completions_seed( function test_openai_v1_completions_prompt_overlong (line 774) | def test_openai_v1_completions_prompt_overlong( function test_openai_v1_completions_invalid_logprobs (line 808) | def test_openai_v1_completions_invalid_logprobs( function test_openai_v1_chat_completions_invalid_logprobs (line 833) | def test_openai_v1_chat_completions_invalid_logprobs( function test_openai_v1_completions_unsupported_args (line 867) | def test_openai_v1_completions_unsupported_args( function test_openai_v1_completions_request_cancellation (line 888) | def test_openai_v1_completions_request_cancellation( function test_openai_v1_chat_completions (line 947) | def test_openai_v1_chat_completions( function test_openai_v1_chat_completions_n (line 990) | def test_openai_v1_chat_completions_n( function test_openai_v1_chat_completions_openai_package (line 1036) | def test_openai_v1_chat_completions_openai_package( function test_openai_v1_chat_completions_max_tokens (line 1077) | def test_openai_v1_chat_completions_max_tokens( function test_openai_v1_chat_completions_json (line 1123) | def test_openai_v1_chat_completions_json( function test_openai_v1_chat_completions_json_schema (line 1170) | def test_openai_v1_chat_completions_json_schema( function test_openai_v1_chat_completions_ignore_eos (line 1229) | def test_openai_v1_chat_completions_ignore_eos( function test_openai_v1_chat_completions_system_prompt_wrong_pos (line 1276) | def test_openai_v1_chat_completions_system_prompt_wrong_pos( function test_debug_dump_event_trace (line 1312) | def test_debug_dump_event_trace( function test_metrics (line 1324) | def test_metrics( FILE: tests/python/serve/server/test_server_function_call.py function check_openai_nonstream_response (line 19) | def check_openai_nonstream_response( function check_openai_stream_response (line 58) | def check_openai_stream_response( function test_openai_v1_chat_completion_function_call (line 157) | def test_openai_v1_chat_completion_function_call( FILE: tests/python/serve/server/test_server_image.py function is_json_or_json_prefix (line 19) | def is_json_or_json_prefix(s: str) -> bool: function check_openai_nonstream_response (line 34) | def check_openai_nonstream_response( function check_openai_stream_response (line 90) | def check_openai_stream_response( function test_openai_v1_chat_completions (line 201) | def test_openai_v1_chat_completions( FILE: tests/python/serve/test_embedding_engine.py function _skip_if_no_model (line 39) | def _skip_if_no_model(): function embedding_engine (line 53) | def embedding_engine(): function cosine_similarity (line 72) | def cosine_similarity(a, b): function test_engine_model_type (line 83) | def test_engine_model_type(embedding_engine): function test_engine_pooling_strategy (line 88) | def test_engine_pooling_strategy(embedding_engine): function test_single_text_shape (line 101) | def test_single_text_shape(embedding_engine): function test_single_text_unit_norm (line 109) | def test_single_text_unit_norm(embedding_engine): function test_batch_count (line 127) | def test_batch_count(embedding_engine): function test_batch_all_normalized (line 134) | def test_batch_all_normalized(embedding_engine): function test_batch_consistent_dimension (line 142) | def test_batch_consistent_dimension(embedding_engine): function test_cosine_similarity_ranking (line 160) | def test_cosine_similarity_ranking(embedding_engine): function test_deterministic_output (line 176) | def test_deterministic_output(embedding_engine): function test_async_embed (line 190) | def test_async_embed(embedding_engine): function test_empty_string (line 211) | def test_empty_string(embedding_engine): function test_long_text_decoder_chunked_prefill (line 229) | def test_long_text_decoder_chunked_prefill(embedding_engine): function _get_encoder_tokens (line 241) | def _get_encoder_tokens(embedding_engine, text): function test_long_text_encoder_truncation (line 255) | def test_long_text_encoder_truncation(embedding_engine): # pylint: disa... function test_long_vs_short_semantic_quality (line 306) | def test_long_vs_short_semantic_quality(embedding_engine): function test_unicode_text (line 327) | def test_unicode_text(embedding_engine): FILE: tests/python/serve/test_event_trace_recorder.py function test_event_trace_recorder (line 12) | def test_event_trace_recorder(): FILE: tests/python/serve/test_radix_tree.py function test_add (line 9) | def test_add(): function test_remove (line 17) | def test_remove(): function test_extend (line 52) | def test_extend(): function test_fork (line 73) | def test_fork(): function test_fork_2 (line 90) | def test_fork_2(): function test_rollback (line 102) | def test_rollback(): FILE: tests/python/serve/test_serve_async_engine.py function test_engine_generate (line 25) | async def test_engine_generate(model: str): function test_chat_completion (line 83) | async def test_chat_completion(model: str): function test_chat_completion_non_stream (line 134) | async def test_chat_completion_non_stream(model: str): function test_completion (line 184) | async def test_completion(model: str): function test_completion_non_stream (line 234) | async def test_completion_non_stream(model: str): FILE: tests/python/serve/test_serve_async_engine_spec.py function test_engine_generate (line 28) | async def test_engine_generate(model: str, small_model: str): FILE: tests/python/serve/test_serve_engine.py function test_engine_generate (line 24) | def test_engine_generate(model: str): function test_chat_completion (line 63) | def test_chat_completion(model: str): function test_chat_completion_non_stream (line 108) | def test_chat_completion_non_stream(model: str): function test_completion (line 152) | def test_completion(model: str): function test_completion_non_stream (line 196) | def test_completion_non_stream(model: str): FILE: tests/python/serve/test_serve_engine_grammar.py function test_batch_generation_with_grammar (line 20) | def test_batch_generation_with_grammar(model: str): function test_batch_generation_with_schema (line 97) | def test_batch_generation_with_schema(model: str): function test_batch_generation_jump_forward (line 204) | def test_batch_generation_jump_forward(model: str, jump_forward: bool = ... function run_async_engine (line 263) | async def run_async_engine( function test_async_engine (line 342) | def test_async_engine( FILE: tests/python/serve/test_serve_engine_image.py function get_test_image (line 9) | def get_test_image(config) -> data.ImageData: function test_engine_generate (line 13) | def test_engine_generate(): FILE: tests/python/serve/test_serve_engine_mock.py function test_completion_api (line 20) | def test_completion_api(model: str): FILE: tests/python/serve/test_serve_engine_prefix_cache.py function test_engine_system_prompt (line 20) | def test_engine_system_prompt(engine): function test_engine_multi_round (line 56) | def test_engine_multi_round(engine): function test_basic_engine_system_prompt (line 75) | def test_basic_engine_system_prompt(model: str): function test_basic_engine_multi_round (line 89) | def test_basic_engine_multi_round(model: str): function test_engine_spec_multi_round (line 103) | def test_engine_spec_multi_round(model: str, small_model: str): function test_engine_eagle_multi_round (line 119) | def test_engine_eagle_multi_round(model: str): FILE: tests/python/serve/test_serve_engine_rnn.py function test_engine_generate (line 22) | def test_engine_generate() -> None: FILE: tests/python/serve/test_serve_engine_spec.py function create_requests (line 26) | def create_requests( function test_engine_basic (line 59) | def test_engine_basic(model: str, small_model: str): function test_engine_eagle_basic (line 122) | def test_engine_eagle_basic(model: str): function test_engine_continuous_batching_1 (line 192) | def test_engine_continuous_batching_1(model: str, small_model: str): function test_engine_eagle_continuous_batching_1 (line 274) | def test_engine_eagle_continuous_batching_1(model: str): function compare_output_text (line 359) | def compare_output_text(output_text1, output_text2): function test_engine_generate (line 375) | def test_engine_generate(model: str, small_model: str, compare_precision... function test_engine_eagle_generate (line 433) | def test_engine_eagle_generate(model: str): function test_engine_efficiency (line 466) | def test_engine_efficiency(model: str): function test_engine_spec_efficiency (line 525) | def test_engine_spec_efficiency(model: str, small_model: str): function test_engine_eagle_spec_efficiency (line 591) | def test_engine_eagle_spec_efficiency(model: str): FILE: tests/python/serve/test_serve_sync_engine.py function create_requests (line 26) | def create_requests( function test_engine_basic (line 57) | def test_engine_basic(model: str): function test_engine_continuous_batching_1 (line 116) | def test_engine_continuous_batching_1(model: str): function test_engine_continuous_batching_2 (line 196) | def test_engine_continuous_batching_2(model: str): function test_engine_continuous_batching_3 (line 276) | def test_engine_continuous_batching_3(model: str): function test_engine_generate (line 364) | def test_engine_generate(model: str): function test_engine_hybrid_prefill (line 389) | def test_engine_hybrid_prefill(model: str): FILE: tests/python/support/test_auto_config.py function _create_json_file (line 17) | def _create_json_file(json_path, data): function test_detect_config (line 22) | def test_detect_config(): function test_detect_config_fail (line 32) | def test_detect_config_fail(): FILE: tests/python/support/test_auto_weight.py function _create_json_file (line 18) | def _create_json_file(json_path, data): function test_detect_weight (line 36) | def test_detect_weight(weight_format, index_filename, result): function test_detect_weight_in_config_json (line 61) | def test_detect_weight_in_config_json(weight_format, index_filename, res... function test_detect_weight_same_dir_config_json (line 93) | def test_detect_weight_same_dir_config_json(weight_format, index_filenam... function test_find_weight_fail (line 107) | def test_find_weight_fail(): FILE: tests/python/support/test_cli_convert_weight.py function test_convert_weight_cli_passes_lora_adapter (line 13) | def test_convert_weight_cli_passes_lora_adapter(monkeypatch): FILE: tests/python/support/test_convert_weight_lora_merge.py function test_resolve_base_model_dir (line 14) | def test_resolve_base_model_dir(): function test_convert_weight_with_lora_uses_merged_source (line 26) | def test_convert_weight_with_lora_uses_merged_source(monkeypatch): function test_convert_weight_with_lora_rejects_awq (line 91) | def test_convert_weight_with_lora_rejects_awq(): FILE: tests/python/tokenizers/test_streamer.py function test_text_streamer (line 58) | def test_text_streamer(llama_tokenizer_path: str): # pylint: disable=re... function stop_handler_process_tokens (line 68) | def stop_handler_process_tokens( function test_stop_str_handler_stop (line 84) | def test_stop_str_handler_stop(llama_tokenizer_path: str): # pylint: di... function test_stop_str_handler_not_stop (line 100) | def test_stop_str_handler_not_stop( function test_stop_str_handler_return_cached_tokens (line 112) | def test_stop_str_handler_return_cached_tokens( function test_stop_str_handler_throughput (line 130) | def test_stop_str_handler_throughput( function test_text_streamer_emojis (line 171) | def test_text_streamer_emojis( FILE: version.py function py_str (line 35) | def py_str(cstr): function git_describe_version (line 39) | def git_describe_version(): function update (line 126) | def update(file_name, pattern, repl, dry_run=False): function sync_version (line 153) | def sync_version(pub_ver, local_ver, dry_run): function main (line 164) | def main():