SYMBOL INDEX (5123 symbols across 423 files) FILE: backends/client/build.rs function main (line 3) | fn main() -> Result<(), Box> { FILE: backends/client/src/lib.rs type Health (line 15) | pub trait Health { method device_health (line 17) | async fn device_health(&self) -> Result<()>; method model_health (line 21) | async fn model_health(&self) -> Result<()>; type ShardInfo (line 25) | pub struct ShardInfo { type ClientError (line 34) | pub enum ClientError { method from (line 44) | fn from(err: Status) -> Self { method from (line 52) | fn from(err: transport::Error) -> Self { method from (line 61) | fn from(chunk: Chunk) -> Self { type ChunksToString (line 68) | pub trait ChunksToString { method chunks_to_string (line 70) | fn chunks_to_string(&self) -> String; method chunks_to_string (line 74) | fn chunks_to_string(&self) -> String { type Result (line 91) | pub type Result = std::result::Result; FILE: backends/client/src/v2/client.rs type Client (line 16) | pub struct Client { method connect (line 22) | pub async fn connect(uri: Uri) -> Result { method connect_uds (line 31) | pub async fn connect_uds(path: String) -> Result { method service_discovery (line 46) | pub async fn service_discovery(&mut self) -> Result> { method info (line 66) | pub async fn info(&mut self) -> Result { method health (line 74) | pub async fn health(&mut self) -> Result { method clear_cache (line 82) | pub async fn clear_cache(&mut self, batch_id: Option) -> Result<(... method filter_batch (line 90) | pub async fn filter_batch( method warmup (line 108) | pub async fn warmup( method prefill (line 189) | pub async fn prefill( method decode (line 207) | pub async fn decode( type PrefillTimings (line 226) | pub struct PrefillTimings { method new (line 233) | fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self { type DecodeTimings (line 242) | pub struct DecodeTimings { method new (line 250) | fn new(concat_ns: Option, forward_ns: u64, decode_ns: u64, total_... FILE: backends/client/src/v2/sharded_client.rs type ShardedClient (line 18) | pub struct ShardedClient { method new (line 23) | fn new(clients: Vec) -> Self { method from_master_client (line 29) | async fn from_master_client(mut master_client: Client) -> Result { method connect (line 38) | pub async fn connect(uri: Uri) -> Result { method connect_uds (line 44) | pub async fn connect_uds(path: String) -> Result { method info (line 51) | pub async fn info(&mut self) -> Result { method health (line 62) | pub async fn health(&mut self) -> Result { method clear_cache (line 73) | pub async fn clear_cache(&mut self, batch_id: Option) -> Result<(... method filter_batch (line 84) | pub async fn filter_batch( method warmup (line 102) | pub async fn warmup( method prefill (line 134) | pub async fn prefill( method decode (line 167) | pub async fn decode( method from (line 197) | fn from(value: InfoResponse) -> Self { method device_health (line 210) | async fn device_health(&self) -> Result<()> { method model_health (line 215) | async fn model_health(&self) -> Result<()> { FILE: backends/client/src/v3/client.rs type Client (line 16) | pub struct Client { method connect (line 22) | pub async fn connect(uri: Uri) -> Result { method connect_uds (line 31) | pub async fn connect_uds(path: String) -> Result { method service_discovery (line 46) | pub async fn service_discovery(&mut self) -> Result> { method info (line 66) | pub async fn info(&mut self) -> Result { method health (line 74) | pub async fn health(&mut self) -> Result { method clear_cache (line 82) | pub async fn clear_cache(&mut self, batch_id: Option) -> Result<(... method filter_batch (line 90) | pub async fn filter_batch( method warmup (line 108) | pub async fn warmup( method prefill (line 230) | pub async fn prefill( method decode (line 253) | pub async fn decode( type PrefillTimings (line 272) | pub struct PrefillTimings { method new (line 279) | fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self { type DecodeTimings (line 288) | pub struct DecodeTimings { method new (line 296) | fn new(concat_ns: Option, forward_ns: u64, decode_ns: u64, total_... FILE: backends/client/src/v3/sharded_client.rs type ShardedClient (line 18) | pub struct ShardedClient { method new (line 23) | fn new(clients: Vec) -> Self { method from_master_client (line 29) | async fn from_master_client(mut master_client: Client) -> Result { method connect (line 38) | pub async fn connect(uri: Uri) -> Result { method connect_uds (line 44) | pub async fn connect_uds(path: String) -> Result { method info (line 51) | pub async fn info(&mut self) -> Result { method health (line 62) | pub async fn health(&mut self) -> Result { method clear_cache (line 73) | pub async fn clear_cache(&mut self, batch_id: Option) -> Result<(... method filter_batch (line 84) | pub async fn filter_batch( method warmup (line 102) | pub async fn warmup( method prefill (line 142) | pub async fn prefill( method decode (line 176) | pub async fn decode( method from (line 206) | fn from(value: InfoResponse) -> Self { method device_health (line 219) | async fn device_health(&self) -> Result<()> { method model_health (line 224) | async fn model_health(&self) -> Result<()> { FILE: backends/gaudi/server/text_generation_server/adapters/config.py class ModuleMap (line 15) | class ModuleMap: class AdapterConfig (line 21) | class AdapterConfig(ABC): method map_weights_for_model (line 25) | def map_weights_for_model( FILE: backends/gaudi/server/text_generation_server/adapters/lora.py function get_start_stop_idxs_for_rank (line 30) | def get_start_stop_idxs_for_rank(offset, size, rank, world_size): function shard_on_dim (line 37) | def shard_on_dim( function shard_lora_weights (line 56) | def shard_lora_weights( class LoraConfig (line 74) | class LoraConfig(AdapterConfig): method map_weights_for_model (line 81) | def map_weights_for_model( method load (line 103) | def load(cls, adapter_id: str, api_token: str) -> "LoraConfig": class LoraWeights (line 117) | class LoraWeights(AdapterWeights): method __init__ (line 120) | def __init__( method weights_a (line 142) | def weights_a(self) -> torch.Tensor: method weights_b (line 148) | def weights_b(self) -> torch.Tensor: method weights_a_t (line 154) | def weights_a_t(self) -> torch.Tensor: method weights_b_t (line 160) | def weights_b_t(self) -> torch.Tensor: method _transpose_weights (line 165) | def _transpose_weights(self): method get_batch_types (line 173) | def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]: method prepare_weights (line 190) | def prepare_weights( class RankSegments (line 256) | class RankSegments: class BatchLoraWeights (line 273) | class BatchLoraWeights(BatchAdapterWeights): method has_adapter (line 280) | def has_adapter(self, adapter_index: int) -> bool: method can_vectorize (line 283) | def can_vectorize(self, pg: ProcessGroup) -> bool: method load (line 290) | def load( function get_scaling_factor (line 457) | def get_scaling_factor( function _convert_lora (line 468) | def _convert_lora(v: AdapterWeights) -> AdapterWeights: FILE: backends/gaudi/server/text_generation_server/adapters/weights.py class AdapterBatchMetadata (line 14) | class AdapterBatchMetadata: class AdapterWeights (line 30) | class AdapterWeights(ABC): method get_batch_types (line 32) | def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]: method speculative_tokens (line 36) | def speculative_tokens(self) -> int: class BatchAdapterWeights (line 40) | class BatchAdapterWeights(ABC): method has_adapter (line 42) | def has_adapter(self, adapter_index: int) -> bool: method load (line 46) | def load( class LayerAdapterWeights (line 56) | class LayerAdapterWeights: method __init__ (line 59) | def __init__(self): method add_adapter (line 62) | def add_adapter(self, adapter_idx: int, weights: AdapterWeights): method remove_adapter (line 65) | def remove_adapter(self, adapter_idx: int): method is_empty (line 70) | def is_empty(self) -> bool: method get_data (line 73) | def get_data( class AdapterBatchData (line 98) | class AdapterBatchData: method from_meta (line 107) | def from_meta( method ranks (line 122) | def ranks(self) -> Set[int]: method layer_names (line 134) | def layer_names(self) -> Set[str]: method adapter_keys (line 137) | def adapter_keys(self) -> Set[str]: method max_rank (line 144) | def max_rank(self) -> int: FILE: backends/gaudi/server/text_generation_server/cache.py class Cache (line 10) | class Cache: method __init__ (line 11) | def __init__(self): method pop (line 14) | def pop(self, batch_id: int) -> Optional[B]: method set (line 17) | def set(self, entry: B): method delete (line 21) | def delete(self, batch_id: int): method clear (line 28) | def clear(self): method __len__ (line 33) | def __len__(self): FILE: backends/gaudi/server/text_generation_server/cli.py class Quantization (line 16) | class Quantization(str, Enum): class Dtype (line 23) | class Dtype(str, Enum): class KVCacheDtype (line 28) | class KVCacheDtype(str, Enum): function serve (line 34) | def serve( function download_weights (line 132) | def download_weights( function quantize (line 336) | def quantize( FILE: backends/gaudi/server/text_generation_server/interceptor.py class ExceptionInterceptor (line 15) | class ExceptionInterceptor(AsyncServerInterceptor): method intercept (line 16) | async def intercept( FILE: backends/gaudi/server/text_generation_server/layers/attention/common.py class HPUPagedAttentionMetadata (line 11) | class HPUPagedAttentionMetadata: function subtuple (line 27) | def subtuple( function trim_attn_metadata (line 47) | def trim_attn_metadata(metadata: HPUPagedAttentionMetadata) -> object: class Seqlen (line 89) | class Seqlen: method __init__ (line 93) | def __init__( method clamp (line 99) | def clamp(self, max): method make_sliding_window_bias (line 103) | def make_sliding_window_bias( function _async_h2d_tensor_copy (line 146) | def _async_h2d_tensor_copy(source, device="hpu"): function trim_seqlen_metadata (line 157) | def trim_seqlen_metadata(metadata: Seqlen) -> object: FILE: backends/gaudi/server/text_generation_server/layers/attention/hpu.py class FP8Matmul (line 16) | class FP8Matmul(torch.nn.Module): method __init__ (line 18) | def __init__(self, scale_other): method quant_input (line 23) | def quant_input(self, x, scale): method matmul_fp8 (line 28) | def matmul_fp8( method forward (line 44) | def forward(self, input, other): class FetchFromCache (line 57) | class FetchFromCache(torch.nn.Module): method __init__ (line 59) | def __init__(self, scale_inv): method forward (line 63) | def forward(self, cache, blocks): function attention (line 73) | def attention( function set_block_mapping (line 110) | def set_block_mapping(hpu_attention_meta: HPUPagedAttentionMetadata, bat... function paged_attention (line 134) | def paged_attention( function paged_attention_mla (line 185) | def paged_attention_mla( FILE: backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py class KVScales (line 11) | class KVScales: method __post_init__ (line 26) | def __post_init__(self): class KVCache (line 34) | class KVCache: method __init__ (line 41) | def __init__( method dtype (line 69) | def dtype(self): method key (line 74) | def key(self): method value (line 80) | def value(self): method store (line 85) | def store( class KVCompressCache (line 110) | class KVCompressCache(KVCache): method __init__ (line 117) | def __init__( method dtype (line 137) | def dtype(self): method key (line 142) | def key(self): method value (line 148) | def value(self): method store (line 153) | def store( function paged_reshape_and_cache (line 170) | def paged_reshape_and_cache( function get_kv_scales (line 190) | def get_kv_scales(weights: Weights, prefix: str) -> KVScales: FILE: backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py function pack (line 9) | def pack(imatrix: torch.Tensor, direction: str = "column"): function unpack (line 35) | def unpack(qmatrix: torch.Tensor, direction: str = "column"): function apply_order (line 61) | def apply_order( function fast_awq_to_gptq (line 83) | def fast_awq_to_gptq(qweight, qzeros): FILE: backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py function error_raiser_hpu (line 12) | def error_raiser_hpu(*args, **kwargs): function unpack_awq (line 22) | def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): function reverse_awq_order (line 45) | def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits... function unpack_weight_and_zeros (line 62) | def unpack_weight_and_zeros(qweight, qzeros, bits): function pack_tensor (line 75) | def pack_tensor(input, bits=4): class WQLinear (line 93) | class WQLinear(nn.Module): method __init__ (line 94) | def __init__( method _preprocessing (line 117) | def _preprocessing(self): method forward (line 126) | def forward(self, x): FILE: backends/gaudi/server/text_generation_server/layers/bnb.py class BNBWeight (line 10) | class BNBWeight(UnquantizedWeight): method get_linear (line 13) | def get_linear(self, bias: torch.Tensor): class Linear8bitLt (line 17) | class Linear8bitLt(torch.nn.Module): method __init__ (line 18) | def __init__( method init_8bit_state (line 49) | def init_8bit_state(self): method forward (line 55) | def forward(self, x: torch.Tensor): class BNBFP4Weight (line 76) | class BNBFP4Weight(UnquantizedWeight): method get_linear (line 79) | def get_linear(self, bias: torch.Tensor): class BNBNF4Weight (line 84) | class BNBNF4Weight(UnquantizedWeight): method get_linear (line 87) | def get_linear(self, bias: torch.Tensor): class Linear4bit (line 91) | class Linear4bit(torch.nn.Module): method __init__ (line 92) | def __init__(self, weight, bias, quant_type): method forward (line 104) | def forward(self, x: torch.Tensor): FILE: backends/gaudi/server/text_generation_server/layers/compressed_tensors/loader.py class CompressedTensorsLoader (line 29) | class CompressedTensorsLoader(WeightsLoader): method __init__ (line 32) | def __init__(self, config: Dict[str, Any]): method get_weights (line 69) | def get_weights(self, weights: Weights, prefix: str): method get_weights_col_packed (line 73) | def get_weights_col_packed( method get_multi_weights_col (line 82) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_multi_weights (line 86) | def get_multi_weights(self, weights: Weights, prefixes: List[str], dim... method get_weights_row (line 90) | def get_weights_row(self, weights: Weights, prefix: str): method _get_target_loaders (line 94) | def _get_target_loaders( method _create_loader_for_group (line 125) | def _create_loader_for_group( method _lookup_loader (line 154) | def _lookup_loader(self, prefix: str) -> WeightsLoader: FILE: backends/gaudi/server/text_generation_server/layers/compressed_tensors/w8an_fp.py class W8ANFpLoader (line 14) | class W8ANFpLoader(WeightsLoader): method __init__ (line 19) | def __init__( method __str__ (line 41) | def __str__(self) -> str: method get_weights (line 49) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 81) | def get_weights_col_packed( method get_multi_weights_col (line 130) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_multi_weights (line 177) | def get_multi_weights(self, weights: "Weights", prefixes: List[str], d... method get_weights_row (line 227) | def get_weights_row(self, weights: "Weights", prefix: str): FILE: backends/gaudi/server/text_generation_server/layers/conv.py function load_conv2d (line 6) | def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_... function load_conv2d_no_bias (line 23) | def load_conv2d_no_bias( FILE: backends/gaudi/server/text_generation_server/layers/exl2.py class Exl2Weight (line 9) | class Exl2Weight(Weight): method __post_init__ (line 20) | def __post_init__(self): method device (line 25) | def device(self) -> torch.device: method get_linear (line 28) | def get_linear(self, bias: torch.Tensor): class Exl2WeightsLoader (line 34) | class Exl2WeightsLoader(WeightsLoader): method get_weights (line 37) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 61) | def get_weights_col_packed( method get_weights_col (line 69) | def get_weights_col(self, weights: Weights, prefix: str): method get_multi_weights_col (line 73) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_weights_row (line 76) | def get_weights_row(self, weights: Weights, prefix: str): FILE: backends/gaudi/server/text_generation_server/layers/fp8.py function pad_weight (line 22) | def pad_weight(weight, block_size): function unpad_weight (line 37) | def unpad_weight(weight, original_M, original_N, keep_first_dim=False): function pad_block_fp8_weight_naive (line 47) | def pad_block_fp8_weight_naive(weight, weight_scale, block_size): function dynamic_quant (line 63) | def dynamic_quant(data, single_scale=False): function dequant_block_fp8_weight_naive (line 75) | def dequant_block_fp8_weight_naive( function apply_block_fp8_linear_hpu_dynamic (line 132) | def apply_block_fp8_linear_hpu_dynamic( function get_fp8_linear (line 162) | def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]: function normalize_e4m3fn_to_native_float8 (line 170) | def normalize_e4m3fn_to_native_float8( function per_tensor_dequantize (line 178) | def per_tensor_dequantize( function requantize_with_max_scale (line 194) | def requantize_with_max_scale( function fp8_quantize (line 220) | def fp8_quantize( class HybridFP8UnquantLoader (line 245) | class HybridFP8UnquantLoader(WeightsLoader): method __init__ (line 248) | def __init__( method get_weights (line 258) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 299) | def get_weights_col_packed( method get_multi_weights_col (line 352) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_multi_weights (line 414) | def get_multi_weights(self, weights: "Weights", prefixes: List[str], d... method get_weights_row (line 476) | def get_weights_row(self, weights: "Weights", prefix: str): class Fp8Weight (line 524) | class Fp8Weight(Weight): method get_linear (line 533) | def get_linear(self, bias: torch.Tensor): class Fp8Linear (line 552) | class Fp8Linear(torch.nn.Module): method __init__ (line 555) | def __init__( method from_unquant (line 577) | def from_unquant(cls, weight, bias, dtype): method from_fp8 (line 589) | def from_fp8( method forward (line 627) | def forward(self, input: torch.Tensor) -> torch.Tensor: function _load_scalar_or_matrix_scale (line 650) | def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: t... FILE: backends/gaudi/server/text_generation_server/layers/gptq/__init__.py class GPTQWeight (line 19) | class GPTQWeight(Weight): method __post_init__ (line 29) | def __post_init__(self): method device (line 34) | def device(self) -> torch.device: method get_linear (line 37) | def get_linear(self, bias: torch.Tensor): class GPTQWeightsLoader (line 66) | class GPTQWeightsLoader(WeightsLoader): method __init__ (line 71) | def __init__( method is_layer_skipped_quantization (line 90) | def is_layer_skipped_quantization( method get_weights (line 95) | def get_weights(self, weights: Weights, prefix: str): method get_weights_col_packed (line 157) | def get_weights_col_packed( method get_multi_weights_col (line 217) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_multi_weights (line 279) | def get_multi_weights(self, weights: Weights, prefixes: List[str], dim... method get_weights_row (line 336) | def get_weights_row(self, weights: Weights, prefix: str): method _get_gptq_params (line 426) | def _get_gptq_params(self, weights: Weights): FILE: backends/gaudi/server/text_generation_server/layers/gptq/hpu.py function error_raiser_hpu (line 12) | def error_raiser_hpu(*args, **kwargs): function pack_tensor (line 20) | def pack_tensor(input, bits=4): class QuantLinear (line 34) | class QuantLinear(nn.Module): method __init__ (line 35) | def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsi... method unpack_zeros_from_cuda_old_format (line 58) | def unpack_zeros_from_cuda_old_format(self): method unpack_weight_from_cuda_old_format (line 71) | def unpack_weight_from_cuda_old_format(self): method _preprocessing (line 80) | def _preprocessing(self): method new (line 119) | def new(cls, bits, groupsize, infeatures, outfeatures, bias): method pack (line 140) | def pack(self, linear, scales, zeros, g_idx=None): method forward (line 197) | def forward(self, x): FILE: backends/gaudi/server/text_generation_server/layers/gptq/quantize.py class Quantizer (line 25) | class Quantizer(nn.Module): method __init__ (line 26) | def __init__(self, shape=1): method configure (line 32) | def configure( method _quantize (line 54) | def _quantize(self, x, scale, zero, maxq): method find_params (line 60) | def find_params(self, x, weight=False): method quantize (line 145) | def quantize(self, x): method enabled (line 151) | def enabled(self): method ready (line 154) | def ready(self): class GPTQ (line 158) | class GPTQ: method __init__ (line 159) | def __init__(self, layer, observe=False): method add_batch (line 174) | def add_batch(self, inp, out): method print_loss (line 209) | def print_loss(self, name, q_weight, weight_error, timecost): method fasterquant (line 243) | def fasterquant( method free (line 357) | def free(self): function get_wikitext2 (line 366) | def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code): function get_ptb (line 398) | def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code): function get_c4 (line 430) | def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code): function get_ptb_new (line 498) | def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code): function get_c4_new (line 530) | def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code): function get_loaders (line 584) | def get_loaders( function find_layers (line 599) | def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""): function sequential (line 615) | def sequential( function make_quant_linear (line 754) | def make_quant_linear(module, names, bits, groupsize, name=""): function pack (line 780) | def pack(model, quantizers, bits, groupsize): function setdeepattr (line 794) | def setdeepattr(module, full_name, tensor): function getdeepattr (line 802) | def getdeepattr(module, full_name): function load_weights_pre_hook (line 810) | def load_weights_pre_hook(module_name, weights, recursive=False): function load_weights_post_hook (line 842) | def load_weights_post_hook(module_name, weights, recursive=False): function quantize (line 867) | def quantize( FILE: backends/gaudi/server/text_generation_server/layers/gptq/utils.py function torch_snr_error (line 5) | def torch_snr_error( FILE: backends/gaudi/server/text_generation_server/layers/layernorm.py function load_layer_norm (line 8) | def load_layer_norm(cls, prefix, weights, eps): function load_layer_norm_no_bias (line 20) | def load_layer_norm_no_bias(cls, prefix, weights, eps): class FastLayerNorm (line 34) | class FastLayerNorm(nn.LayerNorm): method forward (line 35) | def forward(self, hidden_states, residual=None): class FastRMSNorm (line 43) | class FastRMSNorm(nn.Module): method __init__ (line 44) | def __init__(self, weight: torch.Tensor, eps: float): method load (line 51) | def load(cls, prefix, weights, eps=1e-6): method forward (line 55) | def forward(self, hidden_states, residual=None): FILE: backends/gaudi/server/text_generation_server/layers/linear.py class FastLinear (line 5) | class FastLinear(torch.nn.Module): method __init__ (line 6) | def __init__( method load (line 19) | def load(cls, config, prefix: str, weights, bias: bool): method forward (line 27) | def forward(self, input: torch.Tensor) -> torch.Tensor: function get_linear (line 31) | def get_linear(weight, bias): FILE: backends/gaudi/server/text_generation_server/layers/lora.py class LoraLinear (line 22) | class LoraLinear(nn.Module): method __init__ (line 23) | def __init__( method forward_layer_type (line 31) | def forward_layer_type( method forward_lora (line 135) | def forward_lora( method collect_lora_a (line 154) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: class TensorParallelMultiAdapterLinear (line 158) | class TensorParallelMultiAdapterLinear(LoraLinear): method __init__ (line 159) | def __init__( method load (line 172) | def load( method forward (line 184) | def forward( method collect_lora_a (line 227) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: class TensorParallelAdapterRowLinear (line 242) | class TensorParallelAdapterRowLinear(LoraLinear): method __init__ (line 243) | def __init__(self, base_layer, layer_id, layer_name, process_group): method load (line 248) | def load(cls, base_layer, layer_id, layer_name, process_group): method forward (line 251) | def forward( method collect_lora_a (line 270) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: FILE: backends/gaudi/server/text_generation_server/layers/medusa.py class ResBlock (line 12) | class ResBlock(torch.nn.Module): method __init__ (line 13) | def __init__(self, config, prefix, weights): method forward (line 20) | def forward(self, x): class MedusaModel (line 24) | class MedusaModel(torch.nn.Module): method __init__ (line 25) | def __init__(self, config, medusa_config, weights): method forward (line 34) | def forward(self, x): class MedusaHead (line 41) | class MedusaHead(torch.nn.Module): method __init__ (line 42) | def __init__(self, config, medusa_config, prefix, weights): method forward (line 55) | def forward(self, x): class MedusaHeadV1 (line 62) | class MedusaHeadV1(nn.Module): method __init__ (line 63) | def __init__(self, lm_head, medusa): method load (line 69) | def load(config, prefix: str, weights): method forward (line 97) | def forward( class MedusaHeadV2 (line 109) | class MedusaHeadV2(nn.Module): method __init__ (line 110) | def __init__(self, config, prefix, weights): method forward (line 150) | def forward(self, x): FILE: backends/gaudi/server/text_generation_server/layers/mlp.py class MLPSpeculatorLayerNorm (line 11) | class MLPSpeculatorLayerNorm(nn.Module): method __init__ (line 27) | def __init__( method forward (line 39) | def forward(self, x): function simple_norm (line 51) | def simple_norm(x: torch.Tensor, eps=1e-06): class MLPSpeculatorModelTied (line 58) | class MLPSpeculatorModelTied(torch.nn.Module): method __init__ (line 59) | def __init__(self, config, prefix, weights): method forward (line 96) | def forward( class MLPSpeculatorModel (line 142) | class MLPSpeculatorModel(torch.nn.Module): method __init__ (line 143) | def __init__(self, config, prefix, weights): method forward (line 192) | def forward( class MLPSpeculatorHead (line 235) | class MLPSpeculatorHead(nn.Module): method __init__ (line 236) | def __init__(self, lm_head, mlp_speculator, scale_input: bool): method forward (line 242) | def forward( method load (line 257) | def load(config, prefix: str, weights): FILE: backends/gaudi/server/text_generation_server/layers/moe/__init__.py class MoELayer (line 30) | class MoELayer(Protocol): method __init__ (line 31) | def __init__( method forward (line 49) | def forward( class DenseMoELayer (line 54) | class DenseMoELayer(nn.Module): method __init__ (line 62) | def __init__( method forward (line 143) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... class SparseMoELayer (line 182) | class SparseMoELayer(nn.Module): method __init__ (line 189) | def __init__( method forward (line 242) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... method is_supported (line 246) | def is_supported(weights: Weights) -> bool: FILE: backends/gaudi/server/text_generation_server/layers/moe/fp8.py class FP8SparseMoELayer (line 20) | class FP8SparseMoELayer(nn.Module): method __init__ (line 21) | def __init__( method forward (line 105) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... function _load_expert_weights (line 168) | def _load_expert_weights( function _load_expert_multi_weights_col (line 218) | def _load_expert_multi_weights_col( function _load_expert_weights_row (line 248) | def _load_expert_weights_row( FILE: backends/gaudi/server/text_generation_server/layers/moe/fused_moe.py function grouped_topk (line 21) | def grouped_topk( function fused_topk (line 83) | def fused_topk( function select_experts (line 98) | def select_experts( FILE: backends/gaudi/server/text_generation_server/layers/moe/unquantized.py class UnquantizedSparseMoELayer (line 13) | class UnquantizedSparseMoELayer(nn.Module): method __init__ (line 14) | def __init__( method forward (line 83) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... function _load_expert_multi_weights_col (line 103) | def _load_expert_multi_weights_col( function _load_expert_weights_row (line 144) | def _load_expert_weights_row( FILE: backends/gaudi/server/text_generation_server/layers/rotary.py function _create_inv_freq (line 11) | def _create_inv_freq(dim, base, device): function _get_rope_config (line 18) | def _get_rope_config(config): class PositionRotaryEmbedding (line 28) | class PositionRotaryEmbedding(nn.Module): method __init__ (line 29) | def __init__(self, inv_freq, scaling_factor, max_position_embeddings): method forward (line 43) | def forward( method static (line 76) | def static(cls, config, dim, base, device): method load (line 208) | def load(cls, config, prefix, weights): method _update_cos_sin_cache (line 253) | def _update_cos_sin_cache(self, dtype, device, seqlen): method get_cos_sin (line 272) | def get_cos_sin(self, position_ids: torch.Tensor): class SuRotaryEmbedding (line 281) | class SuRotaryEmbedding(PositionRotaryEmbedding): method __init__ (line 282) | def __init__( method _update_cos_sin_cache (line 305) | def _update_cos_sin_cache(self, dtype, device, seqlen): class Phi3LongRoPEScaledRotaryEmbedding (line 332) | class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding): method __init__ (line 333) | def __init__( method _update_cos_sin_cache (line 361) | def _update_cos_sin_cache(self, dtype, device, seqlen): class DynamicPositionRotaryEmbedding (line 392) | class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): method __init__ (line 393) | def __init__(self, dim, max_position_embeddings, base, device, scaling... method _update_cos_sin_cache (line 400) | def _update_cos_sin_cache(self, dtype, device, seqlen): function find_correction_dim (line 426) | def find_correction_dim(num_rotations, dim, base=10000, max_position_emb... function find_correction_range (line 433) | def find_correction_range( function linear_ramp_mask (line 441) | def linear_ramp_mask(min, max, dim): function get_mscale (line 450) | def get_mscale(scale: float = 1.0, mscale: float = 1.0): class YarnPositionRotaryEmbedding (line 456) | class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): method __init__ (line 457) | def __init__( method _update_cos_sin_cache (line 489) | def _update_cos_sin_cache(self, dtype, device, seqlen): function apply_llama3_scaling (line 531) | def apply_llama3_scaling( class RotaryPositionEmbeddingMultimodalSections (line 560) | class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding): method __init__ (line 561) | def __init__( method _update_cos_sin_cache (line 579) | def _update_cos_sin_cache( method get_cos_sin (line 596) | def get_cos_sin( FILE: backends/gaudi/server/text_generation_server/layers/speculative.py class SpeculativeHead (line 9) | class SpeculativeHead(torch.nn.Module): method __init__ (line 10) | def __init__(self, lm_head, speculator): method load (line 16) | def load(config, prefix: str, weights): method forward (line 44) | def forward( FILE: backends/gaudi/server/text_generation_server/layers/tensor_parallel.py class LayerConcat (line 9) | class LayerConcat(torch.nn.Module): method __init__ (line 15) | def __init__(self, layers: Iterable[torch.nn.Module], dim: int = -1): method forward (line 23) | def forward(self, x: torch.Tensor): class SuperLayer (line 28) | class SuperLayer(torch.nn.Module): method __init__ (line 29) | def __init__(self, linear): method forward (line 33) | def forward(self, x): class TensorParallelHead (line 37) | class TensorParallelHead(SuperLayer): method __init__ (line 38) | def __init__(self, linear, process_group, should_gather: bool): method load (line 44) | def load(config, prefix: str, weights): method forward (line 73) | def forward(self, input: torch.Tensor) -> torch.Tensor: class TensorParallelColumnLinear (line 111) | class TensorParallelColumnLinear(SuperLayer): method load_gate_up (line 113) | def load_gate_up(cls, config, prefix: str, weights, bias: bool): method load_qkv (line 124) | def load_qkv( method load (line 147) | def load(cls, config, prefix: str, weights, bias: bool): method load_multi (line 157) | def load_multi(cls, config, prefixes: List[str], weights, bias: bool, ... class TensorParallelRowLinear (line 176) | class TensorParallelRowLinear(SuperLayer): method __init__ (line 177) | def __init__(self, linear, process_group): method load (line 182) | def load(cls, config, prefix: str, weights, bias: bool): method forward (line 195) | def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.T... class TensorParallelEmbedding (line 206) | class TensorParallelEmbedding(torch.nn.Module): method __init__ (line 207) | def __init__(self, prefix: str, weights, reduce=True): method forward (line 229) | def forward(self, input: torch.Tensor) -> torch.Tensor: FILE: backends/gaudi/server/text_generation_server/models/__init__.py class ModelType (line 163) | class ModelType(enum.Enum): function get_model (line 359) | def get_model( function get_model_with_lora_adapters (line 949) | def get_model_with_lora_adapters( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py function _make_causal_mask (line 68) | def _make_causal_mask( function _expand_mask (line 88) | def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor: function build_alibi_tensor (line 99) | def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int) -> ... function dropout_add (line 156) | def dropout_add( function _split_heads (line 178) | def _split_heads( function _merge_heads (line 210) | def _merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torc... class BloomAttention (line 236) | class BloomAttention(nn.Module): method __init__ (line 237) | def __init__(self, prefix, config: BloomConfig, weights): method compute_attention (line 280) | def compute_attention( method forward (line 357) | def forward( class BloomMLP (line 435) | class BloomMLP(nn.Module): method __init__ (line 436) | def __init__(self, prefix, config: BloomConfig, weights): method forward (line 450) | def forward( class BloomBlock (line 474) | class BloomBlock(nn.Module): method __init__ (line 475) | def __init__(self, layer_id: int, config: BloomConfig, weights): method forward (line 500) | def forward( class BloomPreTrainedModel (line 556) | class BloomPreTrainedModel(PreTrainedModel): method _convert_to_standard_cache (line 562) | def _convert_to_standard_cache( method _convert_to_bloom_cache (line 582) | def _convert_to_bloom_cache( class BloomModel (line 601) | class BloomModel(BloomPreTrainedModel): method __init__ (line 602) | def __init__(self, config: BloomConfig, weights): method _prepare_attn_mask (line 635) | def _prepare_attn_mask( method set_input_embeddings (line 664) | def set_input_embeddings(self, new_embeddings: torch.Tensor): method forward (line 667) | def forward( class BloomForCausalLM (line 818) | class BloomForCausalLM(BloomPreTrainedModel): method __init__ (line 819) | def __init__(self, prefix: str, config, weights): method prepare_inputs_for_generation (line 829) | def prepare_inputs_for_generation( method forward (line 860) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/clip.py class CLIPVisionEmbeddings (line 23) | class CLIPVisionEmbeddings(nn.Module): method __init__ (line 24) | def __init__(self, prefix, config: CLIPVisionConfig, weights): method forward (line 56) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class CLIPTextEmbeddings (line 70) | class CLIPTextEmbeddings(nn.Module): method __init__ (line 71) | def __init__(self, config: CLIPTextConfig): method forward (line 87) | def forward( class CLIPAttention (line 109) | class CLIPAttention(nn.Module): method __init__ (line 112) | def __init__(self, prefix, config, weights): method _shape (line 142) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 149) | def forward( class CLIPMLP (line 234) | class CLIPMLP(nn.Module): method __init__ (line 235) | def __init__(self, prefix, config, weights): method forward (line 246) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class CLIPEncoderLayer (line 253) | class CLIPEncoderLayer(nn.Module): method __init__ (line 254) | def __init__(self, prefix, config: CLIPConfig, weights): method forward (line 268) | def forward( class CLIPPreTrainedModel (line 299) | class CLIPPreTrainedModel(nn.Module): class CLIPEncoder (line 386) | class CLIPEncoder(nn.Module): method __init__ (line 395) | def __init__(self, prefix, config: CLIPConfig, weights): method forward (line 407) | def forward( class CLIPTextTransformer (line 446) | class CLIPTextTransformer(nn.Module): method __init__ (line 447) | def __init__(self, prefix: str, config: CLIPTextConfig, weights=None): method forward (line 461) | def forward( class CLIPTextModel (line 533) | class CLIPTextModel(CLIPPreTrainedModel): method __init__ (line 538) | def __init__(self, prefix, config: CLIPTextConfig): method forward (line 544) | def forward( class CLIPVisionTransformer (line 575) | class CLIPVisionTransformer(nn.Module): method __init__ (line 576) | def __init__(self, prefix, config: CLIPVisionConfig, weights): method forward (line 591) | def forward( class CLIPVisionModel (line 619) | class CLIPVisionModel(CLIPPreTrainedModel): method __init__ (line 624) | def __init__(self, config: CLIPVisionConfig): method get_input_embeddings (line 630) | def get_input_embeddings(self) -> nn.Module: method forward (line 633) | def forward( class CLIPModel (line 665) | class CLIPModel(nn.Module): method __init__ (line 666) | def __init__(self, prefix, config: CLIPConfig, weights): method get_text_features (line 691) | def get_text_features( method get_image_features (line 724) | def get_image_features( method forward (line 760) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py class CohereRotary (line 58) | class CohereRotary(PositionRotaryEmbedding): method forward (line 59) | def forward( class CohereLayerNorm (line 88) | class CohereLayerNorm(nn.Module): method __init__ (line 89) | def __init__(self, prefix, weights, eps): method forward (line 97) | def forward(self, hidden_states): function load_attention (line 112) | def load_attention(config, prefix, weights): function _load_gqa (line 125) | def _load_gqa(config, prefix: str, weights): class FlashCohereAttention (line 157) | class FlashCohereAttention(torch.nn.Module): method __init__ (line 158) | def __init__( method forward (line 214) | def forward( class CohereMLP (line 283) | class CohereMLP(nn.Module): method __init__ (line 284) | def __init__(self, prefix, config, weights): method forward (line 315) | def forward(self, hidden_states): class FlashCohereLayer (line 323) | class FlashCohereLayer(nn.Module): method __init__ (line 324) | def __init__(self, prefix: str, layer_id, config, weights, rotary_emb): method forward (line 342) | def forward( class FlashCohereModel (line 377) | class FlashCohereModel(torch.nn.Module): method __init__ (line 378) | def __init__(self, prefix: str, config, weights): method forward (line 415) | def forward( class FlashCohereForCausalLM (line 459) | class FlashCohereForCausalLM(torch.nn.Module): method __init__ (line 460) | def __init__(self, prefix: str, config, weights): method forward (line 483) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py class DbrxAttentionConfig (line 51) | class DbrxAttentionConfig(PretrainedConfig): method __init__ (line 52) | def __init__( class DbrxFFNConfig (line 73) | class DbrxFFNConfig(PretrainedConfig): method __init__ (line 74) | def __init__( class DbrxConfig (line 108) | class DbrxConfig(PretrainedConfig): method __init__ (line 115) | def __init__( method num_key_value_heads (line 168) | def num_key_value_heads(self): function promote_scalar (line 174) | def promote_scalar(x: torch.Tensor) -> torch.Tensor: function load_attention (line 178) | def load_attention(config, prefix, weights): function _load_experts (line 189) | def _load_experts(config, prefix, weights): function _load_experts_quantized (line 220) | def _load_experts_quantized(config, prefix, weights, cls): class DbrxAttention (line 260) | class DbrxAttention(torch.nn.Module): method __init__ (line 261) | def __init__( method forward (line 302) | def forward( class DbrxNormAttentionNorm (line 363) | class DbrxNormAttentionNorm(nn.Module): method __init__ (line 364) | def __init__( method forward (line 387) | def forward( function select_experts (line 420) | def select_experts( function round_up (line 438) | def round_up(x: torch.Tensor, value: int): class BlockSparseMoE (line 442) | class BlockSparseMoE(nn.Module): method __init__ (line 443) | def __init__(self, prefix, config: DbrxConfig, weights): method forward (line 493) | def forward(self, x: torch.Tensor) -> torch.Tensor: class DenseMoE (line 505) | class DenseMoE(nn.Module): method __init__ (line 506) | def __init__(self, prefix, config: DbrxConfig, weights): method forward (line 556) | def forward(self, x: torch.Tensor) -> torch.Tensor: class DbrxLayer (line 603) | class DbrxLayer(nn.Module): method __init__ (line 604) | def __init__(self, prefix: str, layer_id, config, weights, rotary_emb): method forward (line 618) | def forward( class DbrxModel (line 648) | class DbrxModel(torch.nn.Module): method __init__ (line 649) | def __init__(self, prefix: str, config, weights): method forward (line 682) | def forward( class FlashDbrxForCausalLM (line 725) | class FlashDbrxForCausalLM(torch.nn.Module): method __init__ (line 726) | def __init__(self, prefix: str, config, weights): method forward (line 741) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py function get_and_maybe_dequant_weights (line 48) | def get_and_maybe_dequant_weights(layer: torch.nn.Module) -> torch.Tensor: class DeepseekV2Config (line 60) | class DeepseekV2Config(PretrainedConfig): method __init__ (line 61) | def __init__( class DeepseekV2Attention (line 166) | class DeepseekV2Attention(torch.nn.Module): method __init__ (line 167) | def __init__( method _q_proj_and_k_up_proj (line 277) | def _q_proj_and_k_up_proj(self, x): method _v_up_proj_and_o_proj (line 292) | def _v_up_proj_and_o_proj(self, x): method forward (line 301) | def forward( class DeepseekV2MLP (line 422) | class DeepseekV2MLP(nn.Module): method __init__ (line 423) | def __init__(self, prefix: str, config, weights, intermediate_size: int): method forward (line 453) | def forward(self, hidden_states: torch.Tensor, reduce: bool = True): class DeepseekV2MoE (line 461) | class DeepseekV2MoE(nn.Module): method __init__ (line 462) | def __init__( method forward (line 504) | def forward(self, x: torch.Tensor) -> torch.Tensor: class DeepseekV2Layer (line 524) | class DeepseekV2Layer(nn.Module): method __init__ (line 525) | def __init__(self, prefix, layer_id, config, weights, rotary_emb): method forward (line 564) | def forward( class DeepseekV2Model (line 600) | class DeepseekV2Model(torch.nn.Module): method __init__ (line 601) | def __init__(self, prefix: str, config, weights: Weights): method forward (line 634) | def forward( class FlashDeepseekV2ForCausalLM (line 678) | class FlashDeepseekV2ForCausalLM(torch.nn.Module): method __init__ (line 679) | def __init__(self, prefix: str, config, weights: Weights): method forward (line 691) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py function get_and_maybe_dequant_weights (line 48) | def get_and_maybe_dequant_weights(layer: torch.nn.Module) -> torch.Tensor: class DeepseekV3Config (line 60) | class DeepseekV3Config(PretrainedConfig): method __init__ (line 61) | def __init__( class DeepseekV3Attention (line 166) | class DeepseekV3Attention(torch.nn.Module): method __init__ (line 167) | def __init__( method _q_proj_and_k_up_proj (line 276) | def _q_proj_and_k_up_proj(self, x): method _v_up_proj_and_o_proj (line 291) | def _v_up_proj_and_o_proj(self, x): method forward (line 300) | def forward( class DeepseekV3MLP (line 421) | class DeepseekV3MLP(nn.Module): method __init__ (line 422) | def __init__(self, prefix: str, config, weights, intermediate_size: int): method forward (line 452) | def forward(self, hidden_states: torch.Tensor, reduce: bool = True): class DeepseekV3MoE (line 460) | class DeepseekV3MoE(nn.Module): method __init__ (line 461) | def __init__( method forward (line 512) | def forward(self, x: torch.Tensor) -> torch.Tensor: class DeepseekV3Layer (line 532) | class DeepseekV3Layer(nn.Module): method __init__ (line 533) | def __init__(self, prefix, layer_id, config, weights, rotary_emb): method forward (line 572) | def forward( class DeepseekV3Model (line 608) | class DeepseekV3Model(torch.nn.Module): method __init__ (line 609) | def __init__(self, prefix: str, config, weights: Weights): method forward (line 642) | def forward( class FlashDeepseekV3ForCausalLM (line 686) | class FlashDeepseekV3ForCausalLM(torch.nn.Module): method __init__ (line 687) | def __init__(self, prefix: str, config, weights: Weights): method forward (line 699) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py class Gemma2Config (line 53) | class Gemma2Config(PretrainedConfig): method __init__ (line 54) | def __init__( class Gemma2FastRMSNorm (line 109) | class Gemma2FastRMSNorm(FastRMSNorm): method load (line 111) | def load(cls, prefix: str, weights, eps=1e-6): method forward (line 121) | def forward(self, hidden_states, residual=None): function load_attention (line 132) | def load_attention(config, prefix: str, weights): function _load_gqa (line 145) | def _load_gqa(config, prefix: str, weights): class FlashGemma2Attention (line 167) | class FlashGemma2Attention(torch.nn.Module): method __init__ (line 168) | def __init__( method forward (line 234) | def forward( class Gemma2MLP (line 299) | class Gemma2MLP(nn.Module): method __init__ (line 300) | def __init__(self, prefix, config, weights, layer_id): method forward (line 349) | def forward(self, hidden_states, adapter_data): class FlashGemma2Layer (line 357) | class FlashGemma2Layer(nn.Module): method __init__ (line 358) | def __init__( method forward (line 401) | def forward( class FlashGemma2Model (line 441) | class FlashGemma2Model(torch.nn.Module): method __init__ (line 442) | def __init__(self, prefix: str, config, weights, causal: bool): method forward (line 477) | def forward( class FlashGemma2ForCausalLM (line 524) | class FlashGemma2ForCausalLM(torch.nn.Module): method __init__ (line 525) | def __init__(self, prefix: str, config, weights, *, causal: bool = True): method forward (line 554) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py class Gemma3FastRMSNorm (line 62) | class Gemma3FastRMSNorm(FastRMSNorm): method load (line 64) | def load(cls, prefix: str, weights, eps=1e-6): method forward (line 74) | def forward(self, hidden_states, residual=None): function load_attention (line 85) | def load_attention(config, prefix: str, weights): function _load_gqa (line 98) | def _load_gqa(config, prefix: str, weights): class FlashGemma3Attention (line 120) | class FlashGemma3Attention(torch.nn.Module): method __init__ (line 121) | def __init__( method forward (line 198) | def forward( class Gemma3MLP (line 275) | class Gemma3MLP(nn.Module): method __init__ (line 276) | def __init__(self, prefix, config, weights, layer_id): method forward (line 325) | def forward(self, hidden_states, adapter_data): class FlashGemma3Layer (line 333) | class FlashGemma3Layer(nn.Module): method __init__ (line 334) | def __init__( method forward (line 379) | def forward( class FlashGemma3Model (line 419) | class FlashGemma3Model(torch.nn.Module): method __init__ (line 420) | def __init__(self, prefix: str, config, weights, causal: bool): method forward (line 464) | def forward( class FlashGemma3ForCausalLM (line 514) | class FlashGemma3ForCausalLM(torch.nn.Module): method __init__ (line 515) | def __init__(self, prefix: str, config, weights, *, causal: bool = True): method forward (line 545) | def forward( class Gemma3MultimodalInputProjection (line 576) | class Gemma3MultimodalInputProjection(torch.nn.Module): method __init__ (line 577) | def __init__(self, prefix, config, weights): method forward (line 599) | def forward(self, vision_outputs: torch.Tensor): class Gemma3ForConditionalGeneration (line 620) | class Gemma3ForConditionalGeneration(nn.Module): method __init__ (line 621) | def __init__(self, prefix, config, weights): method get_vision_embeds (line 671) | def get_vision_embeds( method get_inputs_embeds (line 687) | def get_inputs_embeds( method forward (line 704) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py class GemmaConfig (line 51) | class GemmaConfig(PretrainedConfig): method __init__ (line 52) | def __init__( class GemmaFastRMSNorm (line 107) | class GemmaFastRMSNorm(FastRMSNorm): method load (line 109) | def load(cls, prefix: str, weights, eps=1e-6): method forward (line 119) | def forward(self, hidden_states, residual=None): function load_attention (line 130) | def load_attention(config, prefix: str, weights): function _load_gqa (line 143) | def _load_gqa(config, prefix: str, weights): class FlashGemmaAttention (line 165) | class FlashGemmaAttention(torch.nn.Module): method __init__ (line 166) | def __init__(self, prefix: str, config, weights, causal: bool, rotary_... method forward (line 198) | def forward( class GemmaMLP (line 257) | class GemmaMLP(nn.Module): method __init__ (line 258) | def __init__(self, prefix: str, config, weights): method forward (line 289) | def forward(self, hidden_states): class FlashGemmaLayer (line 295) | class FlashGemmaLayer(nn.Module): method __init__ (line 296) | def __init__(self, prefix: str, config, weights, causal: bool, rotary_... method forward (line 316) | def forward( class FlashGemmaModel (line 352) | class FlashGemmaModel(torch.nn.Module): method __init__ (line 353) | def __init__(self, prefix: str, config, weights, causal: bool): method forward (line 386) | def forward( class FlashGemmaForCausalLM (line 431) | class FlashGemmaForCausalLM(torch.nn.Module): method __init__ (line 432) | def __init__(self, prefix: str, config, weights, *, causal: bool = True): method forward (line 459) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py function load_qkv (line 45) | def load_qkv(config, prefix: str, weights, head_size, num_heads): function _load_qkv_gptq (line 56) | def _load_qkv_gptq(config, prefix: str, weights): function _load_qkv (line 87) | def _load_qkv(config, prefix: str, weights, head_size, num_heads): function load_row (line 134) | def load_row(config, prefix: str, weights, bias: bool): function load_col (line 153) | def load_col(config, prefix: str, weights, bias: bool): class FlashGPT2Attention (line 168) | class FlashGPT2Attention(torch.nn.Module): method __init__ (line 169) | def __init__( method forward (line 209) | def forward( class GPT2MLP (line 259) | class GPT2MLP(nn.Module): method __init__ (line 260) | def __init__(self, prefix: str, config, weights): method forward (line 290) | def forward(self, hidden_states): class FlashGPT2Layer (line 296) | class FlashGPT2Layer(nn.Module): method __init__ (line 297) | def __init__(self, prefix: str, config, weights): method forward (line 313) | def forward( class FlashGPT2Model (line 346) | class FlashGPT2Model(torch.nn.Module): method __init__ (line 347) | def __init__(self, prefix: str, config, weights): method forward (line 377) | def forward( class FlashGPT2ForCausalLM (line 416) | class FlashGPT2ForCausalLM(torch.nn.Module): method __init__ (line 417) | def __init__(self, prefix: str, config, weights): method forward (line 436) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py function load_attention (line 55) | def load_attention(config, prefix: str, weights): function load_row (line 65) | def load_row(config, prefix: str, weights, bias: bool): class GPTJRotary (line 78) | class GPTJRotary(PositionRotaryEmbedding): method forward (line 79) | def forward( class FlashGPTJAttention (line 107) | class FlashGPTJAttention(torch.nn.Module): method __init__ (line 108) | def __init__( method forward (line 149) | def forward( class GPTJMLP (line 209) | class GPTJMLP(nn.Module): method __init__ (line 210) | def __init__(self, prefix: str, config, weights): method forward (line 235) | def forward(self, hidden_states): class FlashGPTJLayer (line 241) | class FlashGPTJLayer(nn.Module): method __init__ (line 242) | def __init__(self, prefix: str, config, weights, rotary_emb): method forward (line 256) | def forward( class FlashGPTJModel (line 286) | class FlashGPTJModel(torch.nn.Module): method __init__ (line 287) | def __init__(self, prefix: str, config, weights): method forward (line 323) | def forward( class FlashGPTJForCausalLM (line 367) | class FlashGPTJForCausalLM(torch.nn.Module): method __init__ (line 368) | def __init__(self, prefix: str, config, weights): method forward (line 381) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama4_modeling.py function reshape_for_broadcast (line 55) | def reshape_for_broadcast(freqs: torch.Tensor, target): function apply_rotary_emb (line 61) | def apply_rotary_emb( function repeat_kv (line 94) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class Llama4TextExperts (line 108) | class Llama4TextExperts(nn.Module): method __init__ (line 109) | def __init__(self, prefix, config, weights): method forward (line 127) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Llama4TextMLP (line 156) | class Llama4TextMLP(nn.Module): method __init__ (line 157) | def __init__(self, prefix, config, weights): method forward (line 180) | def forward(self, x): class Llama4TextL2Norm (line 186) | class Llama4TextL2Norm(torch.nn.Module): method __init__ (line 187) | def __init__(self, eps: float = 1e-6): method _norm (line 191) | def _norm(self, x): method forward (line 194) | def forward(self, x): method extra_repr (line 197) | def extra_repr(self): class Llama4TextMoe (line 201) | class Llama4TextMoe(nn.Module): method __init__ (line 202) | def __init__( method forward (line 223) | def forward(self, hidden_states, adapter_data): class Llama4TextRotaryEmbedding (line 265) | class Llama4TextRotaryEmbedding(nn.Module): method __init__ (line 266) | def __init__(self, config, device=None): method forward (line 281) | def forward(self, x, position_ids): class Llama4TextAttention (line 302) | class Llama4TextAttention(FlashLlamaAttention): method __init__ (line 305) | def __init__(self, prefix, config, weights, layer_idx): method forward (line 325) | def forward( class Llama4TextDecoderLayer (line 435) | class Llama4TextDecoderLayer(nn.Module): method __init__ (line 436) | def __init__(self, prefix, config, weights, layer_idx): method forward (line 460) | def forward( class Llama4TextModel (line 507) | class Llama4TextModel(nn.Module): method __init__ (line 509) | def __init__(self, prefix, config, weights): method forward (line 540) | def forward( method _update_causal_mask (line 600) | def _update_causal_mask( method create_chunked_attention_mask (line 735) | def create_chunked_attention_mask( method _prepare_4d_causal_attention_mask_with_cache_position (line 761) | def _prepare_4d_causal_attention_mask_with_cache_position( class Llama4ForCausalLM (line 826) | class Llama4ForCausalLM(nn.Module): method __init__ (line 827) | def __init__(self, prefix, config, weights): method forward (line 839) | def forward( class Llama4VisionMLP2 (line 873) | class Llama4VisionMLP2(torch.nn.Module): method __init__ (line 874) | def __init__(self, prefix, config, weights): method forward (line 887) | def forward(self, hidden_states): class Llama4MultiModalProjector (line 897) | class Llama4MultiModalProjector(nn.Module): method __init__ (line 898) | def __init__(self, prefix, config, weights): method forward (line 904) | def forward(self, image_features): function pixel_shuffle (line 909) | def pixel_shuffle(input_tensor, shuffle_ratio): class Llama4VisionPixelShuffleMLP (line 932) | class Llama4VisionPixelShuffleMLP(nn.Module): method __init__ (line 933) | def __init__(self, prefix, config, weights): method forward (line 944) | def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor: function vision_reshape_for_broadcast (line 950) | def vision_reshape_for_broadcast(freqs_ci: torch.Tensor, query: torch.Te... class Llama4VisionAttention (line 956) | class Llama4VisionAttention(nn.Module): method __init__ (line 957) | def __init__(self, prefix, config, weights): method forward (line 981) | def forward( class Llama4VisionMLP (line 1027) | class Llama4VisionMLP(nn.Module): method __init__ (line 1028) | def __init__(self, prefix, config, weights): method forward (line 1039) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Llama4VisionEncoderLayer (line 1046) | class Llama4VisionEncoderLayer(nn.Module): method __init__ (line 1047) | def __init__(self, prefix, config, weights): method forward (line 1065) | def forward( class Llama4VisionEncoder (line 1093) | class Llama4VisionEncoder(nn.Module): method __init__ (line 1102) | def __init__(self, prefix, config, weights): method forward (line 1116) | def forward( class Llama4UnfoldConvolution (line 1135) | class Llama4UnfoldConvolution(nn.Module): method __init__ (line 1136) | def __init__(self, prefix, config, weights): method forward (line 1146) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Llama4VisionRotaryEmbedding (line 1153) | class Llama4VisionRotaryEmbedding(nn.Module): method __init__ (line 1154) | def __init__(self, config, weights): method forward (line 1192) | def forward(self, hidden_states): class Llama4VisionModel (line 1199) | class Llama4VisionModel(nn.Module): method __init__ (line 1201) | def __init__(self, prefix, config, weights): method forward (line 1243) | def forward( class Llama4ForConditionalGeneration (line 1298) | class Llama4ForConditionalGeneration(nn.Module): method __init__ (line 1300) | def __init__(self, prefix: str, config, weights): method get_image_features (line 1328) | def get_image_features( method get_vision_embeds (line 1359) | def get_vision_embeds( method get_inputs_embeds (line 1376) | def get_inputs_embeds( method forward (line 1411) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py function load_attention (line 64) | def load_attention(config, prefix: str, weights, layer_id): function no_fp8 (line 117) | def no_fp8(weights: Weights): class FlashLlamaAttention (line 129) | class FlashLlamaAttention(torch.nn.Module): method __init__ (line 130) | def __init__( method forward (line 189) | def forward( class Phi3MoE (line 250) | class Phi3MoE(nn.Module): method __init__ (line 251) | def __init__( method forward (line 274) | def forward(self, x, adapter_data) -> torch.Tensor: class LlamaMLP (line 286) | class LlamaMLP(nn.Module): method __init__ (line 287) | def __init__(self, prefix, config, weights, index): method forward (line 359) | def forward(self, hidden_states, adapter_data): class FlashLlamaLayer (line 367) | class FlashLlamaLayer(nn.Module): method __init__ (line 368) | def __init__(self, index, prefix, config, weights, rotary_emb): method forward (line 420) | def forward( class FlashLlamaModel (line 462) | class FlashLlamaModel(torch.nn.Module): method __init__ (line 463) | def __init__(self, prefix, config, weights): method forward (line 545) | def forward( class FlashLlamaForCausalLM (line 594) | class FlashLlamaForCausalLM(torch.nn.Module): method __init__ (line 595) | def __init__(self, prefix: str, config, weights, name=None): method forward (line 640) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py function get_anyres_image_grid_shape (line 37) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): function unpad_image (line 60) | def unpad_image(tensor, original_size): class LlavaNextMultiModalProjector (line 94) | class LlavaNextMultiModalProjector(nn.Module): method __init__ (line 95) | def __init__(self, prefix, config, weights): method forward (line 106) | def forward(self, image_features): class FlashLlavaNextForConditionalGeneration (line 113) | class FlashLlavaNextForConditionalGeneration(nn.Module): method __init__ (line 114) | def __init__(self, prefix, config, weights): method _merge_input_ids_with_image_features (line 149) | def _merge_input_ids_with_image_features( method get_vision_embeds (line 166) | def get_vision_embeds( method get_inputs_embeds (line 254) | def get_inputs_embeds( method forward (line 271) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py class MistralConfig (line 52) | class MistralConfig(PretrainedConfig): method __init__ (line 55) | def __init__( class MistralAttention (line 106) | class MistralAttention(torch.nn.Module): method __init__ (line 107) | def __init__(self, prefix: str, config, weights, layer_id, rotary_emb): method forward (line 172) | def forward( class MistralMLP (line 235) | class MistralMLP(nn.Module): method __init__ (line 236) | def __init__(self, prefix: str, config, weights, layer_id): method forward (line 290) | def forward(self, hidden_states, adapter_data): class MistralLayer (line 298) | class MistralLayer(nn.Module): method __init__ (line 299) | def __init__(self, prefix: str, config, weights, layer_id, rotary_emb): method forward (line 321) | def forward( class MistralModel (line 359) | class MistralModel(torch.nn.Module): method __init__ (line 360) | def __init__(self, prefix: str, config, weights): method forward (line 401) | def forward( class FlashMistralForCausalLM (line 445) | class FlashMistralForCausalLM(torch.nn.Module): method __init__ (line 446) | def __init__(self, prefix: str, config, weights, name=None): method forward (line 478) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py class MixtralConfig (line 51) | class MixtralConfig(PretrainedConfig): method __init__ (line 54) | def __init__( function promote_scalar (line 109) | def promote_scalar(x: torch.Tensor) -> torch.Tensor: function load_attention (line 113) | def load_attention(config, prefix: str, weights): function _load_gqa (line 126) | def _load_gqa(config, prefix: str, weights): function _load_experts (line 149) | def _load_experts(config, prefix: str, mat, weights): class MixtralAttention (line 185) | class MixtralAttention(torch.nn.Module): method __init__ (line 186) | def __init__( method forward (line 228) | def forward( function select_experts (line 288) | def select_experts(gate_logits: torch.Tensor, top_k: int): function round_up (line 301) | def round_up(x: torch.Tensor, value: int): class MixtralMoE (line 305) | class MixtralMoE(nn.Module): method __init__ (line 306) | def __init__( method forward (line 330) | def forward(self, x: torch.Tensor) -> torch.Tensor: class MixtralLayer (line 342) | class MixtralLayer(nn.Module): method __init__ (line 343) | def __init__(self, prefix: str, layer_id, config, weights, rotary_emb): method forward (line 370) | def forward( class MixtralModel (line 406) | class MixtralModel(torch.nn.Module): method __init__ (line 407) | def __init__(self, prefix: str, config, weights): method forward (line 445) | def forward( class FlashMixtralForCausalLM (line 489) | class FlashMixtralForCausalLM(torch.nn.Module): method __init__ (line 490) | def __init__(self, prefix: str, config, weights): method forward (line 506) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py function _prepare_aspect_ratio_attention_mask (line 44) | def _prepare_aspect_ratio_attention_mask( function _prepare_4d_causal_attention_mask_with_cache_position (line 76) | def _prepare_4d_causal_attention_mask_with_cache_position( function _prepare_cross_attention_mask (line 140) | def _prepare_cross_attention_mask( class MllamaVisionMLP (line 173) | class MllamaVisionMLP(nn.Module): method __init__ (line 174) | def __init__(self, *, prefix, config, weights): method forward (line 185) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class MllamaVisionSdpaAttention (line 192) | class MllamaVisionSdpaAttention(nn.Module): method __init__ (line 193) | def __init__(self, *, prefix, config, weights): method forward (line 214) | def forward( class MllamaVisionEncoderLayer (line 260) | class MllamaVisionEncoderLayer(nn.Module): method __init__ (line 261) | def __init__(self, *, prefix, config, weights, is_gated: bool): method forward (line 292) | def forward( class MllamaVisionEncoder (line 313) | class MllamaVisionEncoder(nn.Module): method __init__ (line 314) | def __init__(self, *, prefix, config, weights, is_gated: bool, num_lay... method forward (line 327) | def forward( class MllamaPrecomputedAspectRatioEmbedding (line 350) | class MllamaPrecomputedAspectRatioEmbedding(nn.Module): method __init__ (line 351) | def __init__(self, *, prefix, config, weights): method forward (line 364) | def forward( class MllamaPrecomputedPositionEmbedding (line 377) | class MllamaPrecomputedPositionEmbedding(nn.Module): method __init__ (line 378) | def __init__(self, *, prefix, config, weights): method forward (line 399) | def forward( class MllamaVisionModel (line 419) | class MllamaVisionModel(nn.Module): method __init__ (line 420) | def __init__(self, *, prefix, config, weights): method apply_class_embedding (line 496) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T... method forward (line 502) | def forward( class MllamaTextCrossAttention (line 634) | class MllamaTextCrossAttention(nn.Module): method __init__ (line 637) | def __init__(self, *, prefix, config, weights, layer_idx): method forward (line 686) | def forward( class MllamaTextMLP (line 744) | class MllamaTextMLP(nn.Module): method __init__ (line 745) | def __init__(self, *, prefix, config, weights): method forward (line 767) | def forward(self, x): class FlashLlamaCrossLayer (line 777) | class FlashLlamaCrossLayer(torch.nn.Module): method __init__ (line 780) | def __init__(self, *, prefix, config, weights, index) -> None: method forward (line 808) | def forward( class MllamaTextRMSNorm (line 852) | class MllamaTextRMSNorm(nn.Module): method __init__ (line 853) | def __init__(self, weight, eps): method load (line 859) | def load(cls, *, prefix, weights, eps): method forward (line 865) | def forward(self, hidden_states): method extra_repr (line 872) | def extra_repr(self): class FlashMllamaForConditionalGeneration (line 876) | class FlashMllamaForConditionalGeneration(nn.Module): method __init__ (line 877) | def __init__(self, prefix, config, weights): method vision_forward (line 898) | def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_... method forward (line 916) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py class GPTNeoXConfig (line 54) | class GPTNeoXConfig(TransformersGPTNeoXConfig): function load_row (line 60) | def load_row(config, prefix: str, weights, bias: bool): function load_qkv (line 76) | def load_qkv(config, prefix: str, weights, num_heads, head_size, hidden_... class FlashNeoxAttention (line 101) | class FlashNeoxAttention(torch.nn.Module): method __init__ (line 102) | def __init__(self, config, prefix, weights, rotary_emb): method forward (line 138) | def forward( class FlashMLP (line 197) | class FlashMLP(nn.Module): method __init__ (line 198) | def __init__(self, config, prefix, weights): method forward (line 219) | def forward(self, hidden_states): class FlashNeoXLayer (line 226) | class FlashNeoXLayer(nn.Module): method __init__ (line 227) | def __init__(self, layer_id, config, weights, rotary_emb): method forward (line 253) | def forward( class FlashGPTNeoXPreTrainedModel (line 311) | class FlashGPTNeoXPreTrainedModel(PreTrainedModel): class FlashGPTNeoXModel (line 318) | class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): method __init__ (line 319) | def __init__(self, prefix: str, config, weights): method forward (line 353) | def forward( class FlashGPTNeoXForCausalLM (line 397) | class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): method __init__ (line 398) | def __init__(self, prefix, config, weights): method forward (line 412) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py class PaliGemmaForConditionalGeneration (line 29) | class PaliGemmaForConditionalGeneration(nn.Module): method __init__ (line 30) | def __init__(self, prefix, config, weights): method get_vision_embeds (line 67) | def get_vision_embeds( method get_inputs_embeds (line 83) | def get_inputs_embeds( method forward (line 96) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py class PhiConfig (line 33) | class PhiConfig(PretrainedConfig): method __init__ (line 34) | def __init__( function load_attention (line 73) | def load_attention(config, prefix, weights): function _load_gqa (line 86) | def _load_gqa(config, prefix: str, weights): class FlashPhiAttention (line 110) | class FlashPhiAttention(torch.nn.Module): method __init__ (line 111) | def __init__( method forward (line 153) | def forward( class PhiMLP (line 221) | class PhiMLP(nn.Module): method __init__ (line 222) | def __init__(self, prefix, config, weights): method forward (line 250) | def forward(self, hidden_states): class FlashPhiLayer (line 256) | class FlashPhiLayer(nn.Module): method __init__ (line 257) | def __init__(self, prefix: str, layer_id, config, weights, rotary_emb): method forward (line 274) | def forward( class FlashPhiModel (line 306) | class FlashPhiModel(torch.nn.Module): method __init__ (line 307) | def __init__(self, prefix: str, config, weights): method forward (line 350) | def forward( class FlashPhiForCausalLM (line 394) | class FlashPhiForCausalLM(torch.nn.Module): method __init__ (line 395) | def __init__(self, prefix: str, config, weights): method forward (line 410) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py class PhiMoEConfig (line 29) | class PhiMoEConfig(PretrainedConfig): method __init__ (line 120) | def __init__( method _rope_scaling_validation (line 190) | def _rope_scaling_validation(self): FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py function load_attention (line 29) | def load_attention(config, prefix, weights): function _load_gqa (line 42) | def _load_gqa(config, prefix: str, weights): class Qwen2Attention (line 55) | class Qwen2Attention(torch.nn.Module): method __init__ (line 56) | def __init__( method forward (line 101) | def forward( class Qwen2MLP (line 161) | class Qwen2MLP(nn.Module): method __init__ (line 162) | def __init__(self, prefix, config, weights): method forward (line 193) | def forward(self, hidden_states): class Qwen2Layer (line 199) | class Qwen2Layer(nn.Module): method __init__ (line 200) | def __init__(self, prefix, layer_id, config, weights, rotary_emb): method forward (line 219) | def forward( class Qwen2Model (line 253) | class Qwen2Model(torch.nn.Module): method __init__ (line 254) | def __init__(self, prefix: str, config, weights): method forward (line 292) | def forward( class Qwen2ForCausalLM (line 336) | class Qwen2ForCausalLM(torch.nn.Module): method __init__ (line 337) | def __init__(self, prefix: str, config, weights): method forward (line 365) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_modeling.py class Qwen3Attention (line 41) | class Qwen3Attention(nn.Module): method __init__ (line 44) | def __init__(self, config, prefix, weights, layer_idx, rotary_emb): method forward (line 112) | def forward( class Qwen3DecoderLayer (line 177) | class Qwen3DecoderLayer(nn.Module): method __init__ (line 178) | def __init__(self, config, prefix, weights, layer_idx: int, rotary_emb): method forward (line 198) | def forward( class Qwen3Model (line 235) | class Qwen3Model(nn.Module): method __init__ (line 236) | def __init__(self, config, prefix: str, weights): method forward (line 267) | def forward( class Qwen3ForCausalLM (line 314) | class Qwen3ForCausalLM(nn.Module): method __init__ (line 316) | def __init__(self, prefix: str, config, weights): method forward (line 336) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py function rotate_half (line 47) | def rotate_half(x): function apply_rotary_pos_emb (line 54) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di... class Qwen3MoeAttention (line 81) | class Qwen3MoeAttention(nn.Module): method __init__ (line 84) | def __init__(self, config, prefix, weights, layer_idx, rotary_emb): method forward (line 143) | def forward( class Qwen3MoE (line 202) | class Qwen3MoE(nn.Module): method __init__ (line 203) | def __init__(self, prefix, config, moe_layer_cls: Type[MoELayer], weig... method forward (line 226) | def forward(self, x: torch.Tensor) -> torch.Tensor: class Qwen3MoeMLP (line 237) | class Qwen3MoeMLP(nn.Module): method __init__ (line 238) | def __init__(self, prefix, config, weights, intermediate_size=None): method forward (line 267) | def forward(self, x): class Qwen3MoeSparseMoeBlock (line 273) | class Qwen3MoeSparseMoeBlock(nn.Module): method __init__ (line 274) | def __init__(self, prefix, config, weights): method forward (line 295) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Qwen3MoeDecoderLayer (line 343) | class Qwen3MoeDecoderLayer(nn.Module): method __init__ (line 344) | def __init__(self, config, prefix, weights, layer_idx: int, rotary_emb): method forward (line 387) | def forward( class Qwen3MoeModel (line 428) | class Qwen3MoeModel(nn.Module): method __init__ (line 429) | def __init__(self, config, prefix: str, weights): method forward (line 460) | def forward( class Qwen3MoeForCausalLM (line 502) | class Qwen3MoeForCausalLM(nn.Module): method __init__ (line 504) | def __init__(self, prefix: str, config, weights): method forward (line 524) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py function load_row (line 28) | def load_row(config, prefix: str, weights, bias: bool): class RWConfig (line 44) | class RWConfig(PretrainedConfig): method __init__ (line 51) | def __init__( class FlashRWAttention (line 131) | class FlashRWAttention(torch.nn.Module): method __init__ (line 132) | def __init__( method forward (line 176) | def forward( class FlashRWLargeAttention (line 236) | class FlashRWLargeAttention(torch.nn.Module): method __init__ (line 237) | def __init__( method forward (line 290) | def forward( class FlashMLP (line 351) | class FlashMLP(nn.Module): method __init__ (line 352) | def __init__(self, config, prefix: str, weights): method forward (line 363) | def forward(self, hidden_states): class FlashRWLayer (line 370) | class FlashRWLayer(nn.Module): method __init__ (line 371) | def __init__( method forward (line 420) | def forward( class FlashRWLayerNorm (line 477) | class FlashRWLayerNorm(nn.Module): method __init__ (line 478) | def __init__(self, config, prefix: str, weights): method forward (line 508) | def forward( class FlashRWLargeLayer (line 522) | class FlashRWLargeLayer(nn.Module): method __init__ (line 523) | def __init__(self, layer_id, prefix: str, config, weights, rotary_emb): method forward (line 541) | def forward( class FlashRWPreTrainedModel (line 579) | class FlashRWPreTrainedModel(PreTrainedModel): class FlashRWModel (line 583) | class FlashRWModel(FlashRWPreTrainedModel): method __init__ (line 584) | def __init__(self, prefix: str, config, weights): method forward (line 623) | def forward( class FlashRWForCausalLM (line 667) | class FlashRWForCausalLM(FlashRWPreTrainedModel): method __init__ (line 668) | def __init__(self, prefix: str, config, weights): method forward (line 680) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py function load_multi_mqa (line 30) | def load_multi_mqa( function _load_multi_mqa_gptq (line 43) | def _load_multi_mqa_gptq( function _load_multi_mqa (line 130) | def _load_multi_mqa( function load_col (line 200) | def load_col(config, prefix: str, weights, bias: bool): function load_row (line 213) | def load_row(config, prefix: str, weights, bias: bool): class FlashMQAttention (line 229) | class FlashMQAttention(torch.nn.Module): method __init__ (line 230) | def __init__(self, prefix, config, weights): method forward (line 265) | def forward( class MLP (line 319) | class MLP(nn.Module): method __init__ (line 320) | def __init__(self, prefix, config, weights): method forward (line 341) | def forward(self, hidden_states): class Block (line 348) | class Block(nn.Module): method __init__ (line 349) | def __init__(self, prefix: str, layer_id, config, weights): method forward (line 369) | def forward( class FlashSantacoderModel (line 396) | class FlashSantacoderModel(nn.Module): method __init__ (line 397) | def __init__(self, prefix: str, config, weights): method forward (line 431) | def forward( class FlashSantacoderForCausalLM (line 472) | class FlashSantacoderForCausalLM(nn.Module): method __init__ (line 473) | def __init__(self, prefix, config, weights): method forward (line 487) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py class Starcoder2Config (line 57) | class Starcoder2Config(PretrainedConfig): method __init__ (line 60) | def __init__( function load_attention (line 117) | def load_attention(config, prefix, weights, layer_id): function _load_gqa (line 144) | def _load_gqa(config, prefix: str, weights): class Starcoder2Attention (line 176) | class Starcoder2Attention(torch.nn.Module): method __init__ (line 177) | def __init__( method forward (line 228) | def forward( class Starcoder2MLP (line 291) | class Starcoder2MLP(nn.Module): method __init__ (line 292) | def __init__(self, prefix, config, weights, index): method forward (line 334) | def forward(self, hidden_states, adapter_data): class Starcoder2GatedMLP (line 340) | class Starcoder2GatedMLP(nn.Module): method __init__ (line 341) | def __init__(self, index, prefix, config, weights): method forward (line 390) | def forward(self, hidden_states, adapter_data): class Starcoder2Layer (line 409) | class Starcoder2Layer(nn.Module): method __init__ (line 410) | def __init__(self, layer_id, config, weights, rotary_emb): method forward (line 436) | def forward( class Starcoder2Model (line 474) | class Starcoder2Model(torch.nn.Module): method __init__ (line 475) | def __init__(self, prefix, config, weights): method forward (line 511) | def forward( class FlashStarcoder2ForCausalLM (line 557) | class FlashStarcoder2ForCausalLM(torch.nn.Module): method __init__ (line 558) | def __init__(self, prefix, config, weights): method forward (line 587) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py function repeat_kv (line 39) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class Idefics2VisionEmbeddings (line 53) | class Idefics2VisionEmbeddings(nn.Module): method __init__ (line 64) | def __init__(self, prefix, config, weights): method forward (line 91) | def forward( class Idefics2VisionAttention (line 134) | class Idefics2VisionAttention(nn.Module): method __init__ (line 135) | def __init__(self, prefix, config, weights): method forward (line 164) | def forward( class Idefics2VisionMLP (line 232) | class Idefics2VisionMLP(nn.Module): method __init__ (line 233) | def __init__(self, prefix, config, weights): method forward (line 244) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Idefics2EncoderLayer (line 251) | class Idefics2EncoderLayer(nn.Module): method __init__ (line 252) | def __init__(self, prefix, config, weights): method forward (line 269) | def forward( class Idefics2Encoder (line 291) | class Idefics2Encoder(nn.Module): method __init__ (line 292) | def __init__(self, prefix, config, weights): method forward (line 305) | def forward( class Idefics2VisionTransformer (line 319) | class Idefics2VisionTransformer(nn.Module): method __init__ (line 320) | def __init__(self, prefix, config, weights): method forward (line 335) | def forward( class Idefics2MLP (line 380) | class Idefics2MLP(nn.Module): method __init__ (line 381) | def __init__(self, prefix, config, weights): method forward (line 408) | def forward(self, hidden_states): class Idefics2RMSNorm (line 418) | class Idefics2RMSNorm(nn.Module): method __init__ (line 419) | def __init__(self, prefix, weights, eps): method forward (line 429) | def forward(self, hidden_states): class Idefics2PerceiverAttention (line 437) | class Idefics2PerceiverAttention(nn.Module): method __init__ (line 438) | def __init__(self, prefix, config, weights): method forward (line 472) | def forward( class Idefics2PerceiverLayer (line 544) | class Idefics2PerceiverLayer(nn.Module): method __init__ (line 545) | def __init__(self, prefix, config, weights): method forward (line 572) | def forward( class Idefics2PerceiverResampler (line 605) | class Idefics2PerceiverResampler(nn.Module): method __init__ (line 606) | def __init__(self, prefix, config, weights) -> None: method forward (line 632) | def forward( class Idefics2Connector (line 664) | class Idefics2Connector(nn.Module): method __init__ (line 665) | def __init__(self, prefix, config, weights): method forward (line 674) | def forward(self, image_hidden_states, attention_mask): class Idefics2ForConditionalGeneration (line 682) | class Idefics2ForConditionalGeneration(nn.Module): method __init__ (line 683) | def __init__(self, prefix, config, weights): method _merge_input_ids_with_image_features (line 723) | def _merge_input_ids_with_image_features( method get_vision_embeds (line 737) | def get_vision_embeds( method get_inputs_embeds (line 820) | def get_inputs_embeds( method forward (line 835) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py function repeat_kv (line 38) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class Idefics3VisionEmbeddings (line 52) | class Idefics3VisionEmbeddings(nn.Module): method __init__ (line 63) | def __init__(self, prefix, config, weights): method forward (line 90) | def forward( class Idefics3VisionAttention (line 133) | class Idefics3VisionAttention(nn.Module): method __init__ (line 134) | def __init__(self, prefix, config, weights): method forward (line 163) | def forward( class Idefics3VisionMLP (line 231) | class Idefics3VisionMLP(nn.Module): method __init__ (line 232) | def __init__(self, prefix, config, weights): method forward (line 243) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Idefics3EncoderLayer (line 250) | class Idefics3EncoderLayer(nn.Module): method __init__ (line 251) | def __init__(self, prefix, config, weights): method forward (line 268) | def forward( class Idefics3Encoder (line 290) | class Idefics3Encoder(nn.Module): method __init__ (line 291) | def __init__(self, prefix, config, weights): method forward (line 304) | def forward( class Idefics3VisionTransformer (line 318) | class Idefics3VisionTransformer(nn.Module): method __init__ (line 319) | def __init__(self, prefix, config, weights): method forward (line 334) | def forward( class Idefics3SimpleMLP (line 379) | class Idefics3SimpleMLP(nn.Module): method __init__ (line 380) | def __init__(self, prefix, config, weights): method forward (line 391) | def forward(self, x): class Idefics3Connector (line 395) | class Idefics3Connector(nn.Module): method __init__ (line 396) | def __init__(self, prefix, config, weights): method pixel_shuffle (line 401) | def pixel_shuffle(self, x, scale_factor=2): method forward (line 417) | def forward(self, image_hidden_states): class Idefics3ForConditionalGeneration (line 423) | class Idefics3ForConditionalGeneration(nn.Module): method __init__ (line 424) | def __init__(self, prefix, config, weights): method _merge_input_ids_with_image_features (line 466) | def _merge_input_ids_with_image_features( method get_vision_embeds (line 480) | def get_vision_embeds( method get_inputs_embeds (line 563) | def get_inputs_embeds( method forward (line 578) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py class InferenceParams (line 25) | class InferenceParams: class MambaConfig (line 36) | class MambaConfig(PretrainedConfig): method __init__ (line 37) | def __init__( class MambaBlock (line 71) | class MambaBlock(nn.Module): method __init__ (line 72) | def __init__(self, prefix, config, weights, layer_id): method forward (line 94) | def forward(self, hidden_states: torch.Tensor, inference_params=None): method step (line 140) | def step(self, hidden_states, conv_state, ssm_state): class ResidualBlock (line 170) | class ResidualBlock(nn.Module): method __init__ (line 171) | def __init__(self, prefix, config, weights, layer_id): method forward (line 180) | def forward( class MambaModel (line 195) | class MambaModel(nn.Module): method __init__ (line 196) | def __init__(self, config, weights): method forward (line 218) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py class Qwen2_5_VLVideosProcessorKwargs (line 68) | class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): class Qwen2_5_VLProcessorKwargs (line 72) | class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): class Qwen2_5_VLProcessor (line 82) | class Qwen2_5_VLProcessor(ProcessorMixin): method __init__ (line 102) | def __init__( method __call__ (line 117) | def __call__( method batch_decode (line 237) | def batch_decode(self, *args, **kwargs): method decode (line 244) | def decode(self, *args, **kwargs): method post_process_image_text_to_text (line 251) | def post_process_image_text_to_text(self, generated_outputs): method model_input_names (line 270) | def model_input_names(self): class Qwen2_5_VLVisionConfig (line 280) | class Qwen2_5_VLVisionConfig(PretrainedConfig): method __init__ (line 284) | def __init__( class Qwen2_5_VLConfig (line 320) | class Qwen2_5_VLConfig(PretrainedConfig): method __init__ (line 322) | def __init__( class Qwen2_5VLAttention (line 384) | class Qwen2_5VLAttention(nn.Module): method __init__ (line 385) | def __init__(self, *, prefix, config, weights): method forward (line 409) | def forward( class Qwen2_5VLVisionMLP (line 478) | class Qwen2_5VLVisionMLP(nn.Module): method __init__ (line 479) | def __init__(self, *, prefix, config, weights): method forward (line 497) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Qwen2_5VLVisionBlock (line 505) | class Qwen2_5VLVisionBlock(nn.Module): method __init__ (line 506) | def __init__(self, prefix, config, weights): method forward (line 529) | def forward(self, hidden_states, cu_seqlens, cos, sin, max_seqlen) -> ... class Qwen2_5VLPatchMerger (line 539) | class Qwen2_5VLPatchMerger(nn.Module): method __init__ (line 540) | def __init__(self, *, prefix, config, weights): method forward (line 555) | def forward(self, hidden_states) -> torch.Tensor: class Qwen2_5VisionModel (line 564) | class Qwen2_5VisionModel(nn.Module): method __init__ (line 565) | def __init__(self, *, prefix, config, weights): method apply_class_embedding (line 612) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T... method get_window_index (line 618) | def get_window_index(self, grid_thw): method forward (line 665) | def forward( class Qwen2_5VLForConditionalGeneration (line 774) | class Qwen2_5VLForConditionalGeneration(nn.Module): method __init__ (line 775) | def __init__(self, prefix, config, weights): method get_position_ids (line 824) | def get_position_ids( method get_vision_embeds (line 898) | def get_vision_embeds( method get_inputs_embeds (line 908) | def get_inputs_embeds( method forward (line 922) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_vl.py class Qwen2VLAttention (line 54) | class Qwen2VLAttention(nn.Module): method __init__ (line 55) | def __init__(self, *, prefix, config, weights): method forward (line 78) | def forward( class Qwen2VLVisionMLP (line 147) | class Qwen2VLVisionMLP(nn.Module): method __init__ (line 148) | def __init__(self, *, prefix, config, weights): method forward (line 158) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Qwen2VLVisionBlock (line 165) | class Qwen2VLVisionBlock(nn.Module): method __init__ (line 166) | def __init__(self, prefix, config, weights): method forward (line 189) | def forward(self, hidden_states, cu_seqlens, cos, sin, max_seqlen) -> ... class Qwen2VLPatchMerger (line 198) | class Qwen2VLPatchMerger(nn.Module): method __init__ (line 199) | def __init__(self, *, prefix, config, weights): method forward (line 214) | def forward(self, hidden_states) -> torch.Tensor: class Qwen2VisionModel (line 223) | class Qwen2VisionModel(nn.Module): method __init__ (line 224) | def __init__(self, *, prefix, config, weights): method apply_class_embedding (line 266) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T... method forward (line 272) | def forward( class Qwen2VLForConditionalGeneration (line 349) | class Qwen2VLForConditionalGeneration(nn.Module): method __init__ (line 350) | def __init__(self, prefix, config, weights): method get_position_ids (line 404) | def get_position_ids( method get_vision_embeds (line 478) | def get_vision_embeds( method get_inputs_embeds (line 488) | def get_inputs_embeds( method forward (line 502) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/siglip.py class SiglipVisionEmbeddings (line 21) | class SiglipVisionEmbeddings(nn.Module): method __init__ (line 22) | def __init__(self, prefix, config: SiglipVisionConfig, weights): method forward (line 52) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class SiglipAttention (line 62) | class SiglipAttention(nn.Module): method __init__ (line 65) | def __init__(self, prefix, config, weights): method _shape (line 95) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 102) | def forward( class SiglipMLP (line 163) | class SiglipMLP(nn.Module): method __init__ (line 164) | def __init__(self, prefix, config, weights): method forward (line 175) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class SiglipEncoderLayer (line 182) | class SiglipEncoderLayer(nn.Module): method __init__ (line 183) | def __init__(self, prefix, config: SiglipConfig, weights): method forward (line 197) | def forward( class SiglipMultiheadAttentionPoolingHead (line 216) | class SiglipMultiheadAttentionPoolingHead(nn.Module): method __init__ (line 219) | def __init__(self, prefix, config: SiglipVisionConfig, weights): method forward (line 229) | def forward(self, hidden_state): function _trunc_normal_ (line 242) | def _trunc_normal_(tensor, mean, std, a, b): function trunc_normal_tf_ (line 278) | def trunc_normal_tf_( function variance_scaling_ (line 308) | def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="no... function lecun_normal_ (line 333) | def lecun_normal_(tensor): function default_flax_embed_init (line 337) | def default_flax_embed_init(tensor): class SiglipEncoder (line 341) | class SiglipEncoder(nn.Module): method __init__ (line 350) | def __init__(self, prefix, config: SiglipConfig, weights): method forward (line 362) | def forward( class SiglipVisionTransformer (line 377) | class SiglipVisionTransformer(nn.Module): method __init__ (line 378) | def __init__(self, prefix, config: SiglipVisionConfig, weights): method forward (line 389) | def forward( FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py function load_text_model (line 1) | def load_text_model(prefix, config, weights, name=None): function load_vision_model (line 42) | def load_vision_model(prefix, config, weights): FILE: backends/gaudi/server/text_generation_server/models/flash_causal_lm.py function generate_block_metadata (line 84) | def generate_block_metadata( class FlashCausalLMBatch (line 171) | class FlashCausalLMBatch(Batch): method to_pb (line 261) | def to_pb(self) -> generate_pb2.CachedBatch: method batch_tokenized_inputs (line 275) | def batch_tokenized_inputs( method from_tokenized (line 295) | def from_tokenized( method from_pb (line 495) | def from_pb( method filter (line 507) | def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": method concatenate (line 699) | def concatenate( method prepare_for_decode (line 980) | def prepare_for_decode( method prepare_for_prefill (line 1097) | def prepare_for_prefill( method __len__ (line 1422) | def __len__(self): class FlashCausalLM (line 1438) | class FlashCausalLM(Model): method __init__ (line 1439) | def __init__( method batch_type (line 1592) | def batch_type(self) -> Type[FlashCausalLMBatch]: method max_past (line 1595) | def max_past(self) -> int: method init_kv_cache (line 1598) | def init_kv_cache( method warmup (line 1631) | def warmup( method log_warmup (line 1766) | def log_warmup(self, prefilling, i, max_i, batch_size, seq_len): method use_graphs (line 1782) | def use_graphs(self, prefill, seq_len, batch_size): method align_workers (line 1791) | def align_workers(self, value, op): method warmup_hpu_graph (line 1798) | def warmup_hpu_graph(self, batch): method warmup_prefill (line 1908) | def warmup_prefill( method warmup_decode (line 1964) | def warmup_decode(self, batch_size: int, block_num: int, batch: FlashC... method forward (line 2063) | def forward( method generate_token (line 2179) | def generate_token( FILE: backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py function prompt_split_image_llama4 (line 44) | def prompt_split_image_llama4(aspect_ratio, num_patches_per_chunk): function _prompt_split_image (line 72) | def _prompt_split_image( function get_anyres_image_grid_shape (line 101) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): function image_text_replacement (line 124) | def image_text_replacement(processor, image_input, config) -> str: function image_text_replacement_fixup (line 197) | def image_text_replacement_fixup(config, text: str) -> str: function preprocess_text (line 205) | def preprocess_text(config, text: str) -> str: function preprocess_image (line 211) | def preprocess_image(config, img): function get_unpadded_features (line 226) | def get_unpadded_features( function get_number_of_features (line 253) | def get_number_of_features(height: int, width: int, config) -> int: function scatter_image_embeds (line 280) | def scatter_image_embeds( function gather_image_embeds (line 294) | def gather_image_embeds( class ImagePositions (line 304) | class ImagePositions: class FlashVlmCausalLMBatch (line 312) | class FlashVlmCausalLMBatch(FlashCausalLMBatch): method concatenate (line 326) | def concatenate(cls, batches, padded_total_bs: int = 0): method filter (line 356) | def filter(self, request_ids: List[int]): method batch_tokenized_inputs (line 386) | def batch_tokenized_inputs( method get_image_positions (line 464) | def get_image_positions( method from_pb_processor (line 535) | def from_pb_processor( method prepare_for_prefill (line 558) | def prepare_for_prefill( method update_encoder_cache (line 628) | def update_encoder_cache(self, encoder_outputs, request_id, img_pos): method gather_vision_embeds (line 633) | def gather_vision_embeds(self): method free_encoder_cache (line 696) | def free_encoder_cache(self): class FlashVlmCausalLM (line 703) | class FlashVlmCausalLM(FlashCausalLM): method __init__ (line 704) | def __init__( method batch_type (line 736) | def batch_type(self) -> Type[FlashVlmCausalLMBatch]: method max_past (line 739) | def max_past(self) -> Optional[int]: method warmup_decode (line 742) | def warmup_decode( method warmup_hpu_graph (line 844) | def warmup_hpu_graph(self, batch: FlashVlmCausalLMBatch): method get_vision_embeds (line 908) | def get_vision_embeds( method get_inputs_embeds (line 923) | def get_inputs_embeds( method encode_images (line 933) | def encode_images(self, batch): method set_inputs_embeds (line 972) | def set_inputs_embeds(self, batch): method forward (line 986) | def forward( FILE: backends/gaudi/server/text_generation_server/models/globals.py function set_model_id (line 35) | def set_model_id(model_id: str): function set_adapter_to_index (line 45) | def set_adapter_to_index(adapter_to_index: Dict[str, int]): function get_adapter_to_index (line 50) | def get_adapter_to_index(): FILE: backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py class FlashMllamaCausalLMBatch (line 45) | class FlashMllamaCausalLMBatch(FlashVlmCausalLMBatch): method prepare_for_prefill (line 51) | def prepare_for_prefill( method concatenate (line 60) | def concatenate(cls, batches, padded_total_bs: int = 0): method filter (line 83) | def filter(self, request_ids: List[int]): method batch_tokenized_inputs (line 115) | def batch_tokenized_inputs( method from_pb_processor (line 181) | def from_pb_processor( function generate_cross_attention_states (line 225) | def generate_cross_attention_states( class FlashMllamaCausalLM (line 240) | class FlashMllamaCausalLM(FlashVlmCausalLM): method set_inputs_embeds (line 241) | def set_inputs_embeds(self, batch): method warmup_decode (line 245) | def warmup_decode( method warmup_prefill (line 316) | def warmup_prefill( method warmup_hpu_graph (line 378) | def warmup_hpu_graph(self, batch: FlashMllamaCausalLMBatch): method forward (line 489) | def forward( FILE: backends/gaudi/server/text_generation_server/models/model.py class Model (line 22) | class Model(ABC): method __init__ (line 23) | def __init__( method info (line 74) | def info(self) -> InfoResponse: method batch_type (line 89) | def batch_type(self) -> Type[B]: method generate_token (line 93) | def generate_token( method warmup (line 98) | def warmup( method decode_token (line 104) | def decode_token( method check_initialized (line 134) | def check_initialized(self): FILE: backends/gaudi/server/text_generation_server/models/seq2seq_lm.py class Seq2SeqLMBatch (line 35) | class Seq2SeqLMBatch(Batch): method to_pb (line 75) | def to_pb(self) -> generate_pb2.CachedBatch: method from_pb (line 85) | def from_pb( method filter (line 179) | def filter(self, request_ids: List[int]) -> Optional["Seq2SeqLMBatch"]: method concatenate (line 294) | def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBat... method __len__ (line 536) | def __len__(self): class Seq2SeqLM (line 540) | class Seq2SeqLM(Model): method __init__ (line 541) | def __init__( method fallback (line 609) | def fallback( method batch_type (line 671) | def batch_type(self) -> Type[Seq2SeqLMBatch]: method forward (line 674) | def forward( method generate_token (line 712) | def generate_token( FILE: backends/gaudi/server/text_generation_server/models/types.py class Batch (line 13) | class Batch(ABC): method to_pb (line 15) | def to_pb(self) -> generate_pb2.CachedBatch: method from_pb (line 20) | def from_pb( method filter (line 30) | def filter(self, request_ids: List[int]) -> "Batch": method concatenate (line 35) | def concatenate(cls, batches: List["Batch"]) -> "Batch": method __len__ (line 39) | def __len__(self): class GeneratedText (line 44) | class GeneratedText: method to_pb (line 50) | def to_pb(self) -> generate_pb2.GeneratedText: class Tokens (line 60) | class Tokens: method to_pb (line 66) | def to_pb(self) -> generate_pb2.Tokens: method __len__ (line 74) | def __len__(self): class Generation (line 79) | class Generation: method to_pb (line 87) | def to_pb(self) -> generate_pb2.Generation: FILE: backends/gaudi/server/text_generation_server/server.py class SignalHandler (line 34) | class SignalHandler: method __init__ (line 37) | def __init__(self): method exit_gracefully (line 41) | def exit_gracefully(self, signum, frame): class TextGenerationService (line 46) | class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServi... method __init__ (line 47) | def __init__( method Info (line 65) | async def Info(self, request, context): method Health (line 68) | async def Health(self, request, context): method ServiceDiscovery (line 73) | async def ServiceDiscovery(self, request, context): method ClearCache (line 76) | async def ClearCache(self, request, context): method FilterBatch (line 83) | async def FilterBatch(self, request, context): method Warmup (line 92) | async def Warmup(self, request, context): method Prefill (line 144) | async def Prefill(self, request, context): method Decode (line 173) | async def Decode(self, request, context): function serve (line 201) | def serve( FILE: backends/gaudi/server/text_generation_server/tracing.py class UDSOpenTelemetryAioServerInterceptor (line 16) | class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterce... method __init__ (line 17) | def __init__(self): method _start_span (line 20) | def _start_span(self, handler_call_details, context, set_status_on_exc... function setup_tracing (line 57) | def setup_tracing(otlp_service_name: str, otlp_endpoint: str): FILE: backends/gaudi/server/text_generation_server/utils/adapter.py class AdapterInfo (line 28) | class AdapterInfo: class AdapterParameters (line 35) | class AdapterParameters: class AdapterSource (line 44) | class AdapterSource: function parse_lora_adapters (line 50) | def parse_lora_adapters(lora_adapters: Optional[str]) -> List[AdapterInfo]: function load_and_merge_adapters (line 71) | def load_and_merge_adapters( class AdapterParametersContainer (line 99) | class AdapterParametersContainer: method __hash__ (line 103) | def __hash__(self) -> int: function _load_and_merge (line 108) | def _load_and_merge( function check_architectures (line 146) | def check_architectures( function load_module_map (line 185) | def load_module_map( function get_attn_weights (line 233) | def get_attn_weights(i, layer): function get_mlp_weights (line 256) | def get_mlp_weights(i, layer): function build_layer_weight_lookup (line 294) | def build_layer_weight_lookup(model): FILE: backends/gaudi/server/text_generation_server/utils/chunks.py function concat_text_chunks (line 8) | def concat_text_chunks(chunks: Iterable[generate_pb2.InputChunk]) -> str: FILE: backends/gaudi/server/text_generation_server/utils/convert.py function _remove_duplicate_names (line 12) | def _remove_duplicate_names( function convert_file (line 62) | def convert_file(pt_file: Path, sf_file: Path, discard_names: List[str]): function convert_files (line 96) | def convert_files(pt_files: List[Path], sf_files: List[Path], discard_na... FILE: backends/gaudi/server/text_generation_server/utils/debug.py function to_gb_rounded (line 17) | def to_gb_rounded(mem: float) -> float: function count_hpu_graphs (line 30) | def count_hpu_graphs(): function dbg_trace (line 34) | def dbg_trace(tag, txt): FILE: backends/gaudi/server/text_generation_server/utils/dist.py class FakeBarrier (line 13) | class FakeBarrier: method wait (line 14) | def wait(self): class FakeGroup (line 18) | class FakeGroup(ProcessGroup): method __init__ (line 19) | def __init__(self, rank, size): method allreduce (line 24) | def allreduce(self, *args, **kwargs): method allgather (line 27) | def allgather(self, inputs, local_tensor, **kwargs): method barrier (line 35) | def barrier(self, *args, **kwargs): method size (line 38) | def size(self): method rank (line 41) | def rank(self): method _get_backend_name (line 44) | def _get_backend_name(self): function initialize_torch_distributed (line 48) | def initialize_torch_distributed(): FILE: backends/gaudi/server/text_generation_server/utils/hub.py function _cached_weight_files (line 21) | def _cached_weight_files( function _weight_hub_files_from_model_info (line 32) | def _weight_hub_files_from_model_info( function _weight_files_from_dir (line 46) | def _weight_files_from_dir(d: Path, extension: str) -> List[str]: function _get_cached_revision_directory (line 62) | def _get_cached_revision_directory( function weight_hub_files (line 97) | def weight_hub_files( function try_to_load_from_cache (line 119) | def try_to_load_from_cache( function weight_files (line 133) | def weight_files( function download_weights (line 188) | def download_weights( FILE: backends/gaudi/server/text_generation_server/utils/import_utils.py function get_hpu_free_memory (line 4) | def get_hpu_free_memory(device, memory_fraction): function synchronize_hpu (line 9) | def synchronize_hpu(device): function noop (line 13) | def noop(*args, **kwargs): FILE: backends/gaudi/server/text_generation_server/utils/kernels.py function load_kernel (line 9) | def load_kernel(*, module: str, repo_id: str): FILE: backends/gaudi/server/text_generation_server/utils/log.py function log_once (line 6) | def log_once(log, msg: str, master=True): function log_master (line 13) | def log_master(log, msg: str): FILE: backends/gaudi/server/text_generation_server/utils/logits_process.py class StaticWarper (line 26) | class StaticWarper: method __init__ (line 27) | def __init__( method __call__ (line 51) | def __call__(self, scores): function static_warper (line 76) | def static_warper( class HeterogeneousRepetitionPenaltyLogitsProcessor (line 87) | class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor): method __init__ (line 99) | def __init__(self, penalty: List[float], dtype: torch.dtype, device: t... method __call__ (line 105) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 116) | def filter(self, indices): class FrequencyPenaltyLogitsProcessor (line 124) | class FrequencyPenaltyLogitsProcessor(LogitsProcessor): method __init__ (line 133) | def __init__(self, penalty: float): method __call__ (line 136) | def __call__( class HeterogeneousFrequencyPenaltyLogitsProcessor (line 148) | class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor): method __init__ (line 158) | def __init__(self, penalty: List[float], dtype: torch.dtype, device: t... method __call__ (line 164) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 183) | def filter(self, indices): class HeterogeneousTemperatureLogitsWarper (line 191) | class HeterogeneousTemperatureLogitsWarper: method __init__ (line 202) | def __init__( method __call__ (line 210) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 214) | def filter(self, indices): class HeterogeneousTopPLogitsWarper (line 222) | class HeterogeneousTopPLogitsWarper(LogitsProcessor): method __init__ (line 238) | def __init__( method __call__ (line 253) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 273) | def filter(self, indices): class HeterogeneousTopKLogitsWarper (line 281) | class HeterogeneousTopKLogitsWarper(LogitsProcessor): method __init__ (line 296) | def __init__( method __call__ (line 324) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 345) | def filter(self, indices): class HeterogeneousTypicalLogitsWarper (line 362) | class HeterogeneousTypicalLogitsWarper(LogitsProcessor): method __init__ (line 378) | def __init__( method __call__ (line 400) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 436) | def filter(self, indices): class HeterogeneousProcessorWrapper (line 452) | class HeterogeneousProcessorWrapper(LogitsProcessor): method __init__ (line 460) | def __init__( method __call__ (line 466) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 471) | def filter(self, indices): class GrammarLogitProcessor (line 483) | class GrammarLogitProcessor(LogitsProcessor): method __init__ (line 487) | def __init__(self, tokenizer, device, grammar, grammar_type): method __call__ (line 494) | def __call__( method advance (line 507) | def advance(self, next_token_id, fsm_grammar_state): method _advance (line 513) | def _advance(next_token_id, fsm_grammar_state, fsm): method _cached_compile_fsm (line 521) | def _cached_compile_fsm(grammar_type, schema, tokenizer): method _cached_adapt_tokenizer (line 533) | def _cached_adapt_tokenizer(tokenizer): class HeterogeneousGrammarLogitProcessor (line 561) | class HeterogeneousGrammarLogitProcessor(LogitsProcessor): method __init__ (line 562) | def __init__(self, tokenizer, device, grammars, grammar_types): method __call__ (line 575) | def __call__( method advance_batch (line 590) | def advance_batch(self, next_token_ids, fsm_grammar_states): method advance_at_index (line 598) | def advance_at_index(self, next_token_id, fsm_grammar_state, index): method filter (line 605) | def filter(self, indices): FILE: backends/gaudi/server/text_generation_server/utils/merges/strategies.py class AdapterParameters (line 17) | class AdapterParameters: method __init__ (line 18) | def __init__( function _apply_weights (line 28) | def _apply_weights( class MergeStrategy (line 44) | class MergeStrategy(ABC): method merge (line 45) | def merge( class LinearMerge (line 51) | class LinearMerge(MergeStrategy): method __init__ (line 52) | def __init__(self, **kwargs): method merge (line 55) | def merge( class TiesMerge (line 62) | class TiesMerge(MergeStrategy): method __init__ (line 63) | def __init__(self, density: float, majority_sign_method: str = "total"... method merge (line 67) | def merge( class DareLinearMerge (line 86) | class DareLinearMerge(MergeStrategy): method __init__ (line 87) | def __init__(self, density: float, **kwargs): method merge (line 90) | def merge( class DareTiesMerge (line 102) | class DareTiesMerge(MergeStrategy): method __init__ (line 103) | def __init__(self, density: float, majority_sign_method: str = "total"... method merge (line 107) | def merge( function merge_adapters (line 136) | def merge_adapters( function _validate_lora_configs (line 193) | def _validate_lora_configs(lora_configs: List["LoraConfig"]): function _merge_lora_configs (line 207) | def _merge_lora_configs(lora_configs: List["LoraConfig"]) -> "LoraConfig": FILE: backends/gaudi/server/text_generation_server/utils/merges/utils.py function magnitude_based_pruning (line 23) | def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> tor... function random_pruning (line 39) | def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) ... function prune (line 56) | def prune( function calculate_majority_sign_mask (line 83) | def calculate_majority_sign_mask( function disjoint_merge (line 105) | def disjoint_merge(task_tensors, majority_sign_mask): FILE: backends/gaudi/server/text_generation_server/utils/peft.py function download_and_unload_peft (line 10) | def download_and_unload_peft(model_id, revision, trust_remote_code): function download_peft (line 48) | def download_peft( FILE: backends/gaudi/server/text_generation_server/utils/prefill_chunking.py function set_support_chunking (line 7) | def set_support_chunking(support_chunking: bool): function get_support_chunking (line 12) | def get_support_chunking() -> bool: function set_max_prefill_tokens (line 17) | def set_max_prefill_tokens(max_prefill_tokens: int): function get_max_prefill_tokens (line 22) | def get_max_prefill_tokens() -> int: FILE: backends/gaudi/server/text_generation_server/utils/quantization.py class _QuantizerConfig (line 14) | class _QuantizerConfig: class _FP8QuantizerConfig (line 26) | class _FP8QuantizerConfig: function _get_config_json (line 30) | def _get_config_json(model_id: str, revision: Optional[str], filename: s... function _get_quantizer_config (line 45) | def _get_quantizer_config(model_id, revision): function get_loader (line 122) | def get_loader( FILE: backends/gaudi/server/text_generation_server/utils/segments.py function find_segments (line 10) | def find_segments( class SegmentConcatBuilder (line 35) | class SegmentConcatBuilder: method __init__ (line 36) | def __init__(self): method concat (line 40) | def concat(self, adapter_segments: torch.Tensor, segment_indices: List... method build (line 65) | def build(self) -> Tuple[torch.Tensor, List[int]]: FILE: backends/gaudi/server/text_generation_server/utils/sgmv.py function has_sgmv (line 30) | def has_sgmv() -> bool: function pad_rank (line 34) | def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor: function use_cutlass_shrink (line 64) | def use_cutlass_shrink(lora_rank: int) -> bool: function orient_for_rank (line 68) | def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor: function add_lora_sgmv_cutlass (line 75) | def add_lora_sgmv_cutlass( function _add_lora_sgmv_cutlass_legacy (line 115) | def _add_lora_sgmv_cutlass_legacy( function get_tmp_tensor (line 133) | def get_tmp_tensor(device: torch.device) -> torch.Tensor: function get_tmp_tensor_for_size (line 138) | def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Te... function get_tmp_tensor_for_size_no_kernels (line 143) | def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) ... function get_tmp_expand_size (line 147) | def get_tmp_expand_size(size: int) -> int: function get_tmp_tensors (line 151) | def get_tmp_tensors( function lora_a_sgmv_cutlass (line 167) | def lora_a_sgmv_cutlass( function lora_b_sgmv_cutlass (line 184) | def lora_b_sgmv_cutlass( function add_lora_a_bgmv (line 217) | def add_lora_a_bgmv( function add_lora_b_bgmv (line 227) | def add_lora_b_bgmv( function segmented_matmul (line 237) | def segmented_matmul( FILE: backends/gaudi/server/text_generation_server/utils/speculate.py function get_speculate (line 4) | def get_speculate() -> int: function set_speculate (line 9) | def set_speculate(speculate: int): FILE: backends/gaudi/server/text_generation_server/utils/tokens.py class NextTokenChooser (line 27) | class NextTokenChooser: method __init__ (line 28) | def __init__( method __call__ (line 84) | def __call__(self, input_ids, scores): method advance_grammar (line 103) | def advance_grammar(self, next_id: int): method from_pb (line 111) | def from_pb( class StopSequenceCriteria (line 134) | class StopSequenceCriteria: method __init__ (line 135) | def __init__(self, stop_sequence: str): method __call__ (line 139) | def __call__(self, output: str) -> bool: class StoppingCriteria (line 145) | class StoppingCriteria: method __init__ (line 146) | def __init__( method __call__ (line 174) | def __call__(self, last_token: int, last_output: str) -> Tuple[bool, O... method from_pb (line 198) | def from_pb( function create_n_gram_speculation (line 216) | def create_n_gram_speculation( class HeterogeneousNextTokenChooser (line 240) | class HeterogeneousNextTokenChooser: method __init__ (line 241) | def __init__( method __call__ (line 335) | def __call__( method advance_grammar (line 424) | def advance_grammar(self, next_ids: List[int]): method advance_grammar_single (line 432) | def advance_grammar_single(self, grammar_state_index: int, next_id: int): method advance_grammar_single_with_past_state (line 443) | def advance_grammar_single_with_past_state( method filter (line 457) | def filter(self, indices): method from_pb (line 500) | def from_pb( function pad_next_token_chooser_parameters (line 531) | def pad_next_token_chooser_parameters( class Sampling (line 553) | class Sampling: method __init__ (line 554) | def __init__(self, seed: int, device: str = "cpu"): method __call__ (line 564) | def __call__(self, logits): class Greedy (line 572) | class Greedy: method __call__ (line 573) | def __call__(self, logits): class HeterogeneousSampling (line 577) | class HeterogeneousSampling: method __init__ (line 582) | def __init__(self, do_sample: List[bool], seeds: List[int], device: to... method __call__ (line 595) | def __call__(self, logits): method filter (line 605) | def filter(self, indices): function batch_top_tokens (line 619) | def batch_top_tokens( function make_tokenizer_optional (line 700) | def make_tokenizer_optional(tokenizer): function is_tokenizer_transparent (line 766) | def is_tokenizer_transparent(tokenizer): FILE: backends/gaudi/server/text_generation_server/utils/version.py function get_driver_version (line 6) | def get_driver_version(): function is_driver_compatible (line 32) | def is_driver_compatible(): FILE: backends/gaudi/server/text_generation_server/utils/watermark.py class WatermarkLogitsProcessor (line 26) | class WatermarkLogitsProcessor(LogitsProcessor): method __init__ (line 27) | def __init__( method _seed_rng (line 40) | def _seed_rng(self, input_ids: Union[List[int], torch.LongTensor]): method _get_greenlist_ids (line 55) | def _get_greenlist_ids( method _calc_greenlist_mask (line 70) | def _calc_greenlist_mask( method _bias_greenlist_logits (line 79) | def _bias_greenlist_logits( method __call__ (line 85) | def __call__( FILE: backends/gaudi/server/text_generation_server/utils/weights.py class WeightsLoader (line 11) | class WeightsLoader(ABC): method get_weights (line 23) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 30) | def get_weights_col_packed( method get_weights_col (line 50) | def get_weights_col(self, weights: "Weights", prefix: str): method get_multi_weights_col (line 58) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_multi_weights (line 66) | def get_multi_weights(self, weights: "Weights", prefixes: List[str], d... method get_weights_row (line 74) | def get_weights_row(self, weights: "Weights", prefix: str): class Weight (line 82) | class Weight(ABC): method get_linear (line 87) | def get_linear(self, bias: torch.Tensor): class UnquantizedWeight (line 93) | class UnquantizedWeight(Weight): method get_linear (line 96) | def get_linear(self, bias: torch.Tensor): class DefaultWeightsLoader (line 102) | class DefaultWeightsLoader(WeightsLoader): method __init__ (line 105) | def __init__(self, weight_class: Type[UnquantizedWeight]): method get_weights (line 117) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 120) | def get_weights_col_packed( method get_multi_weights_col (line 132) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_weights_row (line 136) | def get_weights_row(self, weights: "Weights", prefix: str): method get_multi_weights (line 141) | def get_multi_weights(self, weights: "Weights", prefixes: List[str], d... class Weights (line 146) | class Weights: method __init__ (line 147) | def __init__( method _get_handle (line 177) | def _get_handle(self, filename): method get_filename (line 184) | def get_filename(self, tensor_name: str) -> (str, str): method _get_slice (line 201) | def _get_slice(self, tensor_name: str): method has_tensor (line 207) | def has_tensor(self, tensor_name: str): method get_shape (line 214) | def get_shape(self, tensor_name: str): method get_tensor (line 217) | def get_tensor( method get_partial_sharded (line 242) | def get_partial_sharded( method get_sharded (line 275) | def get_sharded(self, tensor_name: str, dim: int, to_device=True, to_d... method get_packed_sharded (line 288) | def get_packed_sharded( method get_weights (line 357) | def get_weights(self, prefix: str): method get_weights_col_packed_qkv (line 360) | def get_weights_col_packed_qkv( method get_weights_col_packed_gate_up (line 370) | def get_weights_col_packed_gate_up(self, prefix: str): method get_weights_col_packed (line 373) | def get_weights_col_packed(self, prefix: str, block_sizes: Union[int, ... method get_weights_col (line 383) | def get_weights_col(self, prefix: str): method get_multi_weights_col (line 386) | def get_multi_weights_col(self, prefixes: List[str], dim: int): method get_tensor_shard (line 389) | def get_tensor_shard(self, var, dim): method get_weights_row (line 405) | def get_weights_row(self, prefix: str): method get_multi_weights (line 408) | def get_multi_weights(self, prefixes: List[str], dim: int): method use_loader (line 412) | def use_loader(self, weights_loader: WeightsLoader): method loader (line 426) | def loader(self): function _blocks_to_block_sizes (line 430) | def _blocks_to_block_sizes(total_size: int, blocks: Union[int, List[int]... FILE: backends/grpc-metadata/src/lib.rs type MetadataInjector (line 9) | struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap); method set (line 13) | fn set(&mut self, key: &str, value: String) { function inject (line 23) | fn inject(metadata: &mut tonic::metadata::MetadataMap) { type InjectTelemetryContext (line 32) | pub trait InjectTelemetryContext { method inject_context (line 33) | fn inject_context(self) -> Self; method inject_context (line 37) | fn inject_context(mut self) -> Self { FILE: backends/llamacpp/build.rs type PrefixStripper (line 6) | struct PrefixStripper; method generated_name_override (line 9) | fn generated_name_override(&self, item_info: ItemInfo<'_>) -> Option Result { type LlamacppNuma (line 43) | pub enum LlamacppNuma { type LlamacppGGMLType (line 53) | pub enum LlamacppGGMLType { method to_ggml_type (line 89) | fn to_ggml_type(self) -> llamacpp::ggml_type { type LlamacppConfig (line 126) | pub struct LlamacppConfig { type LlamacppRequest (line 147) | struct LlamacppRequest { method new (line 170) | fn new( type LlamacppBackend (line 164) | pub struct LlamacppBackend { method new (line 427) | pub fn new( type Llamacpp (line 193) | struct Llamacpp { method new (line 219) | fn new(conf: LlamacppConfig) -> Result { method decode (line 284) | fn decode(&mut self) -> i32 { method clear_kv_cache (line 288) | fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) { method batch_push (line 294) | fn batch_push( function llamacpp_log_callback (line 201) | extern "C" fn llamacpp_log_callback( method drop (line 315) | fn drop(&mut self) { type LlamacppSampler (line 326) | struct LlamacppSampler { method new (line 331) | fn new(req: &LlamacppRequest) -> Option { method sample (line 381) | fn sample(&self, llamacpp: &mut Llamacpp, idx: usize) -> (llamacpp::ll... method drop (line 406) | fn drop(&mut self) { type LlamacppSeq (line 413) | struct LlamacppSeq { method schedule (line 644) | fn schedule( method health (line 659) | async fn health(&self, _: bool) -> bool { method name (line 663) | fn name(&self) -> &'static str { type BackendError (line 669) | pub enum BackendError { FILE: backends/llamacpp/src/main.rs type Args (line 25) | struct Args { function main (line 167) | async fn main() -> Result<(), RouterError> { type RouterError (line 335) | enum RouterError { FILE: backends/llamacpp/src/quantize.rs type QuantizeType (line 7) | pub enum QuantizeType { function model (line 11) | pub fn model( FILE: backends/neuron/server/text_generation_server/cli.py function serve (line 12) | def serve( function download_weights (line 75) | def download_weights( FILE: backends/neuron/server/text_generation_server/generator.py class Generator (line 35) | class Generator(ABC): method info (line 43) | def info(self) -> InfoResponse: method warmup (line 47) | def warmup(self, batch: Batch) -> int: method prefill (line 59) | def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]: method decode (line 74) | def decode(self, batches: List[Batch]) -> Tuple[List[Generation], Cach... method filter (line 78) | def filter(self, batch_id: int, request_ids: List[int]) -> CachedBatch: method clear (line 82) | def clear(self): method from_pretrained (line 87) | def from_pretrained(cls, model_id: str, revision: Optional[str]): class Slot (line 92) | class Slot: class State (line 95) | class State(Enum): method __init__ (line 100) | def __init__(self, id: int, tokenizer: PreTrainedTokenizerBase): method clear (line 105) | def clear(self): method id (line 123) | def id(self) -> int: method state (line 127) | def state(self) -> "Slot.State": method batch_id (line 131) | def batch_id(self) -> int: method request_id (line 135) | def request_id(self) -> int: method cached_text (line 139) | def cached_text(self) -> str: method generation_config (line 143) | def generation_config(self) -> GenerationConfig: method generated_tokens (line 147) | def generated_tokens(self) -> int: method assign (line 150) | def assign( method reset (line 198) | def reset( method pause (line 221) | def pause(self): method resume (line 228) | def resume(self): method _decode_next_tokens (line 232) | def _decode_next_tokens( method append (line 259) | def append(self, next_token: int) -> str: method select (line 284) | def select( method stopped (line 301) | def stopped(self) -> bool: method generated_text (line 307) | def generated_text(self) -> str: method next_token (line 311) | def next_token(self) -> int: method attention_mask (line 315) | def attention_mask(self) -> torch.LongTensor: method max_token (line 319) | def max_token(self) -> int: method max_new_tokens (line 323) | def max_new_tokens(self) -> int: method truncate (line 329) | def truncate(self) -> int: class NeuronGenerator (line 333) | class NeuronGenerator(Generator): method __init__ (line 336) | def __init__( method on_device_sampling (line 363) | def on_device_sampling(self) -> bool: method info (line 367) | def info(self) -> InfoResponse: method warmup (line 376) | def warmup(self, batch: Batch) -> int: method max_prefill_length (line 399) | def max_prefill_length(self) -> int: method prefill (line 404) | def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]: method decode (line 517) | def decode( method _generate_token (line 585) | def _generate_token( method _cached_batch (line 652) | def _cached_batch(self, batch_id: int, request_ids: List): method filter (line 659) | def filter(self, batch_id: int, keep_request_ids: List[int]) -> Cached... method clear (line 677) | def clear(self, batch_id: Optional[int] = None): method _clear (line 684) | def _clear(self, keep_slot_ids: List): method from_pretrained (line 691) | def from_pretrained(cls, model_id: str, revision: str = None): FILE: backends/neuron/server/text_generation_server/interceptor.py class ExceptionInterceptor (line 10) | class ExceptionInterceptor(AsyncServerInterceptor): method intercept (line 11) | async def intercept( FILE: backends/neuron/server/text_generation_server/model.py function get_export_kwargs_from_env (line 17) | def get_export_kwargs_from_env(): function is_cached (line 36) | def is_cached(model_id): function log_cache_size (line 50) | def log_cache_size(): function fetch_model (line 62) | def fetch_model( FILE: backends/neuron/server/text_generation_server/server.py class TextGenerationService (line 14) | class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServi... method __init__ (line 15) | def __init__(self, generator: Generator, server_urls: List[str]): method Info (line 19) | async def Info(self, request, context): method Health (line 22) | async def Health(self, request, context): method ServiceDiscovery (line 25) | async def ServiceDiscovery(self, request, context): method ClearCache (line 28) | async def ClearCache(self, request, context): method FilterBatch (line 35) | async def FilterBatch(self, request, context): method Warmup (line 39) | async def Warmup(self, request, context): method Prefill (line 43) | async def Prefill(self, request, context): method Decode (line 47) | async def Decode(self, request, context): function serve (line 52) | def serve( FILE: backends/neuron/server/text_generation_server/tgi_env.py function parse_cmdline_and_set_env (line 34) | def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namesp... function neuron_config_to_env (line 88) | def neuron_config_to_env(neuron_config): function sort_neuron_configs (line 114) | def sort_neuron_configs(dictionary): function lookup_compatible_cached_model (line 118) | def lookup_compatible_cached_model( function check_env_and_neuron_config_compatibility (line 158) | def check_env_and_neuron_config_compatibility( function get_env_dict (line 245) | def get_env_dict() -> Dict[str, str]: function get_neuron_config_for_model (line 252) | def get_neuron_config_for_model( FILE: backends/neuron/tests/fixtures/model.py function export_model (line 58) | def export_model(model_id, export_kwargs, neuron_model_path): function neuron_model_config (line 80) | def neuron_model_config(request): function neuron_model_path (line 117) | def neuron_model_path(neuron_model_config): FILE: backends/neuron/tests/prune_test_models.py function main (line 5) | def main(): FILE: backends/neuron/tests/server/helpers.py function create_request (line 10) | def create_request( function check_prefill (line 40) | def check_prefill( function check_decode_single (line 80) | def check_decode_single( function check_decode_multiple (line 106) | def check_decode_multiple(model_path): FILE: backends/neuron/tests/server/test_cached_model.py function cached_model_id (line 9) | def cached_model_id(neuron_model_config) -> str: function test_model_is_cached (line 26) | def test_model_is_cached(cached_model_id): function test_fetch_cached_model (line 30) | def test_fetch_cached_model(cached_model_id: str): function test_generator_from_cached_model (line 38) | def test_generator_from_cached_model(cached_model_id: str): FILE: backends/neuron/tests/server/test_continuous_batching.py function test_continuous_batching_two_requests (line 6) | def test_continuous_batching_two_requests(neuron_model_config): FILE: backends/neuron/tests/server/test_decode.py function test_decode (line 6) | def test_decode(neuron_model_config): function _test_decode (line 25) | def _test_decode(config_name, generator, do_sample): FILE: backends/neuron/tests/server/test_generator_slot.py function tokenizer (line 12) | def tokenizer(request): function test_decode_streaming (line 33) | def test_decode_streaming(tokenizer, input_text, generated_text): FILE: backends/neuron/tests/server/test_info.py function test_info (line 4) | def test_info(neuron_model_path): FILE: backends/neuron/tests/server/test_prefill.py function test_prefill (line 6) | def test_prefill(neuron_model_config): function _test_prefill (line 21) | def _test_prefill(config_name, generator, batch_size, do_sample): function test_prefill_truncate (line 60) | def test_prefill_truncate(neuron_model_config): FILE: backends/neuron/tests/test_entry_point.py function test_get_neuron_config_for_model (line 15) | def test_get_neuron_config_for_model(neuron_model_config): function test_lookup_compatible_cached_model (line 38) | def test_lookup_compatible_cached_model(model_id: str): function test_neuron_config_to_env (line 43) | def test_neuron_config_to_env(neuron_model_config) -> None: FILE: backends/neuron/tgi_entry_point.py function main (line 22) | def main(): FILE: backends/trtllm/build.rs constant ADDITIONAL_BACKEND_LINK_LIBRARIES (line 8) | const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"]; constant CUDA_ARCH_LIST (line 9) | const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); constant CUDA_REQUIRED_VERSION (line 10) | const CUDA_REQUIRED_VERSION: &str = "12.8"; constant MPI_REQUIRED_VERSION (line 11) | const MPI_REQUIRED_VERSION: &str = "4.1"; constant INSTALL_PREFIX (line 12) | const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX"); constant TENSORRT_ROOT_DIR (line 13) | const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); constant NCCL_ROOT_DIR (line 14) | const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR"); constant IS_GHA_BUILD (line 16) | const IS_GHA_BUILD: LazyLock = LazyLock::new(|| { constant BACKEND_DEPS (line 26) | const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl"; constant CUDA_TRANSITIVE_DEPS (line 27) | const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nv... constant TENSORRT_LLM_TRANSITIVE_DEPS (line 28) | const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [ function get_compiler_flag (line 45) | fn get_compiler_flag( function get_library_architecture (line 56) | fn get_library_architecture() -> &'static str { function build_backend (line 87) | fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> ... function build_ffi_layer (line 178) | fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { function main (line 206) | fn main() { FILE: backends/trtllm/csrc/backend.cpp type huggingface::tgi::backends::trtllm (line 8) | namespace huggingface::tgi::backends::trtllm { FILE: backends/trtllm/csrc/backend.hpp type huggingface::tgi::backends::trtllm (line 17) | namespace huggingface::tgi::backends::trtllm { type generation_params_t (line 26) | struct generation_params_t { type sampling_params_t (line 33) | struct sampling_params_t { type generation_config_t (line 65) | struct generation_config_t { method generation_config_t (line 70) | constexpr explicit generation_config_t(const json &config) : class backend_workspace_t (line 87) | class backend_workspace_t { method backend_workspace_t (line 100) | backend_workspace_t(std::filesystem::path &engines_folder, std::file... method backend_workspace_t (line 106) | backend_workspace_t(std::filesystem::path &&engines_folder, std::fil... method engines_folder (line 116) | [[nodiscard]] constexpr std::filesystem::path engines_folder() const... method generation_config_t (line 123) | [[nodiscard]] constexpr const generation_config_t &generation_config... type backend_error_t (line 143) | enum backend_error_t { class backend_t (line 155) | class backend_t { method backend_t (line 163) | backend_t(std::filesystem::path &&engines_folder, std::filesystem::p... type fmt::formatter (line 212) | struct fmt::formatter (line 220) | struct fmt::formatter> pull_tokens() noexce... method cancel (line 139) | void cancel(request_id_t request_id) noexcept { function finish_reason_t (line 35) | constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason r... class tensorrt_llm_backend_t (line 78) | class tensorrt_llm_backend_t { method tensorrt_llm_backend_t (line 83) | tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::f... method num_tokens_ready (line 86) | size_t num_tokens_ready() const noexcept { return inner_.num_tokens_... method request_id_t (line 88) | request_id_t submit( method pull_tokens (line 118) | std::unique_ptr> pull_tokens() noexce... method cancel (line 139) | void cancel(request_id_t request_id) noexcept { function initialize_logging (line 145) | void initialize_logging() { function initialize_tensorrt_llm_backend (line 163) | void initialize_tensorrt_llm_backend() { function create_backend_from_engine_folder (line 180) | std::unique_ptr type huggingface::tgi::backends::trtllm (line 32) | namespace huggingface::tgi::backends::trtllm { class tensorrt_llm_backend_t (line 26) | class tensorrt_llm_backend_t method tensorrt_llm_backend_t (line 83) | tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::f... method num_tokens_ready (line 86) | size_t num_tokens_ready() const noexcept { return inner_.num_tokens_... method request_id_t (line 88) | request_id_t submit( method pull_tokens (line 118) | std::unique_ptr> pull_tokens() noexce... method cancel (line 139) | void cancel(request_id_t request_id) noexcept { function finish_reason_t (line 35) | constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason r... class tensorrt_llm_backend_t (line 78) | class tensorrt_llm_backend_t { method tensorrt_llm_backend_t (line 83) | tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::f... method num_tokens_ready (line 86) | size_t num_tokens_ready() const noexcept { return inner_.num_tokens_... method request_id_t (line 88) | request_id_t submit( method pull_tokens (line 118) | std::unique_ptr> pull_tokens() noexce... method cancel (line 139) | void cancel(request_id_t request_id) noexcept { function initialize_logging (line 145) | void initialize_logging() { function initialize_tensorrt_llm_backend (line 163) | void initialize_tensorrt_llm_backend() { function create_backend_from_engine_folder (line 180) | std::unique_ptr FILE: backends/trtllm/csrc/hardware.hpp type huggingface::tgi::hardware::cuda (line 8) | namespace huggingface::tgi::hardware::cuda { function get_device_count (line 19) | inline std::optional get_device_count() { type compute_capabilities_t (line 30) | struct compute_capabilities_t { method compute_capabilities_t (line 34) | compute_capabilities_t(): compute_capabilities_t(0) {} method compute_capabilities_t (line 35) | explicit compute_capabilities_t(size_t device_idx): major(-1), minor... method compute_capabilities_t (line 41) | compute_capabilities_t(int32_t major, int32_t minor): major(major), ... method is_at_least (line 48) | [[nodiscard]] constexpr auto is_at_least(std::tuple usize; function submit (line 71) | fn submit( function pull_tokens (line 83) | fn pull_tokens( function cancel (line 87) | fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64); method from (line 95) | fn from(reason: FinishReason) -> Self { FILE: backends/trtllm/src/looper.rs type InferResult (line 29) | type InferResult = Result; type GenerationContext (line 32) | struct GenerationContext { type DecodedToken (line 41) | struct DecodedToken { type Error (line 49) | type Error = InferError; method try_from (line 51) | fn try_from(step: &'step GenerationStep) -> Result { function executor_status_looper (line 65) | fn executor_status_looper( function post_process_decoded_token (line 170) | fn post_process_decoded_token( function ensure_paths_exist (line 218) | fn ensure_paths_exist, PP: AsRef>( type TensorRtLlmBackendV2 (line 259) | pub struct TensorRtLlmBackendV2(UnboundedSender); method new (line 262) | pub fn new + Send, PP: AsRef + Send>( method validate (line 286) | fn validate(request: &ValidGenerateRequest) -> InferResult<()> { method schedule (line 315) | fn schedule( method health (line 340) | async fn health(&self, _: bool) -> bool { method name (line 344) | fn name(&self) -> &'static str { FILE: backends/trtllm/src/main.rs type Args (line 19) | struct Args { function get_tokenizer (line 74) | async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> ... function main (line 219) | async fn main() -> Result<(), TensorRtLlmBackendError> { FILE: backends/trtllm/src/utils.rs function first_line (line 20) | pub(crate) fn first_line(s: &str, fail: &str) -> String { FILE: backends/v2/build.rs function main (line 3) | fn main() -> Result<(), Box> { FILE: backends/v2/src/backend.rs type BackendV2 (line 16) | pub struct BackendV2 { method new (line 27) | pub(crate) fn new( method schedule (line 73) | fn schedule( method health (line 98) | async fn health(&self, current_health: bool) -> bool { method start_health (line 108) | fn start_health(&self) -> bool { method name (line 112) | fn name(&self) -> &'static str { function batching_task (line 122) | pub(crate) async fn batching_task( function prefill (line 240) | async fn prefill( function decode (line 280) | async fn decode( function filter_batch (line 327) | async fn filter_batch( function filter_send_generations (line 361) | fn filter_send_generations(generations: Vec, entries: &mut I... function send_responses (line 386) | fn send_responses( function send_errors (line 478) | fn send_errors(error: ClientError, entries: &mut IntMap) { method from (line 495) | fn from(value: crate::client::GeneratedText) -> Self { FILE: backends/v2/src/client/grpc_client.rs type Client (line 14) | pub struct Client { method connect (line 21) | pub async fn connect(uri: Uri) -> Result { method connect_uds (line 30) | pub async fn connect_uds(path: String) -> Result { method service_discovery (line 45) | pub async fn service_discovery(&mut self) -> Result> { method info (line 65) | pub async fn info(&mut self) -> Result { method health (line 73) | pub async fn health(&mut self) -> Result { method clear_cache (line 81) | pub async fn clear_cache(&mut self, batch_id: Option) -> Result<(... method filter_batch (line 89) | pub async fn filter_batch( method warmup (line 107) | pub async fn warmup( method prefill (line 188) | pub async fn prefill( method decode (line 206) | pub async fn decode( type PrefillTimings (line 225) | pub struct PrefillTimings { method new (line 232) | fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self { type DecodeTimings (line 241) | pub struct DecodeTimings { method new (line 249) | fn new(concat_ns: Option, forward_ns: u64, decode_ns: u64, total_... FILE: backends/v2/src/client/mod.rs type Health (line 22) | pub trait Health { method device_health (line 24) | async fn device_health(&self) -> Result<()>; method model_health (line 28) | async fn model_health(&self) -> Result<()>; type ShardInfo (line 32) | pub struct ShardInfo { type ClientError (line 41) | pub enum ClientError { method from (line 51) | fn from(err: Status) -> Self { method from (line 59) | fn from(err: transport::Error) -> Self { type Result (line 68) | pub type Result = std::result::Result; FILE: backends/v2/src/client/sharded_client.rs type ShardedClient (line 18) | pub struct ShardedClient { method new (line 23) | fn new(clients: Vec) -> Self { method from_master_client (line 29) | async fn from_master_client(mut master_client: Client) -> Result { method connect (line 39) | pub async fn connect(uri: Uri) -> Result { method connect_uds (line 45) | pub async fn connect_uds(path: String) -> Result { method info (line 52) | pub async fn info(&mut self) -> Result { method health (line 63) | pub async fn health(&mut self) -> Result { method clear_cache (line 74) | pub async fn clear_cache(&mut self, batch_id: Option) -> Result<(... method filter_batch (line 85) | pub async fn filter_batch( method warmup (line 103) | pub async fn warmup( method prefill (line 135) | pub async fn prefill( method decode (line 168) | pub async fn decode( method from (line 198) | fn from(value: InfoResponse) -> Self { method device_health (line 211) | async fn device_health(&self) -> Result<()> { method model_health (line 216) | async fn model_health(&self) -> Result<()> { FILE: backends/v2/src/lib.rs type BackendInfo (line 12) | pub struct BackendInfo { function connect_backend (line 33) | pub async fn connect_backend( type V2Error (line 130) | pub enum V2Error { FILE: backends/v2/src/main.rs type Args (line 9) | struct Args { type Commands (line 82) | enum Commands { function main (line 87) | async fn main() -> Result<(), RouterError> { type RouterError (line 215) | enum RouterError { FILE: backends/v2/src/queue.rs type Entry (line 18) | pub(crate) struct Entry { type Queue (line 35) | pub(crate) struct Queue { method new (line 41) | pub(crate) fn new( method append (line 63) | pub(crate) fn append(&self, entry: Entry) { method next_batch (line 73) | pub(crate) async fn next_batch( function queue_task (line 101) | async fn queue_task( type State (line 135) | struct State { method new (line 159) | fn new( method append (line 177) | fn append(&mut self, mut entry: Entry) { method next_batch (line 188) | fn next_batch( type NextBatch (line 349) | type NextBatch = (IntMap, Batch, Span); type QueueCommand (line 352) | enum QueueCommand { method from (line 365) | fn from(value: ValidParameters) -> Self { method from (line 392) | fn from(value: ValidStoppingParameters) -> Self { function default_entry (line 407) | fn default_entry() -> ( function test_append (line 452) | fn test_append() { function test_next_batch_empty (line 468) | fn test_next_batch_empty() { function test_next_batch_min_size (line 476) | fn test_next_batch_min_size() { function test_next_batch_max_size (line 508) | fn test_next_batch_max_size() { function test_next_batch_token_budget (line 528) | fn test_next_batch_token_budget() { function test_queue_append (line 561) | async fn test_queue_append() { function test_queue_next_batch_empty (line 568) | async fn test_queue_next_batch_empty() { function test_queue_next_batch_min_size (line 576) | async fn test_queue_next_batch_min_size() { function test_queue_next_batch_max_size (line 609) | async fn test_queue_next_batch_max_size() { function test_queue_next_batch_token_budget (line 625) | async fn test_queue_next_batch_token_budget() { function test_queue_next_batch_token_speculate (line 650) | async fn test_queue_next_batch_token_speculate() { function test_queue_next_batch_dropped_receiver (line 669) | async fn test_queue_next_batch_dropped_receiver() { FILE: backends/v3/benches/prefix_cache.rs function prefix_cache_benchmark (line 9) | fn prefix_cache_benchmark(c: &mut Criterion) { FILE: backends/v3/build.rs function main (line 3) | fn main() -> Result<(), Box> { FILE: backends/v3/src/backend.rs type BackendV3 (line 18) | pub struct BackendV3 { method new (line 29) | pub(crate) fn new( method schedule (line 79) | fn schedule( method health (line 105) | async fn health(&self, current_health: bool) -> bool { method start_health (line 115) | fn start_health(&self) -> bool { method name (line 119) | fn name(&self) -> &'static str { function batching_task (line 129) | pub(crate) async fn batching_task( function prefill (line 297) | async fn prefill( function decode (line 342) | async fn decode( function filter_batch (line 389) | async fn filter_batch( function filter_send_generations (line 423) | fn filter_send_generations(generations: Vec, entries: &mut I... function send_responses (line 448) | fn send_responses( function send_errors (line 540) | fn send_errors(error: ClientError, entries: &mut IntMap) { method from (line 557) | fn from(value: crate::client::GeneratedText) -> Self { FILE: backends/v3/src/block_allocator.rs type BlockAllocation (line 7) | pub struct BlockAllocation { method drop (line 20) | fn drop(&mut self) { type BlockAllocator (line 28) | pub struct BlockAllocator { method new (line 34) | pub(crate) fn new( method allocate (line 57) | pub(crate) async fn allocate( method free (line 77) | pub(crate) fn free(&self, blocks: Vec, allocation_id: u64) { function block_allocator_task (line 87) | async fn block_allocator_task( type BlockAllocatorCommand (line 119) | enum BlockAllocatorCommand { type Allocator (line 131) | pub trait Allocator { method allocate (line 132) | fn allocate( method free (line 138) | fn free(&mut self, blocks: Vec, allocation_id: u64); method allocate (line 160) | fn allocate( method free (line 218) | fn free(&mut self, blocks: Vec, _allocation_id: u64) { type SimpleAllocator (line 140) | pub struct SimpleAllocator { method new (line 148) | fn new(blocks: u32, block_size: u32, window_size: Option) -> Self { FILE: backends/v3/src/client/grpc_client.rs type Client (line 16) | pub struct Client { method connect (line 23) | pub async fn connect(uri: Uri) -> Result { method connect_uds (line 32) | pub async fn connect_uds(path: String) -> Result { method service_discovery (line 47) | pub async fn service_discovery(&mut self) -> Result> { method info (line 67) | pub async fn info(&mut self) -> Result { method health (line 75) | pub async fn health(&mut self) -> Result { method clear_cache (line 83) | pub async fn clear_cache(&mut self, batch_id: Option) -> Result<(... method filter_batch (line 91) | pub async fn filter_batch( method warmup (line 109) | pub async fn warmup( method prefill (line 230) | pub async fn prefill( method decode (line 258) | pub async fn decode( type PrefillTimings (line 277) | pub struct PrefillTimings { method new (line 285) | fn new(concat_ns: Option, forward_ns: u64, decode_ns: u64, total_... type DecodeTimings (line 295) | pub struct DecodeTimings { method new (line 303) | fn new(concat_ns: Option, forward_ns: u64, decode_ns: u64, total_... FILE: backends/v3/src/client/mod.rs type Health (line 23) | pub trait Health { method device_health (line 25) | async fn device_health(&self) -> Result<()>; method model_health (line 29) | async fn model_health(&self) -> Result<()>; type ClientError (line 33) | pub enum ClientError { method from (line 43) | fn from(err: Status) -> Self { method from (line 51) | fn from(err: transport::Error) -> Self { method from (line 60) | fn from(chunk: Chunk) -> Self { type Result (line 67) | pub type Result = std::result::Result; FILE: backends/v3/src/client/sharded_client.rs type ShardedClient (line 18) | pub struct ShardedClient { method new (line 23) | fn new(clients: Vec) -> Self { method from_master_client (line 29) | async fn from_master_client(mut master_client: Client) -> Result { method connect (line 39) | pub async fn connect(uri: Uri) -> Result { method connect_uds (line 45) | pub async fn connect_uds(path: String) -> Result { method info (line 52) | pub async fn info(&mut self) -> Result { method health (line 63) | pub async fn health(&mut self) -> Result { method clear_cache (line 74) | pub async fn clear_cache(&mut self, batch_id: Option) -> Result<(... method filter_batch (line 85) | pub async fn filter_batch( method warmup (line 103) | pub async fn warmup( method prefill (line 142) | pub async fn prefill( method decode (line 176) | pub async fn decode( method device_health (line 207) | async fn device_health(&self) -> Result<()> { method model_health (line 212) | async fn model_health(&self) -> Result<()> { FILE: backends/v3/src/lib.rs type BackendInfo (line 14) | pub struct BackendInfo { function connect_backend (line 48) | pub async fn connect_backend( type V3Error (line 172) | pub enum V3Error { FILE: backends/v3/src/main.rs type Args (line 9) | struct Args { type Commands (line 82) | enum Commands { function main (line 87) | async fn main() -> Result<(), RouterError> { type RouterError (line 231) | enum RouterError { FILE: backends/v3/src/queue.rs type Entry (line 22) | pub(crate) struct Entry { type Queue (line 41) | pub(crate) struct Queue { method new (line 47) | pub(crate) fn new( method append (line 76) | pub(crate) fn append(&self, entry: Entry) { method next_batch (line 86) | pub(crate) async fn next_batch( function queue_task (line 119) | async fn queue_task( type State (line 166) | struct State { method new (line 195) | fn new( method append (line 226) | fn append(&mut self, mut entry: Entry) { method next_batch (line 237) | async fn next_batch( type NextBatch (line 507) | type NextBatch = (IntMap, Batch, Span); type QueueCommand (line 510) | enum QueueCommand { method from (line 523) | fn from(value: ValidParameters) -> Self { method from (line 550) | fn from(value: ValidStoppingParameters) -> Self { function default_entry (line 566) | fn default_entry() -> ( function test_append (line 612) | async fn test_append() { function test_next_batch_empty (line 628) | async fn test_next_batch_empty() { function test_next_batch_min_size (line 636) | async fn test_next_batch_min_size() { function test_next_batch_max_size (line 668) | async fn test_next_batch_max_size() { function test_next_batch_token_budget (line 688) | async fn test_next_batch_token_budget() { function test_queue_append (line 721) | async fn test_queue_append() { function test_queue_next_batch_empty (line 728) | async fn test_queue_next_batch_empty() { function test_queue_next_batch_min_size (line 736) | async fn test_queue_next_batch_min_size() { function test_queue_next_batch_max_size (line 769) | async fn test_queue_next_batch_max_size() { function test_queue_next_batch_token_budget (line 785) | async fn test_queue_next_batch_token_budget() { function test_queue_next_batch_token_speculate (line 810) | async fn test_queue_next_batch_token_speculate() { function test_queue_next_batch_dropped_receiver (line 829) | async fn test_queue_next_batch_dropped_receiver() { FILE: backends/v3/src/radix.rs function hash (line 9) | fn hash(slice: &[u32]) -> u64 { type RadixAllocator (line 20) | pub struct RadixAllocator { method new (line 39) | pub fn new(block_size: u32, n_blocks: u32, window_size: Option) -... method alloc_or_reclaim (line 52) | fn alloc_or_reclaim(&mut self, n_blocks_needed: usize) -> Option, allocation_id: u64) { type RadixAllocation (line 211) | struct RadixAllocation { type TrieError (line 230) | pub enum TrieError { type NodeId (line 235) | pub type NodeId = DefaultKey; type RadixTrie (line 238) | pub struct RadixTrie { method new (line 258) | pub fn new(block_size: usize) -> Self { method find (line 280) | pub fn find(&mut self, key: &[u32], blocks: &mut Vec) -> NodeId { method find_ (line 286) | fn find_(&mut self, node_id: NodeId, key: &[u32], blocks: &mut Vec Result<(), TrieError> { method incref (line 342) | pub fn incref(&mut self, node_id: NodeId) -> Result<(), TrieError> { method evict (line 363) | pub fn evict(&mut self, n_blocks: usize) -> Vec { method insert (line 416) | pub fn insert(&mut self, tokens: &[u32], blocks: &[u32]) -> Result NodeId { method add_node (line 509) | fn add_node( method add_node_to_parent (line 529) | fn add_node_to_parent(&mut self, parent_id: NodeId, hash: u64, child_i... method remove_node (line 540) | fn remove_node(&mut self, node_id: NodeId) -> TrieNode { method update_access_time (line 558) | fn update_access_time(&mut self, node_id: NodeId) { method print_debug (line 575) | pub fn print_debug(&self) { method print_debug_ (line 579) | fn print_debug_(&self, node_id: NodeId, indent: usize) { method root_id (line 597) | pub(crate) fn root_id(&self) -> DefaultKey { type TrieNode (line 604) | struct TrieNode { method new (line 614) | fn new(key: Vec, blocks: Vec, last_accessed: u64, parent: Op... function shared_prefix (line 626) | fn shared_prefix(left: &[u32], right: &[u32], block_size: usize) -> usize { function allocator_block_size (line 647) | fn allocator_block_size() { function allocator_block_size_non_aligned (line 662) | fn allocator_block_size_non_aligned() { function allocator_reuses_prefixes (line 677) | fn allocator_reuses_prefixes() { function allocator_collects_older_prefixes_first (line 691) | fn allocator_collects_older_prefixes_first() { function allocator_frees_fully_overlapping_prefills (line 711) | fn allocator_frees_fully_overlapping_prefills() { function allocator_frees_partially_overlapping_prefills (line 727) | fn allocator_frees_partially_overlapping_prefills() { function trie_insertions_have_correct_prefix_len (line 769) | fn trie_insertions_have_correct_prefix_len() { function trie_insertions_block_size (line 792) | fn trie_insertions_block_size() { function trie_get_returns_correct_blocks (line 816) | fn trie_get_returns_correct_blocks() { function trie_evict_removes_correct_blocks (line 850) | fn trie_evict_removes_correct_blocks() { function full_match_returns_correct_node (line 888) | fn full_match_returns_correct_node() { function partial_match_does_not_recurse (line 899) | fn partial_match_does_not_recurse() { type AllocationWithInfo (line 910) | struct AllocationWithInfo { function invariants_hold_on_many_operations_remove_all (line 919) | fn invariants_hold_on_many_operations_remove_all() { function invariants_hold_on_many_operations_remove_subset (line 924) | fn invariants_hold_on_many_operations_remove_subset() { function invariants_hold_on_many_insertions (line 928) | fn invariants_hold_on_many_insertions(remove_all: bool) { function check_allocation_invariants (line 1014) | fn check_allocation_invariants(allocations: &[AllocationWithInfo]) { FILE: benchmark/src/app.rs type App (line 15) | pub(crate) struct App { method new (line 33) | pub(crate) fn new( method handle_key_event (line 69) | pub(crate) fn handle_key_event(&mut self, key_event: KeyEvent) { method tick (line 125) | pub(crate) fn tick(&mut self) { method render (line 155) | pub fn render(&mut self, f: &mut Frame) { type Data (line 367) | pub(crate) struct Data { method new (line 379) | fn new(n_run: usize, batch_size: Vec) -> Self { method push_prefill (line 406) | fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) { method push_decode (line 412) | fn push_decode(&mut self, decode: Decode, batch_idx: usize) { method end_batch (line 420) | fn end_batch(&mut self, batch_idx: usize) { function progress_gauge (line 437) | fn progress_gauge(title: &str, label: String, progress: f64, color: Colo... function throughput_paragraph (line 446) | fn throughput_paragraph<'a>(throughput: &[f64], name: &'static str) -> P... function latency_paragraph (line 459) | fn latency_paragraph<'a>(latency: &mut [f64], name: &'static str) -> Par... function statis_spans (line 485) | fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec> { function latency_histogram_data (line 516) | fn latency_histogram_data(latency: &[f64], bins: usize) -> Vec<(String, ... function latency_histogram (line 529) | fn latency_histogram<'a>( function latency_throughput_chart (line 544) | fn latency_throughput_chart<'a>( function color_vec (line 674) | fn color_vec() -> Vec { FILE: benchmark/src/event.rs type Event (line 8) | pub(crate) enum Event { function terminal_event_task (line 17) | pub(crate) async fn terminal_event_task( function event_loop (line 33) | async fn event_loop(fps: u32, event_sender: mpsc::Sender) { FILE: benchmark/src/generation.rs constant LOREM_IPSUM (line 10) | const LOREM_IPSUM: &str = "Lorem ipsum dolor sit amet, consectetur adipi... type Prefill (line 13) | pub(crate) struct Prefill { type Decode (line 19) | pub(crate) struct Decode { type Message (line 26) | pub(crate) enum Message { function generation_task (line 36) | pub(crate) async fn generation_task( function generate_runs (line 64) | async fn generate_runs( function prefill (line 132) | async fn prefill( function decode (line 197) | async fn decode(batch: CachedBatch, client: &mut ShardedClient) -> Resul... function create_sequence (line 227) | fn create_sequence(sequence_length: u32, tokenizer: Tokenizer) -> String { FILE: benchmark/src/lib.rs function run (line 19) | pub async fn run( FILE: benchmark/src/main.rs type Args (line 16) | struct Args { function main (line 108) | fn main() -> Result<(), Box> { function init_logging (line 211) | fn init_logging() { FILE: benchmark/src/table.rs function parameters_table (line 6) | pub(crate) fn parameters_table( function latency_table (line 46) | pub(crate) fn latency_table(data: &Data) -> Table { function throughput_table (line 84) | pub(crate) fn throughput_table(data: &Data) -> Table { function add_latencies (line 107) | fn add_latencies( function add_throuhgputs (line 132) | fn add_throuhgputs( function avg_min_max (line 154) | fn avg_min_max(data: &[f64]) -> (f64, f64, f64) { function px (line 167) | fn px(data: &[f64], p: u32) -> f64 { function format_value (line 172) | fn format_value(value: f64, unit: &'static str) -> String { FILE: benchmark/src/utils.rs function histogram (line 16) | pub(crate) fn histogram(values: &[f64], bins: usize) -> Vec<(f64, usize)> { function percentiles (line 35) | pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap Exception: FILE: clients/python/text_generation/inference_api.py function deployed_models (line 16) | def deployed_models(headers: Optional[Dict] = None) -> List[DeployedModel]: function check_model_support (line 37) | def check_model_support(repo_id: str, headers: Optional[Dict] = None) ->... class InferenceAPIClient (line 59) | class InferenceAPIClient(Client): method __init__ (line 83) | def __init__(self, repo_id: str, token: Optional[str] = None, timeout:... class InferenceAPIAsyncClient (line 115) | class InferenceAPIAsyncClient(AsyncClient): method __init__ (line 140) | def __init__(self, repo_id: str, token: Optional[str] = None, timeout:... FILE: clients/python/text_generation/types.py class GrammarType (line 9) | class GrammarType(str, Enum): class Grammar (line 15) | class Grammar(BaseModel): class ToolCall (line 22) | class ToolCall(BaseModel): class Chunk (line 31) | class Chunk(BaseModel): class Message (line 37) | class Message(BaseModel): class Tool (line 48) | class Tool(BaseModel): class Function (line 55) | class Function(BaseModel): class ChoiceDeltaToolCall (line 60) | class ChoiceDeltaToolCall(BaseModel): class ChoiceDelta (line 67) | class ChoiceDelta(BaseModel): class Choice (line 73) | class Choice(BaseModel): class CompletionRequest (line 80) | class CompletionRequest(BaseModel): class CompletionComplete (line 106) | class CompletionComplete(BaseModel): class Completion (line 117) | class Completion(BaseModel): class ChatRequest (line 127) | class ChatRequest(BaseModel): class ChatCompletionComplete (line 169) | class ChatCompletionComplete(BaseModel): class ChatComplete (line 182) | class ChatComplete(BaseModel): class ChatCompletionChunk (line 193) | class ChatCompletionChunk(BaseModel): class Parameters (line 203) | class Parameters(BaseModel): method valid_best_of (line 247) | def valid_best_of(cls, field_value, values): method valid_repetition_penalty (line 266) | def valid_repetition_penalty(cls, v): method valid_frequency_penalty (line 272) | def valid_frequency_penalty(cls, v): method valid_seed (line 278) | def valid_seed(cls, v): method valid_temp (line 284) | def valid_temp(cls, v): method valid_top_k (line 290) | def valid_top_k(cls, v): method valid_top_p (line 296) | def valid_top_p(cls, v): method valid_truncate (line 302) | def valid_truncate(cls, v): method valid_typical_p (line 308) | def valid_typical_p(cls, v): method valid_top_n_tokens (line 314) | def valid_top_n_tokens(cls, v): method valid_grammar (line 320) | def valid_grammar(cls, v): class Request (line 329) | class Request(BaseModel): method valid_input (line 338) | def valid_input(cls, v): method valid_best_of_stream (line 344) | def valid_best_of_stream(cls, field_value, values): class InputToken (line 359) | class InputToken(BaseModel): class Token (line 370) | class Token(BaseModel): class FinishReason (line 383) | class FinishReason(str, Enum): class BestOfSequence (line 393) | class BestOfSequence(BaseModel): class Details (line 411) | class Details(BaseModel): class Response (line 429) | class Response(BaseModel): class StreamDetails (line 437) | class StreamDetails(BaseModel): class StreamResponse (line 447) | class StreamResponse(BaseModel): class DeployedModel (line 461) | class DeployedModel(BaseModel): FILE: integration-tests/conftest.py class SessionTimeoutFix (line 19) | class SessionTimeoutFix(requests.Session): method request (line 20) | def request(self, *args, **kwargs): function pytest_addoption (line 68) | def pytest_addoption(parser): function pytest_configure (line 86) | def pytest_configure(config): function pytest_collection_modifyitems (line 91) | def pytest_collection_modifyitems(config, items): function container_log (line 139) | def container_log(request: SubRequest): class ResponseComparator (line 151) | class ResponseComparator(JSONSnapshotExtension): method _serialize (line 155) | def _serialize( method serialize (line 181) | def serialize( method matches (line 201) | def matches( class GenerousResponseComparator (line 385) | class GenerousResponseComparator(ResponseComparator): class IgnoreLogProbResponseComparator (line 390) | class IgnoreLogProbResponseComparator(ResponseComparator): class LauncherHandle (line 394) | class LauncherHandle: method __init__ (line 395) | def __init__(self, port: int, error_log): method _inner_health (line 400) | def _inner_health(self): method health (line 403) | async def health(self, timeout: int = 60): class ContainerLauncherHandle (line 421) | class ContainerLauncherHandle(LauncherHandle): method __init__ (line 422) | def __init__(self, docker_client, container_name, port: int, error_log): method _inner_health (line 427) | def _inner_health(self) -> bool: class ProcessLauncherHandle (line 432) | class ProcessLauncherHandle(LauncherHandle): method __init__ (line 433) | def __init__(self, process, port: int, error_log): method _inner_health (line 437) | def _inner_health(self) -> bool: function response_snapshot (line 442) | def response_snapshot(snapshot): function generous_response_snapshot (line 447) | def generous_response_snapshot(snapshot): function ignore_logprob_response_snapshot (line 452) | def ignore_logprob_response_snapshot(snapshot): function error_log (line 457) | def error_log(): function launcher (line 463) | async def launcher(error_log): function generate_load (line 734) | def generate_load(): function generate_multi (line 762) | def generate_multi(): function chicken (line 797) | def chicken(): function cow_beach (line 806) | def cow_beach(): FILE: integration-tests/fixtures/gaudi/service.py function stream_container_logs (line 58) | def stream_container_logs(container, test_name): class TestClient (line 72) | class TestClient(AsyncInferenceClient): method __init__ (line 73) | def __init__(self, service_name: str, base_url: str): class LauncherHandle (line 78) | class LauncherHandle: method __init__ (line 79) | def __init__(self, service_name: str, port: int): method _inner_health (line 82) | def _inner_health(self): method health (line 85) | async def health(self, timeout: int = 60): class ContainerLauncherHandle (line 118) | class ContainerLauncherHandle(LauncherHandle): method __init__ (line 119) | def __init__(self, docker_client, container_name, port: int): method _inner_health (line 125) | def _inner_health(self) -> bool: class ProcessLauncherHandle (line 140) | class ProcessLauncherHandle(LauncherHandle): method __init__ (line 141) | def __init__(self, process, port: int): method _inner_health (line 146) | def _inner_health(self) -> bool: function data_volume (line 151) | def data_volume(): function gaudi_launcher (line 162) | def gaudi_launcher(): function gaudi_generate_load (line 292) | def gaudi_generate_load(): FILE: integration-tests/fixtures/neuron/export_models.py function get_neuron_backend_hash (line 79) | def get_neuron_backend_hash(): function get_neuron_model_name (line 104) | def get_neuron_model_name(config_name: str): function get_tgi_docker_image (line 108) | def get_tgi_docker_image(): function maybe_export_model (line 121) | def maybe_export_model(config_name, model_config): function maybe_export_models (line 218) | def maybe_export_models(): function neuron_model_config (line 224) | def neuron_model_config(request): function neuron_model_path (line 269) | def neuron_model_path(neuron_model_config): FILE: integration-tests/fixtures/neuron/service.py function get_tgi_docker_image (line 24) | def get_tgi_docker_image(): class TestClient (line 45) | class TestClient(AsyncInferenceClient): method __init__ (line 46) | def __init__(self, service_name: str, base_url: str): class LauncherHandle (line 51) | class LauncherHandle: method __init__ (line 52) | def __init__(self, service_name: str, port: int): method _inner_health (line 55) | def _inner_health(self): method health (line 58) | async def health(self, timeout: int = 60): class ContainerLauncherHandle (line 75) | class ContainerLauncherHandle(LauncherHandle): method __init__ (line 76) | def __init__(self, service_name, docker_client, container_name, port: ... method _inner_health (line 82) | def _inner_health(self) -> bool: function event_loop (line 92) | def event_loop(): function neuron_launcher (line 99) | def neuron_launcher(event_loop): function neuron_generate_load (line 239) | def neuron_generate_load(): FILE: integration-tests/gaudi/capture_expected_outputs.py function test_config (line 17) | def test_config(request) -> Dict[str, Any]: function test_name (line 25) | def test_name(test_config): function tgi_service (line 30) | def tgi_service(launcher, test_config, test_name) -> Generator: function test_capture_expected_outputs (line 37) | async def test_capture_expected_outputs(tgi_service, test_config, test_n... FILE: integration-tests/gaudi/test_gaudi_generate.py function pytest_configure (line 7) | def pytest_configure(config): function pytest_generate_tests (line 193) | def pytest_generate_tests(metafunc): function test_config (line 208) | def test_config(request: SubRequest) -> Dict[str, Any]: function model_id (line 217) | def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]: function test_name (line 222) | def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]: function expected_outputs (line 227) | def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, str]: function input (line 235) | def input(test_config: Dict[str, Any]) -> str: function tgi_service (line 240) | def tgi_service( function tgi_client (line 253) | async def tgi_client(tgi_service) -> AsyncInferenceClient: function test_model_single_request (line 260) | async def test_model_single_request( function test_model_multiple_requests (line 276) | async def test_model_multiple_requests( FILE: integration-tests/models/test_bloom_560m.py function bloom_560_handle (line 5) | def bloom_560_handle(launcher): function bloom_560 (line 11) | async def bloom_560(bloom_560_handle): function test_bloom_560m (line 18) | async def test_bloom_560m(bloom_560, response_snapshot): function test_bloom_560m_all_params (line 33) | async def test_bloom_560m_all_params(bloom_560, response_snapshot): function test_bloom_560m_load (line 56) | async def test_bloom_560m_load(bloom_560, generate_load, response_snapsh... FILE: integration-tests/models/test_bloom_560m_sharded.py function bloom_560m_sharded_handle (line 5) | def bloom_560m_sharded_handle(launcher): function bloom_560m_sharded (line 11) | async def bloom_560m_sharded(bloom_560m_sharded_handle): function test_bloom_560m_sharded (line 18) | async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot): function test_bloom_560m_sharded_load (line 33) | async def test_bloom_560m_sharded_load( FILE: integration-tests/models/test_chat_llama.py function flash_llama_chat_handle (line 5) | def flash_llama_chat_handle(launcher): function flash_llama_chat (line 13) | async def flash_llama_chat(flash_llama_chat_handle): function test_flash_llama_simple (line 19) | async def test_flash_llama_simple(flash_llama_chat, response_snapshot): FILE: integration-tests/models/test_chat_stream_options.py function chat_handle (line 5) | def chat_handle(launcher): function chat_client (line 13) | async def chat_client(chat_handle): FILE: integration-tests/models/test_completion_prompts.py function flash_llama_completion_handle (line 8) | def flash_llama_completion_handle(launcher): function flash_llama_completion (line 16) | async def flash_llama_completion(flash_llama_completion_handle): function test_flash_llama_completion_single_prompt (line 26) | def test_flash_llama_completion_single_prompt( function test_flash_llama_completion_stream_usage (line 50) | async def test_flash_llama_completion_stream_usage( function test_flash_llama_completion_many_prompts (line 118) | def test_flash_llama_completion_many_prompts(flash_llama_completion, res... function test_flash_llama_completion_many_prompts_stream (line 154) | async def test_flash_llama_completion_many_prompts_stream( function test_chat_openai_usage (line 190) | async def test_chat_openai_usage(flash_llama_completion, response_snapsh... function test_chat_openai_nousage (line 214) | async def test_chat_openai_nousage(flash_llama_completion, response_snap... function test_chat_hfhub_usage (line 235) | async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot): function test_chat_hfhub_nousage (line 259) | async def test_chat_hfhub_nousage(flash_llama_completion, response_snaps... FILE: integration-tests/models/test_compressed_tensors_w8a8_int.py function compressed_tensors_w8a8_int_handle (line 5) | def compressed_tensors_w8a8_int_handle(launcher): function compressed_tensors_w8a8_int (line 15) | async def compressed_tensors_w8a8_int(compressed_tensors_w8a8_int_handle): function test_compressed_tensors_w8a8_int (line 23) | async def test_compressed_tensors_w8a8_int( function test_compressed_tensors_w8a8_int_all_params (line 43) | async def test_compressed_tensors_w8a8_int_all_params( function test_compressed_tensors_w8a8_int_load (line 73) | async def test_compressed_tensors_w8a8_int_load( FILE: integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py function compressed_tensors_w8a8_int_dynamic_weight_handle (line 5) | def compressed_tensors_w8a8_int_dynamic_weight_handle(launcher): function compressed_tensors_w8a8_int_dynamic_weight (line 15) | async def compressed_tensors_w8a8_int_dynamic_weight( function test_compressed_tensors_w8a8_int_dynamic_weight (line 25) | async def test_compressed_tensors_w8a8_int_dynamic_weight( function test_compressed_tensors_w8a8_int_dynamic_weight_all_params (line 46) | async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params( function test_compressed_tensors_w8a8_int_dynamic_weight_load (line 76) | async def test_compressed_tensors_w8a8_int_dynamic_weight_load( FILE: integration-tests/models/test_compressed_tensors_w8an_fp.py function compressed_tensors_w8an_handle (line 5) | def compressed_tensors_w8an_handle(launcher): function compressed_tensors_w8an (line 15) | async def compressed_tensors_w8an(compressed_tensors_w8an_handle): function test_compressed_tensors_w8an (line 23) | async def test_compressed_tensors_w8an(compressed_tensors_w8an, response... function test_compressed_tensors_w8an_all_params (line 39) | async def test_compressed_tensors_w8an_all_params( function test_compressed_tensors_w8an_load (line 69) | async def test_compressed_tensors_w8an_load( FILE: integration-tests/models/test_compressed_tensors_wna16_int.py function compressed_tensors_wna16_handle (line 5) | def compressed_tensors_wna16_handle(launcher): function compressed_tensors_wna16 (line 15) | async def compressed_tensors_wna16(compressed_tensors_wna16_handle): function test_compressed_tensors_wna16 (line 23) | async def test_compressed_tensors_wna16(compressed_tensors_wna16, respon... function test_compressed_tensors_wna16_all_params (line 39) | async def test_compressed_tensors_wna16_all_params( function test_compressed_tensors_wna16_load (line 69) | async def test_compressed_tensors_wna16_load( FILE: integration-tests/models/test_compressed_tensors_wna16_int_24.py function compressed_tensors_wna16_int_24_handle (line 5) | def compressed_tensors_wna16_int_24_handle(launcher): function compressed_tensors_wna16_int_24 (line 15) | async def compressed_tensors_wna16_int_24(compressed_tensors_wna16_int_2... function test_compressed_tensors_wna16_int_24 (line 23) | async def test_compressed_tensors_wna16_int_24( function test_compressed_tensors_wna16_int_24_all_params (line 43) | async def test_compressed_tensors_wna16_int_24_all_params( function test_compressed_tensors_wna16_int_24_load (line 73) | async def test_compressed_tensors_wna16_int_24_load( FILE: integration-tests/models/test_continue_final_message.py function llama_continue_final_message_handle (line 6) | def llama_continue_final_message_handle(launcher): function llama_continue_final_message (line 12) | async def llama_continue_final_message(llama_continue_final_message_hand... function test_llama_completion_single_prompt (line 17) | def test_llama_completion_single_prompt( function test_llama_completion_single_prompt_continue (line 46) | def test_llama_completion_single_prompt_continue( FILE: integration-tests/models/test_flash_awq.py function flash_llama_awq_handle (line 5) | def flash_llama_awq_handle(launcher): function flash_llama_awq (line 15) | async def flash_llama_awq(flash_llama_awq_handle): function test_flash_llama_awq (line 22) | async def test_flash_llama_awq(flash_llama_awq, response_snapshot): function test_flash_llama_awq_all_params (line 37) | async def test_flash_llama_awq_all_params(flash_llama_awq, response_snap... function test_flash_llama_awq_load (line 59) | async def test_flash_llama_awq_load(flash_llama_awq, generate_load, resp... FILE: integration-tests/models/test_flash_awq_sharded.py function flash_llama_awq_handle_sharded (line 5) | def flash_llama_awq_handle_sharded(launcher): function flash_llama_awq_sharded (line 15) | async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded): function test_flash_llama_awq_sharded (line 22) | async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response... function test_flash_llama_awq_load_sharded (line 37) | async def test_flash_llama_awq_load_sharded( FILE: integration-tests/models/test_flash_deepseek_v2.py function flash_deepseek_v2_handle (line 5) | def flash_deepseek_v2_handle(launcher): function flash_deepseek_v2 (line 11) | async def flash_deepseek_v2(flash_deepseek_v2_handle): function test_flash_deepseek_v2 (line 19) | async def test_flash_deepseek_v2(flash_deepseek_v2, response_snapshot): function test_flash_deepseek_v2_all_params (line 30) | async def test_flash_deepseek_v2_all_params(flash_deepseek_v2, response_... function test_flash_deepseek_v2_load (line 53) | async def test_flash_deepseek_v2_load( FILE: integration-tests/models/test_flash_falcon.py function flash_falcon_handle (line 5) | def flash_falcon_handle(launcher): function flash_falcon (line 11) | async def flash_falcon(flash_falcon_handle): function test_flash_falcon (line 19) | async def test_flash_falcon(flash_falcon, response_snapshot): function test_flash_falcon_all_params (line 33) | async def test_flash_falcon_all_params(flash_falcon, response_snapshot): function test_flash_falcon_load (line 57) | async def test_flash_falcon_load(flash_falcon, generate_load, response_s... FILE: integration-tests/models/test_flash_gemma.py function flash_gemma_handle (line 5) | def flash_gemma_handle(launcher): function flash_gemma (line 11) | async def flash_gemma(flash_gemma_handle): function test_flash_gemma_simple (line 19) | async def test_flash_gemma_simple(flash_gemma, response_snapshot): function test_flash_gemma_all_params (line 31) | async def test_flash_gemma_all_params(flash_gemma, response_snapshot): function test_flash_gemma_load (line 55) | async def test_flash_gemma_load(flash_gemma, generate_load, response_sna... FILE: integration-tests/models/test_flash_gemma2.py function flash_gemma2_handle (line 5) | def flash_gemma2_handle(launcher): function flash_gemma2 (line 11) | async def flash_gemma2(flash_gemma2_handle): function test_flash_gemma2 (line 19) | async def test_flash_gemma2(flash_gemma2, response_snapshot): function test_flash_gemma2_load (line 34) | async def test_flash_gemma2_load(flash_gemma2, generate_load, response_s... FILE: integration-tests/models/test_flash_gemma3.py function flash_gemma3_handle (line 9) | def flash_gemma3_handle(launcher): function flash_gemma3 (line 15) | async def flash_gemma3(flash_gemma3_handle): function test_flash_gemma3 (line 20) | async def test_flash_gemma3(flash_gemma3, response_snapshot): function test_flash_gemma3_image_cow_dog (line 35) | async def test_flash_gemma3_image_cow_dog(flash_gemma3, response_snapshot): function test_flash_gemma3_image_cow (line 62) | async def test_flash_gemma3_image_cow(flash_gemma3, response_snapshot): function test_exceed_window (line 85) | async def test_exceed_window(flash_gemma3, response_snapshot): function image_to_data_url (line 101) | def image_to_data_url(img: Image.Image, fmt: str) -> str: function test_flash_gemma3_image_base64_rgba (line 110) | async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_sna... function test_flash_gemma3_image_base64_rgb_png (line 133) | async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_... function test_flash_gemma3_image_base64_rgb_jpg (line 153) | async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_... FILE: integration-tests/models/test_flash_gemma_gptq.py function flash_gemma_gptq_handle (line 5) | def flash_gemma_gptq_handle(launcher): function flash_gemma_gptq (line 11) | async def flash_gemma_gptq(flash_gemma_gptq_handle): function test_flash_gemma_gptq (line 19) | async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_respons... function test_flash_gemma_gptq_all_params (line 31) | async def test_flash_gemma_gptq_all_params( function test_flash_gemma_gptq_load (line 57) | async def test_flash_gemma_gptq_load( FILE: integration-tests/models/test_flash_gpt2.py function flash_gpt2_handle (line 5) | def flash_gpt2_handle(launcher): function flash_gpt2 (line 11) | async def flash_gpt2(flash_gpt2_handle): function test_flash_gpt2 (line 18) | async def test_flash_gpt2(flash_gpt2, response_snapshot): function test_flash_gpt2_load (line 31) | async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snaps... FILE: integration-tests/models/test_flash_grammar_llama.py function flash_llama_grammar_handle (line 8) | def flash_llama_grammar_handle(launcher): function flash_llama_grammar (line 16) | async def flash_llama_grammar(flash_llama_grammar_handle): function test_flash_llama_grammar (line 22) | async def test_flash_llama_grammar(flash_llama_grammar, response_snapshot): function test_flash_llama_grammar_regex (line 33) | async def test_flash_llama_grammar_regex(flash_llama_grammar, response_s... function test_flash_llama_grammar_json (line 52) | async def test_flash_llama_grammar_json(flash_llama_grammar, response_sn... function test_flash_llama_grammar_load (line 101) | async def test_flash_llama_grammar_load( function test_flash_llama_grammar_single_load_instance (line 133) | async def test_flash_llama_grammar_single_load_instance( FILE: integration-tests/models/test_flash_llama.py function flash_llama_handle (line 5) | def flash_llama_handle(launcher): function flash_llama (line 11) | async def flash_llama(flash_llama_handle): function test_flash_llama_simple (line 18) | async def test_flash_llama_simple(flash_llama, response_snapshot): function test_flash_llama_all_params (line 29) | async def test_flash_llama_all_params(flash_llama, response_snapshot): function test_flash_llama_load (line 52) | async def test_flash_llama_load(flash_llama, generate_load, response_sna... FILE: integration-tests/models/test_flash_llama_exl2.py function flash_llama_exl2_handle (line 5) | def flash_llama_exl2_handle(launcher): function flash_llama_exl2 (line 19) | async def flash_llama_exl2(flash_llama_exl2_handle): function test_flash_llama_exl2 (line 26) | async def test_flash_llama_exl2(flash_llama_exl2, ignore_logprob_respons... function test_flash_llama_exl2_all_params (line 37) | async def test_flash_llama_exl2_all_params( function test_flash_llama_exl2_load (line 63) | async def test_flash_llama_exl2_load( FILE: integration-tests/models/test_flash_llama_fp8.py function flash_llama_fp8_handle (line 5) | def flash_llama_fp8_handle(launcher): function flash_llama_fp8 (line 11) | async def flash_llama_fp8(flash_llama_fp8_handle): function test_flash_llama_fp8 (line 20) | async def test_flash_llama_fp8(flash_llama_fp8, response_snapshot): function test_flash_llama_fp8_all_params (line 34) | async def test_flash_llama_fp8_all_params(flash_llama_fp8, response_snap... function test_flash_llama_fp8_load (line 58) | async def test_flash_llama_fp8_load(flash_llama_fp8, generate_load, resp... FILE: integration-tests/models/test_flash_llama_fp8_kv_cache.py function flash_llama_fp8_kv_cache_handle (line 5) | def flash_llama_fp8_kv_cache_handle(launcher): function flash_llama_fp8_kv_cache (line 15) | async def flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache_handle): function test_flash_llama_fp8_kv_cache (line 23) | async def test_flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache, respon... function test_flash_llama_fp8_kv_cache_all_params (line 39) | async def test_flash_llama_fp8_kv_cache_all_params( function test_flash_llama_fp8_kv_cache_load (line 64) | async def test_flash_llama_fp8_kv_cache_load( FILE: integration-tests/models/test_flash_llama_gptq.py function flash_llama_gptq_handle (line 5) | def flash_llama_gptq_handle(launcher): function flash_llama_gptq (line 13) | async def flash_llama_gptq(flash_llama_gptq_handle): function test_flash_llama_gptq (line 21) | async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot): function test_flash_llama_gptq_all_params (line 33) | async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_sn... function test_flash_llama_gptq_load (line 56) | async def test_flash_llama_gptq_load( FILE: integration-tests/models/test_flash_llama_marlin.py function flash_llama_marlin_handle (line 5) | def flash_llama_marlin_handle(launcher): function flash_llama_marlin (line 13) | async def flash_llama_marlin(flash_llama_marlin_handle): function test_flash_llama_marlin (line 21) | async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot): function test_flash_llama_marlin_all_params (line 33) | async def test_flash_llama_marlin_all_params(flash_llama_marlin, respons... function test_flash_llama_marlin_load (line 56) | async def test_flash_llama_marlin_load( FILE: integration-tests/models/test_flash_llama_marlin_24.py function flash_llama_marlin24_handle (line 5) | def flash_llama_marlin24_handle(launcher): function flash_llama_marlin (line 13) | async def flash_llama_marlin(flash_llama_marlin24_handle): function test_flash_llama_marlin (line 22) | async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot): function test_flash_llama_marlin24_all_params (line 35) | async def test_flash_llama_marlin24_all_params(flash_llama_marlin, respo... function test_flash_llama_marlin24_load (line 59) | async def test_flash_llama_marlin24_load( FILE: integration-tests/models/test_flash_llama_prefix.py function flash_llama_handle (line 5) | def flash_llama_handle(launcher): function flash_llama (line 11) | async def flash_llama(flash_llama_handle): function test_flash_llama_load (line 18) | async def test_flash_llama_load( FILE: integration-tests/models/test_flash_llama_prefix_flashdecoding.py function flash_llama_handle_fd (line 5) | def flash_llama_handle_fd(launcher): function flash_llama_fd (line 13) | async def flash_llama_fd(flash_llama_handle_fd): function test_flash_llama_flashdecoding (line 20) | async def test_flash_llama_flashdecoding( FILE: integration-tests/models/test_flash_medusa.py function flash_medusa_handle (line 5) | def flash_medusa_handle(launcher): function flash_medusa (line 13) | async def flash_medusa(flash_medusa_handle): function test_flash_medusa_simple (line 19) | async def test_flash_medusa_simple(flash_medusa, response_snapshot): function test_flash_medusa_all_params (line 29) | async def test_flash_medusa_all_params(flash_medusa, response_snapshot): function test_flash_medusa_load (line 51) | async def test_flash_medusa_load(flash_medusa, generate_load, response_s... FILE: integration-tests/models/test_flash_mistral.py function flash_mistral_handle (line 5) | def flash_mistral_handle(launcher): function flash_mistral (line 11) | async def flash_mistral(flash_mistral_handle): function test_flash_mistral (line 17) | async def test_flash_mistral(flash_mistral, response_snapshot): function test_flash_mistral_all_params (line 28) | async def test_flash_mistral_all_params(flash_mistral, response_snapshot): function test_flash_mistral_load (line 50) | async def test_flash_mistral_load(flash_mistral, generate_load, response... FILE: integration-tests/models/test_flash_mixtral.py function flash_mixtral_handle (line 5) | def flash_mixtral_handle(launcher): function flash_mixtral (line 11) | async def flash_mixtral(flash_mixtral_handle): function test_flash_mixtral (line 18) | async def test_flash_mixtral(flash_mixtral, response_snapshot): function test_flash_mixtral_all_params (line 33) | async def test_flash_mixtral_all_params(flash_mixtral, response_snapshot): function test_flash_mixtral_load (line 60) | async def test_flash_mixtral_load(flash_mixtral, generate_load, response... FILE: integration-tests/models/test_flash_mixtral_awq.py function flash_mixtral_awq_handle (line 5) | def flash_mixtral_awq_handle(launcher): function flash_mixtral_awq (line 11) | async def flash_mixtral_awq(flash_mixtral_awq_handle): function test_flash_mixtral_awq (line 17) | async def test_flash_mixtral_awq(flash_mixtral_awq, response_snapshot): function test_flash_mixtral_awq_all_params (line 30) | async def test_flash_mixtral_awq_all_params(flash_mixtral_awq, response_... function test_flash_mixtral_awq_load (line 56) | async def test_flash_mixtral_awq_load( FILE: integration-tests/models/test_flash_mixtral_gptq.py function flash_mixtral_gptq_handle (line 5) | def flash_mixtral_gptq_handle(launcher): function flash_mixtral_gptq (line 15) | async def flash_mixtral_gptq(flash_mixtral_gptq_handle): function test_flash_mixtral_gptq (line 21) | async def test_flash_mixtral_gptq(flash_mixtral_gptq, response_snapshot): function test_flash_mixtral_gptq_all_params (line 35) | async def test_flash_mixtral_gptq_all_params(flash_mixtral_gptq, respons... function test_flash_mixtral_gptq_load (line 61) | async def test_flash_mixtral_gptq_load( FILE: integration-tests/models/test_flash_neox.py function flash_neox_handle (line 5) | def flash_neox_handle(launcher): function flash_neox (line 11) | async def flash_neox(flash_neox_handle): function test_flash_neox (line 19) | async def test_flash_neox(flash_neox, response_snapshot): function test_flash_neox_load (line 33) | async def test_flash_neox_load(flash_neox, generate_load, response_snaps... FILE: integration-tests/models/test_flash_neox_sharded.py function flash_neox_sharded_handle (line 5) | def flash_neox_sharded_handle(launcher): function flash_neox_sharded (line 11) | async def flash_neox_sharded(flash_neox_sharded_handle): function test_flash_neox (line 18) | async def test_flash_neox(flash_neox_sharded, response_snapshot): function test_flash_neox_load (line 31) | async def test_flash_neox_load(flash_neox_sharded, generate_load, respon... FILE: integration-tests/models/test_flash_pali_gemma.py function flash_pali_gemma_handle (line 5) | def flash_pali_gemma_handle(launcher): function flash_pali_gemma (line 17) | async def flash_pali_gemma(flash_pali_gemma_handle): function test_flash_pali_gemma (line 25) | async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot, cow... function test_flash_pali_gemma_two_images (line 36) | async def test_flash_pali_gemma_two_images( FILE: integration-tests/models/test_flash_pali_gemma2.py function flash_pali_gemma_handle (line 5) | def flash_pali_gemma_handle(launcher): function flash_pali_gemma (line 13) | async def flash_pali_gemma(flash_pali_gemma_handle): function test_flash_pali_gemma_image (line 18) | async def test_flash_pali_gemma_image(flash_pali_gemma, response_snapshot): FILE: integration-tests/models/test_flash_phi.py function flash_phi_handle (line 5) | def flash_phi_handle(launcher): function flash_phi (line 11) | async def flash_phi(flash_phi_handle): function test_flash_phi (line 18) | async def test_flash_phi(flash_phi, response_snapshot): function test_flash_phi_all_params (line 30) | async def test_flash_phi_all_params(flash_phi, response_snapshot): function test_flash_phi_load (line 54) | async def test_flash_phi_load(flash_phi, generate_load, response_snapshot): FILE: integration-tests/models/test_flash_phi35_moe.py function flash_phi35_moe_handle (line 5) | def flash_phi35_moe_handle(launcher): function flash_phi35_moe (line 14) | async def flash_phi35_moe(flash_phi35_moe_handle): function test_flash_phi35_moe (line 20) | async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot): function test_flash_phi35_moe_all_params (line 34) | async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snap... function test_flash_phi35_moe_load (line 60) | async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, resp... FILE: integration-tests/models/test_flash_qwen2.py function flash_qwen2_handle (line 5) | def flash_qwen2_handle(launcher): function flash_qwen2 (line 11) | async def flash_qwen2(flash_qwen2_handle): function test_flash_qwen2 (line 18) | async def test_flash_qwen2(flash_qwen2, response_snapshot): function test_flash_qwen2_all_params (line 30) | async def test_flash_qwen2_all_params(flash_qwen2, response_snapshot): function test_flash_qwen2_load (line 53) | async def test_flash_qwen2_load(flash_qwen2, generate_load, response_sna... FILE: integration-tests/models/test_flash_qwen2_5_vl.py function flash_qwen2_5_vl_handle (line 5) | def flash_qwen2_5_vl_handle(launcher): function flash_qwen2_5 (line 11) | async def flash_qwen2_5(flash_qwen2_5_vl_handle): function test_flash_qwen2_5_vl_simple (line 17) | async def test_flash_qwen2_5_vl_simple(flash_qwen2_5, response_snapshot): function test_flash_qwen2_5_vl_simple_streaming (line 45) | async def test_flash_qwen2_5_vl_simple_streaming(flash_qwen2_5, response... function test_flash_qwen2_5_vl_bay (line 82) | async def test_flash_qwen2_5_vl_bay(flash_qwen2_5, response_snapshot): function test_flash_qwen2_5_vl_inpaint (line 104) | async def test_flash_qwen2_5_vl_inpaint(flash_qwen2_5, response_snapshot): FILE: integration-tests/models/test_flash_qwen2_vl.py function flash_qwen2_vl_handle (line 5) | def flash_qwen2_vl_handle(launcher): function flash_qwen2 (line 11) | async def flash_qwen2(flash_qwen2_vl_handle): function test_flash_qwen2_vl_simple (line 17) | async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): function test_flash_qwen2_vl_simple_streaming (line 45) | async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_sna... function test_flash_qwen2_vl_bay (line 82) | async def test_flash_qwen2_vl_bay(flash_qwen2, response_snapshot): function test_flash_qwen2_vl_inpaint (line 104) | async def test_flash_qwen2_vl_inpaint(flash_qwen2, response_snapshot): FILE: integration-tests/models/test_flash_santacoder.py function flash_santacoder_handle (line 5) | def flash_santacoder_handle(launcher): function flash_santacoder (line 11) | async def flash_santacoder(flash_santacoder_handle): function test_flash_santacoder (line 18) | async def test_flash_santacoder(flash_santacoder, response_snapshot): function test_flash_santacoder_load (line 29) | async def test_flash_santacoder_load( FILE: integration-tests/models/test_flash_starcoder.py function flash_starcoder_handle (line 5) | def flash_starcoder_handle(launcher): function flash_starcoder (line 11) | async def flash_starcoder(flash_starcoder_handle): function test_flash_starcoder (line 19) | async def test_flash_starcoder(flash_starcoder, response_snapshot): function test_flash_starcoder_default_params (line 31) | async def test_flash_starcoder_default_params(flash_starcoder, response_... function test_flash_starcoder_load (line 48) | async def test_flash_starcoder_load(flash_starcoder, generate_load, resp... FILE: integration-tests/models/test_flash_starcoder2.py function flash_starcoder2_handle (line 5) | def flash_starcoder2_handle(launcher): function flash_starcoder2 (line 11) | async def flash_starcoder2(flash_starcoder2_handle): function test_flash_starcoder2 (line 19) | async def test_flash_starcoder2(flash_starcoder2, response_snapshot): function test_flash_starcoder2_default_params (line 31) | async def test_flash_starcoder2_default_params(flash_starcoder2, respons... function test_flash_starcoder2_load (line 48) | async def test_flash_starcoder2_load( FILE: integration-tests/models/test_flash_starcoder2_lora.py function flash_starcoder2_handle (line 6) | def flash_starcoder2_handle(launcher): function flash_starcoder2 (line 14) | async def flash_starcoder2(flash_starcoder2_handle): function test_flash_starcoder2 (line 20) | async def test_flash_starcoder2(flash_starcoder2, response_snapshot): function test_flash_starcoder2_default_params (line 30) | async def test_flash_starcoder2_default_params(flash_starcoder2, respons... function test_flash_starcoder2_load (line 45) | async def test_flash_starcoder2_load( function test_flash_starcoder2_with_hugcode_adapter (line 59) | async def test_flash_starcoder2_with_hugcode_adapter( FILE: integration-tests/models/test_flash_starcoder_gptq.py function flash_starcoder_gptq_handle (line 5) | def flash_starcoder_gptq_handle(launcher): function flash_starcoder_gptq (line 11) | async def flash_starcoder_gptq(flash_starcoder_gptq_handle): function test_flash_starcoder_gptq (line 18) | async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_respo... function test_flash_starcoder_gptq_load (line 49) | async def test_flash_starcoder_gptq_load( FILE: integration-tests/models/test_grammar_llama.py function non_flash_llama_grammar_handle (line 8) | def non_flash_llama_grammar_handle(launcher): function non_flash_llama_grammar (line 19) | async def non_flash_llama_grammar(non_flash_llama_grammar_handle): function test_non_flash_llama_grammar_json (line 27) | async def test_non_flash_llama_grammar_json(non_flash_llama_grammar, res... FILE: integration-tests/models/test_grammar_response_format_llama.py function llama_grammar_handle (line 8) | def llama_grammar_handle(launcher): function llama_grammar (line 20) | async def llama_grammar(llama_grammar_handle): function test_grammar_response_format_llama_json (line 27) | async def test_grammar_response_format_llama_json(llama_grammar, respons... function test_grammar_response_format_llama_error_if_tools_not_installed (line 103) | async def test_grammar_response_format_llama_error_if_tools_not_installed( FILE: integration-tests/models/test_idefics.py function idefics_handle (line 5) | def idefics_handle(launcher): function idefics (line 13) | async def idefics(idefics_handle): function test_idefics (line 19) | async def test_idefics(idefics, response_snapshot, chicken): function test_idefics_two_images (line 36) | async def test_idefics_two_images(idefics, response_snapshot, chicken, c... function test_idefics_load (line 49) | async def test_idefics_load(idefics, generate_load, response_snapshot, c... FILE: integration-tests/models/test_idefics2.py function flash_idefics2_next_handle (line 5) | def flash_idefics2_next_handle(launcher): function flash_idefics2_next (line 13) | async def flash_idefics2_next(flash_idefics2_next_handle): function test_flash_idefics2_next_simple (line 20) | async def test_flash_idefics2_next_simple( function test_flash_idefics2_two_images (line 36) | async def test_flash_idefics2_two_images( function test_flash_idefics2_next_all_params (line 53) | async def test_flash_idefics2_next_all_params(flash_idefics2_next, respo... function test_flash_idefics2_next_load (line 76) | async def test_flash_idefics2_next_load( FILE: integration-tests/models/test_idefics3.py function flash_idefics3_next_handle (line 5) | def flash_idefics3_next_handle(launcher): function flash_idefics3_next (line 11) | async def flash_idefics3_next(flash_idefics3_next_handle): function test_flash_idefics3_next_simple_url (line 18) | async def test_flash_idefics3_next_simple_url(flash_idefics3_next, respo... FILE: integration-tests/models/test_json_schema_constrain.py function model_handle (line 7) | def model_handle(launcher): function model_fixture (line 18) | async def model_fixture(model_handle): function test_json_schema_basic (line 77) | async def test_json_schema_basic(model_fixture, response_snapshot): function test_json_schema_complex (line 115) | async def test_json_schema_complex(model_fixture, response_snapshot): function test_json_schema_stream (line 159) | async def test_json_schema_stream(model_fixture, response_snapshot): FILE: integration-tests/models/test_llava_next.py function flash_llava_next_handle (line 5) | def flash_llava_next_handle(launcher): function flash_llava_next (line 16) | async def flash_llava_next(flash_llava_next_handle): function test_flash_llava_next_simple (line 24) | async def test_flash_llava_next_simple(flash_llava_next, response_snapsh... function test_flash_llava_next_all_params (line 39) | async def test_flash_llava_next_all_params(flash_llava_next, response_sn... function test_flash_llava_next_load (line 63) | async def test_flash_llava_next_load( FILE: integration-tests/models/test_lora_mistral.py function lora_mistral_handle (line 6) | def lora_mistral_handle(launcher): function lora_mistral (line 19) | async def lora_mistral(lora_mistral_handle): function test_lora_mistral (line 26) | async def test_lora_mistral(lora_mistral, response_snapshot): function test_lora_mistral_without_adapter (line 38) | async def test_lora_mistral_without_adapter(lora_mistral, response_snaps... function test_lora_mistral_with_dbpedia_adapter (line 62) | async def test_lora_mistral_with_dbpedia_adapter(lora_mistral, response_... function test_lora_mistral_with_customer_support_adapter (line 84) | async def test_lora_mistral_with_customer_support_adapter( function test_lora_mistral_without_customer_support_adapter (line 113) | async def test_lora_mistral_without_customer_support_adapter( FILE: integration-tests/models/test_mamba.py function fused_kernel_mamba_handle (line 5) | def fused_kernel_mamba_handle(launcher): function fused_kernel_mamba (line 11) | async def fused_kernel_mamba(fused_kernel_mamba_handle): function test_mamba (line 18) | async def test_mamba(fused_kernel_mamba, response_snapshot): function test_mamba_all_params (line 30) | async def test_mamba_all_params(fused_kernel_mamba, response_snapshot): function test_mamba_load (line 57) | async def test_mamba_load( FILE: integration-tests/models/test_mllama.py function mllama_handle (line 6) | def mllama_handle(launcher): function mllama (line 15) | async def mllama(mllama_handle): function test_mllama_simpl (line 21) | async def test_mllama_simpl(mllama, response_snapshot): function test_mllama_load (line 58) | async def test_mllama_load(mllama, generate_load, response_snapshot): FILE: integration-tests/models/test_mpt.py function mpt_sharded_handle (line 5) | def mpt_sharded_handle(launcher): function mpt_sharded (line 11) | async def mpt_sharded(mpt_sharded_handle): function test_mpt (line 18) | async def test_mpt(mpt_sharded, response_snapshot): function test_mpt_load (line 35) | async def test_mpt_load(mpt_sharded, generate_load, response_snapshot): FILE: integration-tests/models/test_mt0_base.py function mt0_base_handle (line 5) | def mt0_base_handle(launcher): function mt0_base (line 11) | async def mt0_base(mt0_base_handle): function test_mt0_base (line 18) | async def test_mt0_base(mt0_base, response_snapshot): function test_mt0_base_all_params (line 33) | async def test_mt0_base_all_params(mt0_base, response_snapshot): function test_mt0_base_load (line 56) | async def test_mt0_base_load(mt0_base, generate_load, response_snapshot): FILE: integration-tests/models/test_neox.py function neox_handle (line 5) | def neox_handle(launcher): function neox (line 13) | async def neox(neox_handle): function test_neox (line 21) | async def test_neox(neox, response_snapshot): function test_neox_load (line 35) | async def test_neox_load(neox, generate_load, response_snapshot): FILE: integration-tests/models/test_neox_sharded.py function neox_sharded_handle (line 5) | def neox_sharded_handle(launcher): function neox_sharded (line 13) | async def neox_sharded(neox_sharded_handle): function test_neox (line 21) | async def test_neox(neox_sharded, response_snapshot): function test_neox_load (line 35) | async def test_neox_load(neox_sharded, generate_load, response_snapshot): FILE: integration-tests/models/test_opt.py function opt_sharded_handle (line 5) | def opt_sharded_handle(launcher): function opt_sharded (line 11) | async def opt_sharded(opt_sharded_handle): function test_opt (line 18) | async def test_opt(opt_sharded): FILE: integration-tests/models/test_smolvlm.py function flash_smolvlm_next_handle (line 5) | def flash_smolvlm_next_handle(launcher): function flash_smolvlm_next (line 11) | async def flash_smolvlm_next(flash_smolvlm_next_handle): function test_flash_smolvlm_next_simple_url (line 18) | async def test_flash_smolvlm_next_simple_url(flash_smolvlm_next, respons... FILE: integration-tests/models/test_t5_sharded.py function t5_sharded_handle (line 5) | def t5_sharded_handle(launcher): function t5_sharded (line 11) | async def t5_sharded(t5_sharded_handle): function test_t5_sharded (line 18) | async def test_t5_sharded(t5_sharded, response_snapshot): function test_t5_sharded_load (line 30) | async def test_t5_sharded_load(t5_sharded, generate_load, response_snaps... FILE: integration-tests/models/test_tools_llama.py function flash_llama_grammar_tools_handle (line 11) | def flash_llama_grammar_tools_handle(launcher): function flash_llama_grammar_tools (line 21) | async def flash_llama_grammar_tools(flash_llama_grammar_tools_handle): function test_flash_llama_grammar_tools_nostream (line 83) | async def test_flash_llama_grammar_tools_nostream( function test_flash_llama_grammar_tools_openai (line 120) | async def test_flash_llama_grammar_tools_openai( function test_flash_llama_grammar_tools_auto_nostream (line 159) | async def test_flash_llama_grammar_tools_auto_nostream( function test_flash_llama_grammar_tools_choice_nostream (line 198) | async def test_flash_llama_grammar_tools_choice_nostream( function test_flash_llama_grammar_tools_choice_stream (line 237) | async def test_flash_llama_grammar_tools_choice_stream( function test_flash_llama_grammar_tools_insufficient_information_nostream (line 277) | async def test_flash_llama_grammar_tools_insufficient_information_nostream( function test_flash_llama_grammar_tools_insufficient_information_stream (line 311) | async def test_flash_llama_grammar_tools_insufficient_information_stream( function test_flash_llama_grammar_tools_sea_creatures_stream_auto (line 350) | async def test_flash_llama_grammar_tools_sea_creatures_stream_auto( function test_flash_llama_grammar_tools_sea_creatures_stream_required (line 388) | async def test_flash_llama_grammar_tools_sea_creatures_stream_required( function test_flash_llama_grammar_tools_sea_creatures_stream_none (line 429) | async def test_flash_llama_grammar_tools_sea_creatures_stream_none( function test_flash_llama_grammar_tools_sea_creatures_stream_function_object (line 467) | async def test_flash_llama_grammar_tools_sea_creatures_stream_function_o... function test_flash_llama_tool_reply_response (line 510) | async def test_flash_llama_tool_reply_response( FILE: integration-tests/models/test_transformers_olmo.py function flash_llama_handle (line 5) | def flash_llama_handle(launcher): function flash_llama (line 11) | async def flash_llama(flash_llama_handle): function test_flash_llama_simple (line 18) | async def test_flash_llama_simple(flash_llama, response_snapshot): function test_flash_llama_load (line 30) | async def test_flash_llama_load(flash_llama, generate_load, response_sna... FILE: integration-tests/neuron/test_generate.py function tgi_service (line 5) | async def tgi_service(neuron_launcher, neuron_model_config): function test_model_single_request (line 14) | async def test_model_single_request(tgi_service): function test_model_multiple_requests (line 67) | async def test_model_multiple_requests(tgi_service, neuron_generate_load): FILE: integration-tests/neuron/test_implicit_env.py function tgi_service (line 7) | async def tgi_service(request, neuron_launcher, neuron_model_config): function test_model_single_request (line 39) | async def test_model_single_request(tgi_service): FILE: launcher/build.rs function main (line 4) | fn main() -> Result<(), Box> { FILE: launcher/src/env_runtime.rs type Env (line 4) | pub(crate) struct Env { method new (line 15) | pub fn new() -> Self { method fmt (line 33) | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { function nvidia_smi (line 48) | fn nvidia_smi() -> Option { function xpu_smi (line 55) | fn xpu_smi() -> Option { function hl_smi (line 62) | fn hl_smi() -> Option { FILE: launcher/src/gpu.rs function get_cuda_capability (line 1) | pub fn get_cuda_capability() -> Option<(usize, usize)> { FILE: launcher/src/main.rs function compute_optimal (line 28) | fn compute_optimal(config: Option<&Config>, compute: Option<&ComputeType... function human_size (line 48) | fn human_size(size: usize, suffix: &str) -> String { function vram_maximum (line 62) | fn vram_maximum( function get_config (line 91) | fn get_config( function resolve_attention (line 131) | fn resolve_attention(config: &Option, lora_adapters: &Option Option { method flop (line 281) | fn flop(&self) -> Option { method kv_vram_per_tok (line 310) | fn kv_vram_per_tok(&self) -> Option { method mlp_vram_per_tok (line 320) | fn mlp_vram_per_tok(&self) -> Option { method token_vram (line 330) | fn token_vram(&self) -> Option { method model_vram (line 337) | fn model_vram(&self) -> Option { method from (line 355) | fn from(other: RawConfig) -> Self { type Quantization (line 407) | enum Quantization { method fmt (line 450) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { type Dtype (line 490) | enum Dtype { method fmt (line 497) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { type KVCacheDtype (line 511) | enum KVCacheDtype { method fmt (line 520) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { type RopeScaling (line 533) | enum RopeScaling { method fmt (line 539) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { type UsageStatsLevel (line 553) | pub enum UsageStatsLevel { method fmt (line 563) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { type Args (line 582) | struct Args { type ShardStatus (line 917) | enum ShardStatus { function shard_manager (line 923) | fn shard_manager( function shutdown_shards (line 1251) | fn shutdown_shards(shutdown: Arc, shutdown_receiver: &mpsc::... function num_cuda_devices (line 1262) | fn num_cuda_devices() -> Option { type PythonLogLevelEnum (line 1296) | enum PythonLogLevelEnum { type PythonLogLevel (line 1307) | struct PythonLogLevel { type PythonLogRecord (line 1312) | struct PythonLogRecord { type PythonLogMessage (line 1317) | struct PythonLogMessage { method trace (line 1323) | fn trace(&self) { type Error (line 1337) | type Error = serde_json::Error; method try_from (line 1339) | fn try_from(value: &[u8]) -> Result { function log_lines (line 1344) | fn log_lines(mut bufread: BufReader) { function find_num_shards (line 1374) | fn find_num_shards( type LauncherError (line 1415) | enum LauncherError { function download_convert_model (line 1434) | fn download_convert_model( function spawn_shards (line 1579) | fn spawn_shards( type Gpu (line 1692) | enum Gpu { method from (line 1713) | fn from(value: &str) -> Self { method fmt (line 1736) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { type ComputeType (line 1707) | struct ComputeType { method f16_flop (line 1754) | fn f16_flop(&self) -> Option { method vram (line 1787) | fn vram(&self, memory_fraction: f32) -> Option { method from (line 1814) | fn from(value: ComputeType) -> Self { function compute_type (line 1819) | fn compute_type(count: usize) -> Option { function spawn_webserver (line 1831) | fn spawn_webserver( function terminate (line 2015) | fn terminate(process_name: &str, mut process: Child, timeout: Duration) ... function main (line 2038) | fn main() -> Result<(), LauncherError> { FILE: load_tests/benchmarks.py class InferenceEngineRunner (line 15) | class InferenceEngineRunner: method __init__ (line 16) | def __init__(self, model: str): method run (line 19) | def run(self, parameters: list[tuple], gpus: int = 0): method stop (line 22) | def stop(self): class TGIDockerRunner (line 26) | class TGIDockerRunner(InferenceEngineRunner): method __init__ (line 27) | def __init__( method run (line 40) | def run(self, parameters: list[tuple], gpus: int = 0): method stop (line 58) | def stop(self): class BenchmarkRunner (line 63) | class BenchmarkRunner: method __init__ (line 64) | def __init__( method run (line 75) | def run(self, parameters: list[tuple], network_mode): method stop (line 98) | def stop(self): function run_docker (line 103) | def run_docker( function get_gpu_names (line 150) | def get_gpu_names() -> str: function get_gpu_name (line 157) | def get_gpu_name() -> str: function get_num_gpus (line 164) | def get_num_gpus() -> int: function build_df (line 168) | def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame: function main (line 190) | def main(sha, results_file): FILE: load_tests/common.js function get_options (line 18) | function get_options() { function generate_payload (line 60) | function generate_payload(gpt, max_new_tokens) { function run (line 67) | function run() { FILE: load_tests/filter.py function main (line 4) | def main(): FILE: load_tests/long.js function get_options (line 18) | function get_options() { function generate_payload (line 60) | function generate_payload(gpt, max_new_tokens) { function run (line 67) | function run() { FILE: load_tests/orca.py function main (line 6) | def main(): FILE: router/build.rs function main (line 4) | fn main() -> Result<(), Box> { FILE: router/src/chat.rs type ToolCall (line 10) | struct ToolCall { type Call (line 17) | struct Call { type ChatEvent (line 22) | pub(crate) enum ChatEvent { type ChatChoice (line 28) | pub(crate) enum ChatChoice { function parse_output (line 33) | pub(crate) fn parse_output(generated_text: &str) -> Result ChatEvent { function get_tool_call_content (line 309) | fn get_tool_call_content(event: &CompletionType) -> (Option<&String>, &S... function test_chat_stream (line 338) | fn test_chat_stream() { function test_chat_stream_usage (line 388) | fn test_chat_stream_usage() { function test_chat_stream_tool_no_tool_simple (line 457) | fn test_chat_stream_tool_no_tool_simple() { function test_chat_stream_tool_no_tool_empty (line 530) | fn test_chat_stream_tool_no_tool_empty() { function test_chat_stream_tool_get_weather (line 596) | fn test_chat_stream_tool_get_weather() { FILE: router/src/config.rs type LlavaNext (line 7) | pub struct LlavaNext { method get_number_of_features (line 89) | pub fn get_number_of_features(&self, height: usize, width: usize) -> u... function get_anyres_image_grid_shape (line 13) | fn get_anyres_image_grid_shape( function select_best_resolution (line 26) | fn select_best_resolution( function get_unpadded_features (line 61) | fn get_unpadded_features( type Llama4VisionConfig (line 109) | pub struct Llama4VisionConfig { type Llama4 (line 117) | pub struct Llama4 { method image_size (line 221) | pub fn image_size(&self) -> usize { method patch_size (line 225) | pub fn patch_size(&self) -> usize { method pixel_shuffle_ratio (line 229) | pub fn pixel_shuffle_ratio(&self) -> f64 { method get_aspect_ratios (line 232) | pub fn get_aspect_ratios( function gcd (line 122) | fn gcd(a: usize, b: usize) -> usize { function get_factors (line 130) | fn get_factors(dividend: usize) -> HashSet { function find_supported_resolutions (line 143) | fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> V... function get_best_fit (line 173) | fn get_best_fit( type ClipVisionModel (line 247) | pub struct ClipVisionModel { type Idefics3 (line 254) | pub struct Idefics3 {} method get_max_longest_edge (line 257) | pub fn get_max_longest_edge(&self) -> usize { method get_number_of_features (line 261) | pub fn get_number_of_features(&self) -> usize { method get_max_longest_edge_for_image_resize (line 265) | pub fn get_max_longest_edge_for_image_resize(&self) -> usize { method get_max_image_size (line 269) | pub fn get_max_image_size(&self) -> usize { type Idefics2 (line 276) | pub struct Idefics2 {} method get_number_of_features (line 279) | pub fn get_number_of_features(&self, _height: usize, _width: usize) ->... type PaliTextConfig (line 286) | pub struct PaliTextConfig { type Paligemma (line 292) | pub struct Paligemma { method get_number_of_features (line 297) | pub fn get_number_of_features(&self, _height: usize, _width: usize) ->... type Qwen2VlVisionConfig (line 304) | pub struct Qwen2VlVisionConfig { type Qwen2Vl (line 319) | pub struct Qwen2Vl { method get_number_of_features (line 324) | pub fn get_number_of_features(&self, height: usize, width: usize) -> u... type Qwen2_5VlVisionConfig (line 332) | pub struct Qwen2_5VlVisionConfig { type Qwen2_5Vl (line 351) | pub struct Qwen2_5Vl { method get_number_of_features (line 356) | pub fn get_number_of_features(&self, height: usize, width: usize) -> u... type Gemma3VisionConfig (line 364) | pub struct Gemma3VisionConfig { type Gemma3 (line 371) | pub struct Gemma3 { type Config (line 378) | pub enum Config { type TextConfig (line 426) | pub struct TextConfig {} type VisionConfig (line 430) | pub struct VisionConfig { function test_llava_next_features (line 440) | fn test_llava_next_features() { FILE: router/src/infer/chat_template.rs function raise_exception (line 10) | pub(crate) fn raise_exception(err_text: String) -> Result Result bool; method start_health (line 40) | fn start_health(&self) -> bool { method name (line 44) | fn name(&self) -> &'static str; type Infer (line 49) | pub struct Infer { method new (line 64) | pub(crate) fn new( method generate_stream (line 100) | pub(crate) async fn generate_stream<'a>( method tokenize (line 204) | pub(crate) async fn tokenize( method apply_chat_template (line 227) | pub(crate) fn apply_chat_template( method generate (line 245) | pub(crate) async fn generate( method generate_best_of (line 321) | pub(crate) async fn generate_best_of( method health (line 357) | pub(crate) async fn health(&self) -> bool { type GeneratedText (line 368) | pub struct GeneratedText { type InferStreamResponse (line 376) | pub enum InferStreamResponse { type InferResponse (line 395) | pub(crate) struct InferResponse { type InferError (line 409) | pub enum InferError { method error_type (line 431) | pub(crate) fn error_type(&self) -> &str { method into_openai_event (line 445) | pub(crate) fn into_openai_event(self) -> Event { type APIError (line 458) | pub struct APIError { type OpenaiErrorEvent (line 464) | pub struct OpenaiErrorEvent { FILE: router/src/infer/tool_grammar.rs type ToolGrammar (line 8) | pub(crate) struct ToolGrammar {} method find_tool_by_name (line 12) | fn find_tool_by_name(tools: &[Tool], name: &str) -> Result Json { function kserve_health_ready (line 92) | pub async fn kserve_health_ready() -> Json { function kerve_server_metadata (line 107) | pub async fn kerve_server_metadata() -> Json { function kserve_model_metadata (line 130) | pub async fn kserve_model_metadata( function kserve_model_metadata_ready (line 151) | pub async fn kserve_model_metadata_ready( function kserve_model_infer (line 169) | pub async fn kserve_model_infer( FILE: router/src/lib.rs type Tokenizer (line 29) | pub enum Tokenizer { type PyTokenizer (line 38) | pub struct PyTokenizer<'a>(pyo3::Bound<'a, pyo3::PyAny>); function from_py (line 41) | fn from_py( type TokenizerTrait (line 66) | trait TokenizerTrait { method encode_trait (line 67) | fn encode_trait( method encode_trait (line 75) | fn encode_trait( method encode_trait (line 85) | fn encode_trait( type HubModelInfo (line 114) | pub struct HubModelInfo { type ChatTemplate (line 122) | pub struct ChatTemplate { type ChatTemplateVersions (line 129) | pub enum ChatTemplateVersions { type HubTokenizerConfig (line 137) | pub struct HubTokenizerConfig { method from_file (line 148) | pub fn from_file>(filename: P) -> Option { type ChatTemplateStandalone (line 156) | pub struct ChatTemplateStandalone { type TokenizerConfigToken (line 162) | pub enum TokenizerConfigToken { method as_str (line 168) | pub fn as_str(&self) -> &str { type HubPreprocessorConfig (line 178) | pub enum HubPreprocessorConfig { method from_file (line 186) | pub fn from_file>(filename: P) -> Option>(filename: P) -> Option { type JsonSchemaConfig (line 227) | struct JsonSchemaConfig { type GrammarType (line 239) | pub(crate) enum GrammarType { type Info (line 261) | pub struct Info { type GenerateParameters (line 303) | pub(crate) struct GenerateParameters { function default_parameters (line 436) | fn default_parameters() -> GenerateParameters { type Prompt (line 462) | pub struct Prompt(pub Vec); type Error (line 472) | type Error = String; method try_from (line 474) | fn try_from(value: PromptDeserializer) -> Result { type PromptDeserializer (line 466) | enum PromptDeserializer { type CompletionRequest (line 492) | pub struct CompletionRequest { type Completion (line 547) | enum Completion { type CompletionFinal (line 555) | pub(crate) struct CompletionFinal { type CompletionComplete (line 567) | pub(crate) struct CompletionComplete { type Chunk (line 575) | pub(crate) struct Chunk { type ChatCompletion (line 585) | pub(crate) struct ChatCompletion { method new (line 699) | pub(crate) fn new( type ChatCompletionComplete (line 598) | pub(crate) struct ChatCompletionComplete { type ChatCompletionLogprobs (line 607) | pub(crate) struct ChatCompletionLogprobs { method from (line 612) | fn from(value: (Token, Vec)) -> Self { method from (line 632) | fn from(value: (Vec, Vec>)) -> Self { type ChatCompletionLogprob (line 666) | pub(crate) struct ChatCompletionLogprob { type ChatCompletionTopLogprob (line 674) | pub(crate) struct ChatCompletionTopLogprob { type Usage (line 681) | pub(crate) struct Usage { type CompletionType (line 690) | enum CompletionType { type ChatCompletionChunk (line 758) | pub(crate) struct ChatCompletionChunk { method new (line 809) | pub(crate) fn new( type ChatCompletionChoice (line 771) | pub(crate) struct ChatCompletionChoice { type ToolCallDelta (line 779) | pub struct ToolCallDelta { type ChatCompletionDelta (line 788) | enum ChatCompletionDelta { type DeltaToolCall (line 794) | pub(crate) struct DeltaToolCall { type Function (line 802) | pub(crate) struct Function { type ChatRequest (line 829) | pub(crate) struct ChatRequest { method try_into_generate (line 941) | fn try_into_generate(self, infer: &Infer) -> Result<(GenerateRequest, ... method next_int_id (line 1038) | fn next_int_id(&self) -> Result> { method next_tool_call_id (line 1054) | fn next_tool_call_id(&self) -> String { type StreamOptions (line 1064) | struct StreamOptions { function default_tool_prompt (line 1071) | pub fn default_tool_prompt() -> String { type TypedChoice (line 1077) | pub enum TypedChoice { type FunctionName (line 1083) | pub struct FunctionName { type ToolChoice (line 1091) | pub enum ToolChoice { method from (line 1127) | fn from(value: ToolTypeDeserializer) -> Self { type ToolTypeDeserializer (line 1113) | enum ToolTypeDeserializer { type JsonSchemaTool (line 1144) | pub struct JsonSchemaTool { type FunctionsMap (line 1151) | struct FunctionsMap { type FunctionRef (line 1157) | struct FunctionRef { type Properties (line 1163) | struct Properties { function serialize_function (line 1168) | fn serialize_function(functions: &Vec, serializer: S) ->... type FunctionDefinition (line 1179) | pub struct FunctionDefinition { function serialize_as_string (line 1187) | fn serialize_as_string(value: &serde_json::Value, serializer: S) -> R... type Tool (line 1196) | pub(crate) struct Tool { type ChatTemplateInputs (line 1205) | pub(crate) struct ChatTemplateInputs<'a> { type ToolCall (line 1214) | pub struct ToolCall { type Url (line 1221) | pub struct Url { type MessageChunk (line 1228) | pub enum MessageChunk { type Message (line 1234) | pub struct Message { type MessageBody (line 1247) | pub enum MessageBody { type MessageContent (line 1262) | pub enum MessageContent { method push (line 1269) | pub fn push(&mut self, chunk: MessageChunk) { type TextMessage (line 1285) | pub struct TextMessage { method from (line 1295) | fn from(value: Message) -> Self { type ToolCallMessage (line 1322) | pub struct ToolCallMessage { type OutputMessage (line 1330) | pub(crate) enum OutputMessage { type GenerateRequest (line 1337) | pub(crate) struct GenerateRequest { method from (line 1366) | fn from(req: CompatGenerateRequest) -> Self { function default_true (line 1350) | fn default_true() -> bool { type CompatGenerateRequest (line 1355) | pub(crate) struct CompatGenerateRequest { type PrefillToken (line 1376) | pub struct PrefillToken { type Token (line 1386) | pub struct Token { type SimpleToken (line 1398) | pub struct SimpleToken { type FinishReason (line 1412) | pub enum FinishReason { method fmt (line 1423) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { method format (line 1433) | pub fn format(&self, use_stop: bool) -> String { type BestOfSequence (line 1442) | pub(crate) struct BestOfSequence { type Details (line 1458) | pub(crate) struct Details { type GenerateResponse (line 1474) | pub(crate) struct GenerateResponse { type ChatTokenizeResponse (line 1482) | pub(crate) struct ChatTokenizeResponse { type TokenizeResponse (line 1489) | pub(crate) struct TokenizeResponse(Vec); type StreamDetails (line 1492) | pub(crate) struct StreamDetails { type StreamResponse (line 1504) | pub(crate) struct StreamResponse { type ErrorResponse (line 1516) | pub(crate) struct ErrorResponse { type ModelInfo (line 1522) | pub(crate) struct ModelInfo { type ModelsInfo (line 1534) | pub(crate) struct ModelsInfo { method default (line 1541) | fn default() -> Self { function get_tokenizer (line 1554) | pub(crate) fn get_tokenizer() -> Tokenizer { function test_hub_nested_tokens_tokenizer_config (line 1562) | fn test_hub_nested_tokens_tokenizer_config() { function test_chat_simple_string (line 1635) | fn test_chat_simple_string() { function test_message_content_append (line 1658) | fn test_message_content_append() { function test_chat_request (line 1687) | fn test_chat_request() { function text_message_convert (line 1717) | fn text_message_convert() { function test_chat_stream_options (line 1733) | fn test_chat_stream_options() { function openai_output (line 1769) | fn openai_output() { function tool_choice_formats (line 1803) | fn tool_choice_formats() { FILE: router/src/logging.rs type TraceParent (line 14) | struct TraceParent { function parse_traceparent (line 22) | fn parse_traceparent(header_value: &str) -> Option { function trace_context_middleware (line 45) | pub async fn trace_context_middleware(mut request: Request, next: Next) ... function init_logging (line 72) | pub fn init_logging(otlp_endpoint: Option, otlp_service_name: St... FILE: router/src/sagemaker.rs type SagemakerRequest (line 17) | pub(crate) enum SagemakerRequest { type SagemakerResponse (line 27) | pub(crate) enum SagemakerResponse { type SagemakerStreamResponse (line 37) | pub(crate) enum SagemakerStreamResponse { function sagemaker_compatibility (line 66) | pub(crate) async fn sagemaker_compatibility( FILE: router/src/server.rs function encoding_to_tokens (line 71) | fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> V... function compat_generate (line 126) | pub(crate) async fn compat_generate( function get_model_info (line 161) | async fn get_model_info(info: Extension) -> Json { function openai_get_model_info (line 176) | async fn openai_get_model_info(info: Extension) -> Json { function get_chat_tokenize (line 199) | async fn get_chat_tokenize( function health (line 230) | async fn health(infer: Extension) -> Result<(), (StatusCode, Json... function generate (line 273) | async fn generate( function generate_internal (line 289) | pub(crate) async fn generate_internal( function generate_stream (line 476) | async fn generate_stream( function generate_stream_internal (line 508) | async fn generate_stream_internal( function completions (line 715) | pub(crate) async fn completions( function chat_completions (line 1168) | pub(crate) async fn chat_completions( function tokenize (line 1317) | async fn tokenize( function metrics (line 1334) | async fn metrics(prom_handle: Extension) -> String { type ComputeType (line 1339) | pub(crate) struct ComputeType(String); type ApiDoc (line 1429) | pub struct ApiDoc; function schema (line 1431) | pub fn schema() -> ApiDoc { function py_resolve_tokenizer (line 1435) | pub fn py_resolve_tokenizer( function legacy_tokenizer_handle (line 1461) | pub fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Opt... function run (line 1502) | pub async fn run( function start (line 1870) | async fn start( function get_hub_model_info (line 2338) | pub async fn get_hub_model_info(api: &ApiRepo) -> Option { function get_tokenizer_config (line 2357) | pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option Self { method from (line 2429) | fn from(err: InferError) -> Self { type WebServerError (line 2440) | pub enum WebServerError { FILE: router/src/usage_stats.rs constant TELEMETRY_URL (line 15) | const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi"; type UsageStatsLevel (line 18) | pub enum UsageStatsLevel { type UserAgent (line 25) | pub struct UserAgent { method new (line 32) | pub fn new(reduced_args: Args) -> Self { type EventType (line 42) | pub enum EventType { type UsageStatsEvent (line 50) | pub struct UsageStatsEvent { method new (line 58) | pub fn new(user_agent: UserAgent, event_type: EventType, error_reason:... method send (line 65) | pub async fn send(&self) { type Args (line 81) | pub struct Args { method new (line 106) | pub fn new( type Env (line 155) | pub struct Env { method new (line 388) | pub fn new() -> Self { method is_hpu_device (line 398) | pub fn is_hpu_device(&self) -> bool { type NvidiaSmiInfo (line 165) | struct NvidiaSmiInfo { method new (line 188) | fn new() -> Option> { type XpuSmiInfo (line 239) | struct XpuSmiInfo { method new (line 249) | fn new() -> Option> { type HpuSmiInfo (line 294) | struct HpuSmiInfo { method new (line 307) | fn new() -> Option> { type SystemInfo (line 348) | pub struct SystemInfo { method new (line 357) | fn new() -> Self { method default (line 382) | fn default() -> Self { function is_container (line 403) | pub fn is_container() -> io::Result { FILE: router/src/validation.rs type Validation (line 29) | pub struct Validation { method new (line 43) | pub(crate) fn new( method tokenize (line 105) | pub async fn tokenize( method validate_input (line 133) | async fn validate_input( method validate (line 204) | pub(crate) async fn validate( method validate_best_of (line 451) | pub(crate) fn validate_best_of(&self, best_of: usize) -> Result Option { function format_to_mimetype (line 558) | fn format_to_mimetype(format: ImageFormat) -> String { function fetch_image (line 570) | fn fetch_image( function image_tokens (line 639) | fn image_tokens( function image_tokens_fixup (line 802) | fn image_tokens_fixup(config: &Config, text: String) -> String { function prepare_input (line 813) | fn prepare_input( type TokenizerRequest (line 865) | type TokenizerRequest = ( type Image (line 872) | pub struct Image { type Chunk (line 878) | pub enum Chunk { type ChunksToString (line 885) | pub trait ChunksToString { method chunks_to_string (line 887) | fn chunks_to_string(&self) -> String; method chunks_to_string (line 891) | fn chunks_to_string(&self) -> String { type ValidGrammar (line 905) | pub enum ValidGrammar { type ValidParameters (line 911) | pub struct ValidParameters { type ValidStoppingParameters (line 935) | pub struct ValidStoppingParameters { type ValidGenerateRequest (line 948) | pub struct ValidGenerateRequest { type ValidationError (line 962) | pub enum ValidationError { function test_validation_max_new_tokens (line 1041) | async fn test_validation_max_new_tokens() { function test_validation_input_length (line 1077) | async fn test_validation_input_length() { function test_validation_best_of_sampling (line 1112) | async fn test_validation_best_of_sampling() { function test_validation_top_p (line 1153) | async fn test_validation_top_p() { function test_validation_top_n_tokens (line 1225) | async fn test_validation_top_n_tokens() { function test_prepare_input_chunks (line 1309) | async fn test_prepare_input_chunks() { function test_idefics2_correct_n_fake_tokens (line 1366) | async fn test_idefics2_correct_n_fake_tokens() { FILE: router/src/vertex.rs type GenerateVertexInstance (line 15) | pub(crate) struct GenerateVertexInstance { type VertexInstance (line 25) | pub(crate) enum VertexInstance { type VertexRequest (line 32) | pub(crate) struct VertexRequest { type VertexResponse (line 38) | pub(crate) struct VertexResponse { function vertex_compatibility (line 71) | pub(crate) async fn vertex_compatibility( function vertex_deserialization (line 159) | fn vertex_deserialization() { FILE: server/bounds-from-nix.py function is_optional (line 12) | def is_optional(info: Union[str, Dict[str, str]]) -> bool: FILE: server/exllama_kernels/exllama_kernels/exllama_ext.cpp function check_cuda (line 21) | void check_cuda(cudaError_t ret) function get_groupsize (line 68) | int get_groupsize(torch::Tensor w, torch::Tensor w_zeros) function set_tuning_params (line 80) | void set_tuning_params function cleanup (line 95) | void cleanup() function prepare_buffers (line 104) | void prepare_buffers function make_q4 (line 126) | uintptr_t make_q4 function q4_matmul (line 168) | void q4_matmul function column_remap (line 218) | void column_remap function PYBIND11_MODULE (line 246) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) FILE: server/exllama_kernels/exllama_kernels/tuning.h type ExLlamaTuning (line 6) | struct ExLlamaTuning FILE: server/exllamav2_kernels/exllamav2_kernels/ext.cpp function make_q_matrix (line 26) | uintptr_t make_q_matrix function gemm_half_q_half (line 100) | void gemm_half_q_half function PYBIND11_MODULE (line 135) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) FILE: server/tests/conftest.py function default_pb_parameters (line 10) | def default_pb_parameters(): function default_pb_stop_parameters (line 22) | def default_pb_stop_parameters(): FILE: server/tests/models/test_bloom.py function default_bloom (line 17) | def default_bloom(): function bloom_560m_tokenizer (line 29) | def bloom_560m_tokenizer(): function default_pb_request (line 34) | def default_pb_request(default_pb_parameters, default_pb_stop_parameters): function default_pb_batch (line 47) | def default_pb_batch(default_pb_request): function default_bloom_batch (line 52) | def default_bloom_batch(default_pb_batch, bloom_560m_tokenizer): function default_multi_requests_bloom_batch (line 59) | def default_multi_requests_bloom_batch(default_pb_request, bloom_560m_to... function test_batch_from_pb (line 72) | def test_batch_from_pb(default_pb_batch, default_bloom_batch): function test_batch_concatenate_no_prefill (line 102) | def test_batch_concatenate_no_prefill(default_bloom_batch): function test_causal_lm_batch_type (line 107) | def test_causal_lm_batch_type(default_bloom): function test_causal_lm_generate_token (line 111) | def test_causal_lm_generate_token(default_bloom, default_bloom_batch): function test_causal_lm_generate_token_completion (line 160) | def test_causal_lm_generate_token_completion(default_bloom, default_bloo... function test_causal_lm_generate_token_completion_multi (line 180) | def test_causal_lm_generate_token_completion_multi( function test_batch_concatenate (line 230) | def test_batch_concatenate( FILE: server/tests/models/test_causal_lm.py function default_causal_lm (line 12) | def default_causal_lm(): function gpt2_tokenizer (line 17) | def gpt2_tokenizer(): function default_pb_request (line 24) | def default_pb_request(default_pb_parameters, default_pb_stop_parameters): function default_pb_batch (line 37) | def default_pb_batch(default_pb_request): function default_causal_lm_batch (line 42) | def default_causal_lm_batch(default_pb_batch, gpt2_tokenizer): function default_multi_requests_causal_lm_batch (line 49) | def default_multi_requests_causal_lm_batch(default_pb_request, gpt2_toke... function test_batch_from_pb (line 62) | def test_batch_from_pb(default_pb_batch, default_causal_lm_batch): function test_batch_concatenate_no_prefill (line 92) | def test_batch_concatenate_no_prefill(default_causal_lm_batch): function test_causal_lm_batch_type (line 97) | def test_causal_lm_batch_type(default_causal_lm): function test_causal_lm_generate_token (line 101) | def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_b... function test_causal_lm_generate_token_completion (line 152) | def test_causal_lm_generate_token_completion( function test_causal_lm_generate_token_completion_multi (line 172) | def test_causal_lm_generate_token_completion_multi( function test_batch_concatenate (line 224) | def test_batch_concatenate( FILE: server/tests/models/test_model.py function get_test_model (line 9) | def get_test_model(): function test_decode_streaming_english_spaces (line 31) | def test_decode_streaming_english_spaces(): function test_decode_streaming_chinese_utf8 (line 52) | def test_decode_streaming_chinese_utf8(): FILE: server/tests/models/test_santacoder.py function default_santacoder (line 8) | def default_santacoder(): function default_pb_request (line 13) | def default_pb_request(default_pb_parameters, default_pb_stop_parameters): function default_pb_batch (line 26) | def default_pb_batch(default_pb_request): function default_fim_pb_request (line 31) | def default_fim_pb_request(default_pb_parameters, default_pb_stop_parame... function default_fim_pb_batch (line 50) | def default_fim_pb_batch(default_fim_pb_request): function test_santacoder_generate_token_completion (line 55) | def test_santacoder_generate_token_completion(default_santacoder, defaul... function test_fim_santacoder_generate_token_completion (line 81) | def test_fim_santacoder_generate_token_completion( FILE: server/tests/models/test_seq2seq_lm.py function mt0_small_tokenizer (line 13) | def mt0_small_tokenizer(): function default_seq2seq_lm (line 22) | def default_seq2seq_lm(): function default_pb_request (line 27) | def default_pb_request(default_pb_parameters, default_pb_stop_parameters): function default_pb_batch (line 40) | def default_pb_batch(default_pb_request): function default_seq2seq_lm_batch (line 45) | def default_seq2seq_lm_batch(default_pb_batch, mt0_small_tokenizer): function default_multi_requests_seq2seq_lm_batch (line 52) | def default_multi_requests_seq2seq_lm_batch(default_pb_request, mt0_smal... function test_batch_from_pb (line 65) | def test_batch_from_pb(default_pb_batch, default_seq2seq_lm_batch): function test_batch_concatenate_no_prefill (line 96) | def test_batch_concatenate_no_prefill(default_seq2seq_lm_batch): function test_seq2seq_lm_batch_type (line 101) | def test_seq2seq_lm_batch_type(default_seq2seq_lm): function test_seq2seq_lm_generate_token (line 105) | def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_l... function test_seq2seq_lm_generate_token_completion (line 172) | def test_seq2seq_lm_generate_token_completion( function test_seq2seq_lm_generate_token_completion_multi (line 189) | def test_seq2seq_lm_generate_token_completion_multi( function test_batch_concatenate (line 226) | def test_batch_concatenate( FILE: server/tests/utils/test_adapter.py function test_parse_lora_adapters_empty (line 11) | def test_parse_lora_adapters_empty(): function test_parse_lora_adapters_single (line 16) | def test_parse_lora_adapters_single(): function test_parse_lora_adapters_with_path (line 21) | def test_parse_lora_adapters_with_path(): function test_parse_lora_adapters_with_path_and_revision (line 28) | def test_parse_lora_adapters_with_path_and_revision(): function test_parse_lora_adapters_multiple (line 35) | def test_parse_lora_adapters_multiple(): function test_parse_lora_adapters_invalid_format (line 46) | def test_parse_lora_adapters_invalid_format(): function test_get_attn_weights (line 54) | def test_get_attn_weights(): function test_get_mlp_weights_with_gate_up_proj (line 86) | def test_get_mlp_weights_with_gate_up_proj(): function test_get_mlp_weights_without_gate_up_proj (line 106) | def test_get_mlp_weights_without_gate_up_proj(): function test_get_attn_weights_different_layers (line 119) | def test_get_attn_weights_different_layers(layer_index): function test_get_mlp_weights_different_layers (line 141) | def test_get_mlp_weights_different_layers(layer_index): function test_get_attn_weights_llama_compatibility (line 156) | def test_get_attn_weights_llama_compatibility(): function test_get_mlp_weights_llama_compatibility (line 185) | def test_get_mlp_weights_llama_compatibility(): function test_get_attn_weights_gemma_compatibility (line 202) | def test_get_attn_weights_gemma_compatibility(): function test_get_mlp_weights_gemma_compatibility (line 231) | def test_get_mlp_weights_gemma_compatibility(): FILE: server/tests/utils/test_convert.py function test_convert_files (line 10) | def test_convert_files(): FILE: server/tests/utils/test_hub.py function offline (line 20) | def offline(): function fresh_cache (line 28) | def fresh_cache(): function prefetched (line 41) | def prefetched(): function test_weight_hub_files_offline_error (line 53) | def test_weight_hub_files_offline_error(offline, fresh_cache): function test_weight_hub_files_offline_ok (line 59) | def test_weight_hub_files_offline_ok(prefetched, offline): function test_weight_hub_files (line 73) | def test_weight_hub_files(): function test_weight_hub_files_llm (line 78) | def test_weight_hub_files_llm(): function test_weight_hub_files_empty (line 83) | def test_weight_hub_files_empty(): function test_download_weights (line 88) | def test_download_weights(): function test_weight_files_revision_error (line 96) | def test_weight_files_revision_error(): function test_weight_files_not_cached_error (line 101) | def test_weight_files_not_cached_error(fresh_cache): FILE: server/tests/utils/test_layers.py class ProcessGroup (line 7) | class ProcessGroup: method __init__ (line 8) | def __init__(self, rank: int, world_size: int): method size (line 12) | def size(self) -> int: method rank (line 15) | def rank(self) -> int: class Weights (line 19) | class Weights: method __init__ (line 20) | def __init__(self, rank: int, world_size: int, vocab_size: int, hidden... method get_partial_sharded (line 26) | def get_partial_sharded(self, name: str, dim: int): method get_shape (line 38) | def get_shape(self, name: str): function test_weight_hub_files_offline_error (line 42) | def test_weight_hub_files_offline_error(): FILE: server/tests/utils/test_tokens.py function test_stop_sequence_criteria (line 10) | def test_stop_sequence_criteria(): function test_stop_sequence_criteria_escape (line 19) | def test_stop_sequence_criteria_escape(): function test_stopping_criteria (line 28) | def test_stopping_criteria(): function test_stopping_criteria_eos (line 34) | def test_stopping_criteria_eos(): function test_stopping_criteria_max (line 40) | def test_stopping_criteria_max(): function test_batch_top_tokens (line 49) | def test_batch_top_tokens(): FILE: server/tests/utils/test_watermark.py function test_seed_rng (line 13) | def test_seed_rng(): function test_get_greenlist_ids (line 20) | def test_get_greenlist_ids(): function test_calc_greenlist_mask (line 28) | def test_calc_greenlist_mask(): function test_bias_greenlist_logits (line 37) | def test_bias_greenlist_logits(): function test_call (line 49) | def test_call(): FILE: server/tests/utils/test_weights.py function gptq_weights_loader (line 20) | def gptq_weights_loader(): function gptq_weights_loader_awq (line 33) | def gptq_weights_loader_awq(): function marlin_weights_loader (line 46) | def marlin_weights_loader(): class MockSlice (line 293) | class MockSlice: method __init__ (line 294) | def __init__(self, tensor): method get_shape (line 297) | def get_shape(self): method __getitem__ (line 300) | def __getitem__(self, idx): function mock_get_slice (line 304) | def mock_get_slice(tensor_name, filename): function mock_handle (line 309) | def mock_handle(filename, device, dtype): class MockSafeOpen (line 315) | class MockSafeOpen: method __init__ (line 316) | def __init__(self, filename, framework, dummy_fs): method keys (line 321) | def keys(self): method __enter__ (line 324) | def __enter__(self): method __exit__ (line 327) | def __exit__(self, exc_type, exc_val, exc_tb): class MockWeights (line 331) | class MockWeights(Weights): method __init__ (line 332) | def __init__( method _get_handle (line 369) | def _get_handle(self, filename: Union[Path, str]): method get_shape (line 377) | def get_shape(self, tensor_name: str): method get_tensor (line 382) | def get_tensor(self, tensor_name: str): function test_weights (line 391) | def test_weights(): function test_get_tensor (line 406) | def test_get_tensor(): function test_get_weights_col_packed (line 439) | def test_get_weights_col_packed(): function test_get_weights_col_packed_block_size (line 473) | def test_get_weights_col_packed_block_size(): function test_get_weights_col_packed_block_size_arr (line 507) | def test_get_weights_col_packed_block_size_arr(): function test_get_multi_weights_col (line 541) | def test_get_multi_weights_col(): function test_get_weights_row (line 577) | def test_get_weights_row(): function test_get_weights_col_awq (line 606) | def test_get_weights_col_awq(gptq_weights_loader_awq): function test_get_weights_col_gtpq (line 648) | def test_get_weights_col_gtpq(gptq_weights_loader): function test_get_weights_col_exl2 (line 687) | def test_get_weights_col_exl2(): function test_get_weights_col_marlin (line 723) | def test_get_weights_col_marlin(marlin_weights_loader): function test_get_weights_col_packed_awq (line 753) | def test_get_weights_col_packed_awq(gptq_weights_loader_awq): function test_get_weights_col_packed_exl2 (line 795) | def test_get_weights_col_packed_exl2(): function test_get_weights_col_packed_gptq (line 833) | def test_get_weights_col_packed_gptq(gptq_weights_loader): function test_get_weights_col_packed_marlin (line 873) | def test_get_weights_col_packed_marlin(marlin_weights_loader): function test_get_multi_weights_col_awq (line 906) | def test_get_multi_weights_col_awq(gptq_weights_loader_awq): function test_get_multi_weights_col_exl2 (line 946) | def test_get_multi_weights_col_exl2(): function test_get_multi_weights_col_gptq (line 969) | def test_get_multi_weights_col_gptq(gptq_weights_loader): function test_get_multi_weights_col_marlin (line 1009) | def test_get_multi_weights_col_marlin(marlin_weights_loader): function test_get_weights_row_awq (line 1040) | def test_get_weights_row_awq(gptq_weights_loader_awq): function test_get_weights_row_exl2 (line 1079) | def test_get_weights_row_exl2(): function test_get_weights_row_gptq (line 1116) | def test_get_weights_row_gptq(gptq_weights_loader): function test_get_weights_row_marlin (line 1155) | def test_get_weights_row_marlin(marlin_weights_loader): FILE: server/text_generation_server/adapters/config.py class ModuleMap (line 15) | class ModuleMap: class AdapterConfig (line 21) | class AdapterConfig(ABC): method map_weights_for_model (line 25) | def map_weights_for_model( FILE: server/text_generation_server/adapters/lora.py function get_start_stop_idxs_for_rank (line 31) | def get_start_stop_idxs_for_rank(offset, size, rank, world_size): function shard_on_dim (line 38) | def shard_on_dim( function shard_lora_weights (line 57) | def shard_lora_weights( class LoraConfig (line 75) | class LoraConfig(AdapterConfig): method map_weights_for_model (line 82) | def map_weights_for_model( method load (line 104) | def load(cls, adapter_id: str, api_token: str) -> "LoraConfig": class LoraWeights (line 118) | class LoraWeights(AdapterWeights): method __init__ (line 121) | def __init__( method weights_a (line 155) | def weights_a(self) -> torch.Tensor: method weights_b (line 161) | def weights_b(self) -> torch.Tensor: method weights_a_t (line 167) | def weights_a_t(self) -> torch.Tensor: method weights_b_t (line 173) | def weights_b_t(self) -> torch.Tensor: method _transpose_weights (line 178) | def _transpose_weights(self): method get_batch_types (line 186) | def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]: method prepare_weights (line 206) | def prepare_weights( class RankSegments (line 287) | class RankSegments: class BatchLoraWeights (line 304) | class BatchLoraWeights(BatchAdapterWeights): method has_adapter (line 311) | def has_adapter(self, adapter_index: int) -> bool: method can_vectorize (line 314) | def can_vectorize(self, pg: ProcessGroup) -> bool: method load (line 321) | def load( class IPEXBatchLoraWeights (line 490) | class IPEXBatchLoraWeights(BatchLoraWeights): method load (line 492) | def load( function get_scaling_factor (line 598) | def get_scaling_factor( function _convert_lora (line 609) | def _convert_lora(v: AdapterWeights) -> AdapterWeights: FILE: server/text_generation_server/adapters/weights.py class AdapterBatchMetadata (line 14) | class AdapterBatchMetadata: class AdapterWeights (line 30) | class AdapterWeights(ABC): method get_batch_types (line 32) | def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]: method speculative_tokens (line 36) | def speculative_tokens(self) -> int: class BatchAdapterWeights (line 40) | class BatchAdapterWeights(ABC): method has_adapter (line 42) | def has_adapter(self, adapter_index: int) -> bool: method load (line 46) | def load( class LayerAdapterWeights (line 56) | class LayerAdapterWeights: method __init__ (line 59) | def __init__(self): method add_adapter (line 62) | def add_adapter(self, adapter_idx: int, weights: AdapterWeights): method remove_adapter (line 65) | def remove_adapter(self, adapter_idx: int): method is_empty (line 70) | def is_empty(self) -> bool: method get_data (line 73) | def get_data( class AdapterBatchData (line 98) | class AdapterBatchData: method from_meta (line 107) | def from_meta( method ranks (line 122) | def ranks(self) -> Set[int]: method layer_names (line 134) | def layer_names(self) -> Set[str]: method adapter_keys (line 137) | def adapter_keys(self) -> Set[str]: method max_rank (line 144) | def max_rank(self) -> int: FILE: server/text_generation_server/cache.py class Cache (line 10) | class Cache: method __init__ (line 11) | def __init__(self): method pop (line 14) | def pop(self, batch_id: int) -> Optional[B]: method set (line 17) | def set(self, entry: B): method delete (line 21) | def delete(self, batch_id: int): method clear (line 28) | def clear(self): method __len__ (line 33) | def __len__(self): FILE: server/text_generation_server/cli.py class Quantization (line 18) | class Quantization(str, Enum): class Dtype (line 31) | class Dtype(str, Enum): class KVCacheDtype (line 36) | class KVCacheDtype(str, Enum): function serve (line 42) | def serve( function download_weights (line 135) | def download_weights( function quantize (line 339) | def quantize( FILE: server/text_generation_server/interceptor.py class ExceptionInterceptor (line 11) | class ExceptionInterceptor(AsyncServerInterceptor): method __init__ (line 12) | def __init__(self, shutdown_callback): method intercept (line 15) | async def intercept( FILE: server/text_generation_server/layers/attention/common.py class Seqlen (line 7) | class Seqlen: method __init__ (line 15) | def __init__( method clamp (line 50) | def clamp(self, max): FILE: server/text_generation_server/layers/attention/cuda.py function paged_attention (line 30) | def paged_attention( function attention (line 228) | def attention( FILE: server/text_generation_server/layers/attention/flash_attn_triton.py function cdiv_fn (line 31) | def cdiv_fn(x, y): function max_fn (line 36) | def max_fn(x, y): function dropout_offsets (line 41) | def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): function dropout_rng (line 48) | def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): function dropout_mask (line 57) | def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): function load_fn (line 64) | def load_fn(block_ptr, first, second, pad): function _attn_fwd_inner (line 77) | def _attn_fwd_inner( function attn_fwd (line 311) | def attn_fwd( function check_args (line 668) | def check_args( class _attention (line 701) | class _attention(torch.autograd.Function): method forward (line 703) | def forward( FILE: server/text_generation_server/layers/attention/flashinfer.py function unpad_2d_mask (line 24) | def unpad_2d_mask( function get_workspace (line 38) | def get_workspace(device): function create_prefill_with_paged_kv_state (line 46) | def create_prefill_with_paged_kv_state( function use_prefill_with_paged_kv_state (line 58) | def use_prefill_with_paged_kv_state( function create_prefill_state (line 130) | def create_prefill_state( function create_decode_state (line 141) | def create_decode_state( function create_decode_state_cuda_graphs (line 159) | def create_decode_state_cuda_graphs( function use_decode_state (line 188) | def use_decode_state( FILE: server/text_generation_server/layers/attention/ipex.py function attention (line 17) | def attention( function paged_attention (line 89) | def paged_attention( FILE: server/text_generation_server/layers/attention/kv_cache.py class KVScales (line 28) | class KVScales: method __post_init__ (line 43) | def __post_init__(self): class KVCache (line 51) | class KVCache: method __init__ (line 58) | def __init__( method can_scale (line 135) | def can_scale(self, kv_scales: KVScales) -> bool: method dtype (line 155) | def dtype(self): method key (line 160) | def key(self): method value (line 166) | def value(self): method store (line 171) | def store( function paged_reshape_and_cache (line 243) | def paged_reshape_and_cache( function get_kv_scales (line 310) | def get_kv_scales(weights: Weights, prefix: str) -> KVScales: FILE: server/text_generation_server/layers/attention/rocm.py function _use_rocm_custom_paged_attention (line 32) | def _use_rocm_custom_paged_attention( function paged_attention (line 51) | def paged_attention( function attention (line 273) | def attention( FILE: server/text_generation_server/layers/awq/conversion_utils.py function pack (line 9) | def pack(imatrix: torch.Tensor, direction: str = "column"): function unpack (line 35) | def unpack(qmatrix: torch.Tensor, direction: str = "column"): function apply_order (line 61) | def apply_order( function fast_awq_to_gptq (line 83) | def fast_awq_to_gptq(qweight, qzeros): FILE: server/text_generation_server/layers/awq/quantize/cuda.py class WQLinear (line 19) | class WQLinear(nn.Module): method __init__ (line 20) | def __init__( method forward (line 43) | def forward(self, x): FILE: server/text_generation_server/layers/awq/quantize/ipex.py class WQLinear (line 7) | class WQLinear(nn.Module): method __init__ (line 8) | def __init__( method forward (line 44) | def forward(self, x): FILE: server/text_generation_server/layers/bnb.py class BNBWeight (line 10) | class BNBWeight(UnquantizedWeight): method get_linear (line 13) | def get_linear(self, bias: torch.Tensor): class Linear8bitLt (line 17) | class Linear8bitLt(torch.nn.Module): method __init__ (line 18) | def __init__( method init_8bit_state (line 49) | def init_8bit_state(self): method forward (line 55) | def forward(self, x: torch.Tensor): class BNBFP4Weight (line 76) | class BNBFP4Weight(UnquantizedWeight): method get_linear (line 79) | def get_linear(self, bias: torch.Tensor): class BNBNF4Weight (line 84) | class BNBNF4Weight(UnquantizedWeight): method get_linear (line 87) | def get_linear(self, bias: torch.Tensor): class Linear4bit (line 91) | class Linear4bit(torch.nn.Module): method __init__ (line 92) | def __init__(self, weight, bias, quant_type): method forward (line 104) | def forward(self, x: torch.Tensor): FILE: server/text_generation_server/layers/compressed_tensors/loader.py class CompressedTensorsLoader (line 34) | class CompressedTensorsLoader(WeightsLoader): method __init__ (line 37) | def __init__(self, config: Dict[str, Any]): method get_weights (line 74) | def get_weights(self, weights: Weights, prefix: str): method get_weights_col_packed (line 78) | def get_weights_col_packed( method get_multi_weights_col (line 87) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_weights_row (line 91) | def get_weights_row(self, weights: Weights, prefix: str): method _get_target_loaders (line 95) | def _get_target_loaders( method _create_loader_for_group (line 126) | def _create_loader_for_group( method _lookup_loader (line 181) | def _lookup_loader(self, prefix: str) -> WeightsLoader: FILE: server/text_generation_server/layers/compressed_tensors/w8a8_int.py class W8A8IntLoader (line 22) | class W8A8IntLoader(WeightsLoader): method __init__ (line 27) | def __init__( method __str__ (line 54) | def __str__(self) -> str: method get_weights (line 63) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 78) | def get_weights_col_packed( method get_multi_weights_col (line 106) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_weights_row (line 128) | def get_weights_row(self, weights: "Weights", prefix: str): function _get_tensor_or_else (line 147) | def _get_tensor_or_else( class Int8Weight (line 159) | class Int8Weight(Weight): method get_linear (line 164) | def get_linear(self, bias: torch.Tensor): class W8A8IntLinear (line 183) | class W8A8IntLinear(torch.nn.Module): method __init__ (line 184) | def __init__( method forward (line 210) | def forward(self, input: torch.Tensor) -> torch.Tensor: FILE: server/text_generation_server/layers/compressed_tensors/w8an_fp.py class W8ANFpLoader (line 16) | class W8ANFpLoader(WeightsLoader): method __init__ (line 21) | def __init__( method __str__ (line 43) | def __str__(self) -> str: method get_weights (line 51) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 75) | def get_weights_col_packed( method get_multi_weights_col (line 118) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_weights_row (line 169) | def get_weights_row(self, weights: "Weights", prefix: str): FILE: server/text_generation_server/layers/compressed_tensors/wna16_int.py class WNA16IntLoader (line 12) | class WNA16IntLoader(WeightsLoader): method __init__ (line 17) | def __init__(self, weights: QuantizationArgs): method __str__ (line 24) | def __str__(self) -> str: method get_weights (line 29) | def get_weights(self, weights: Weights, prefix: str): method get_weights_col_packed (line 61) | def get_weights_col_packed( method get_multi_weights_col (line 103) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_weights_row (line 148) | def get_weights_row(self, weights: Weights, prefix: str): FILE: server/text_generation_server/layers/compressed_tensors/wna16_int_24.py class WNA16Int24Loader (line 11) | class WNA16Int24Loader(WeightsLoader): method __init__ (line 16) | def __init__(self, weight_args: QuantizationArgs): method __str__ (line 30) | def __str__(self) -> str: method get_weights (line 35) | def get_weights(self, weights: Weights, prefix: str): method get_weights_col_packed (line 49) | def get_weights_col_packed( method get_multi_weights_col (line 71) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_weights_row (line 88) | def get_weights_row(self, weights: Weights, prefix: str): FILE: server/text_generation_server/layers/conv.py function load_conv2d (line 6) | def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_... function load_conv2d_no_bias (line 23) | def load_conv2d_no_bias( FILE: server/text_generation_server/layers/eetq.py class EETQWeight (line 13) | class EETQWeight(UnquantizedWeight): method get_linear (line 16) | def get_linear(self, bias: torch.Tensor): class EETQLinear (line 27) | class EETQLinear(torch.nn.Module): method __init__ (line 28) | def __init__( method forward (line 44) | def forward(self, input: torch.Tensor) -> torch.Tensor: FILE: server/text_generation_server/layers/exl2.py class Exl2Weight (line 9) | class Exl2Weight(Weight): method __post_init__ (line 20) | def __post_init__(self): method device (line 25) | def device(self) -> torch.device: method get_linear (line 28) | def get_linear(self, bias: torch.Tensor): class Exl2WeightsLoader (line 34) | class Exl2WeightsLoader(WeightsLoader): method get_weights (line 37) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 61) | def get_weights_col_packed( method get_weights_col (line 69) | def get_weights_col(self, weights: Weights, prefix: str): method get_multi_weights_col (line 73) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_weights_row (line 76) | def get_weights_row(self, weights: Weights, prefix: str): FILE: server/text_generation_server/layers/fp8.py function get_fp8_linear (line 44) | def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]: function normalize_e4m3fn_to_native_float8 (line 79) | def normalize_e4m3fn_to_native_float8( function per_tensor_dequantize (line 103) | def per_tensor_dequantize( function requantize_with_max_scale (line 113) | def requantize_with_max_scale( function fp8_quantize (line 136) | def fp8_quantize( class HybridFP8UnquantLoader (line 187) | class HybridFP8UnquantLoader(WeightsLoader): method __init__ (line 190) | def __init__( method get_weights (line 200) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 239) | def get_weights_col_packed( method get_multi_weights_col (line 289) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_weights_row (line 357) | def get_weights_row(self, weights: "Weights", prefix: str): class Fp8Weight (line 400) | class Fp8Weight(Weight): method get_linear (line 409) | def get_linear(self, bias: torch.Tensor): class Fp8Linear (line 428) | class Fp8Linear(torch.nn.Module): method __init__ (line 431) | def __init__( method from_unquant (line 465) | def from_unquant(cls, weight, bias, dtype): method from_fp8 (line 477) | def from_fp8( method get_shared_device_identity (line 500) | def get_shared_device_identity(cls, device): method forward (line 507) | def forward(self, input: torch.Tensor) -> torch.Tensor: function _load_scalar_or_matrix_scale (line 582) | def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: t... FILE: server/text_generation_server/layers/gptq/__init__.py class GPTQWeight (line 19) | class GPTQWeight(Weight): method __post_init__ (line 29) | def __post_init__(self): method device (line 34) | def device(self) -> torch.device: method get_linear (line 37) | def get_linear(self, bias: torch.Tensor): class GPTQWeightsLoader (line 93) | class GPTQWeightsLoader(WeightsLoader): method __init__ (line 98) | def __init__( method get_weights (line 117) | def get_weights(self, weights: Weights, prefix: str): method is_layer_skipped_quantization (line 197) | def is_layer_skipped_quantization( method get_weights_col_packed (line 202) | def get_weights_col_packed( method get_multi_weights_col (line 262) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_weights_row (line 331) | def get_weights_row(self, weights: Weights, prefix: str): method _get_gptq_params (line 433) | def _get_gptq_params(self, weights: Weights): FILE: server/text_generation_server/layers/gptq/custom_autotune.py class Autotuner (line 14) | class Autotuner(triton.KernelInterface): method __init__ (line 15) | def __init__( method _bench (line 64) | def _bench(self, *args, config, **meta): method run (line 96) | def run(self, *args, **kwargs): method prune_configs (line 133) | def prune_configs(self, kwargs): method warmup (line 157) | def warmup(self, *args, **kwargs): function autotune (line 170) | def autotune( function matmul248_kernel_config_pruner (line 217) | def matmul248_kernel_config_pruner(configs, nargs): FILE: server/text_generation_server/layers/gptq/exllama.py function ext_make_q4 (line 9) | def ext_make_q4(qweight, qzeros, scales, g_idx, device): function ext_q4_matmul (line 16) | def ext_q4_matmul(x, q4, q4_width): function set_device (line 36) | def set_device(device): function create_exllama_buffers (line 41) | def create_exllama_buffers(max_total_tokens: int): class Ex4bitLinear (line 66) | class Ex4bitLinear(torch.nn.Module): method __init__ (line 69) | def __init__(self, weight: GPTQWeight, bias): method forward (line 129) | def forward(self, x): FILE: server/text_generation_server/layers/gptq/exllamav2.py class _ExtraTensors (line 28) | class _ExtraTensors: function ext_gemm_half_q_half (line 36) | def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda): function make_group_map (line 45) | def make_group_map(q_groups: torch.Tensor, num_qrows: int): function ext_make_q_matrix (line 67) | def ext_make_q_matrix( function set_device (line 154) | def set_device(device): function create_exllama_buffers (line 159) | def create_exllama_buffers(max_total_tokens: int): class QuantLinear (line 178) | class QuantLinear(nn.Module): method __init__ (line 183) | def __init__( method post_init (line 215) | def post_init(self, temp_dq): method forward (line 226) | def forward(self, x, force_cuda=False): method temp_dq_size (line 233) | def temp_dq_size(self): method temp_fwd_size (line 236) | def temp_fwd_size(self, max_input_len, max_batch_size): method scratch_space_fixed (line 239) | def scratch_space_fixed(self, max_input_len, max_batch_size): class ExLlamaV2DeviceTensors (line 243) | class ExLlamaV2DeviceTensors: method __init__ (line 250) | def __init__(self, device, scratch_bytes): method prepare (line 254) | def prepare(self): method get_scratch_slice (line 259) | def get_scratch_slice(self, size_bytes): FILE: server/text_generation_server/layers/gptq/ipex.py class QuantLinear (line 9) | class QuantLinear(nn.Module): method __init__ (line 10) | def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsi... method new (line 44) | def new(cls, bits, groupsize, infeatures, outfeatures, bias): method pack (line 65) | def pack(self, linear, scales, zeros, g_idx=None): method forward (line 122) | def forward(self, x): FILE: server/text_generation_server/layers/gptq/quantize.py class Quantizer (line 25) | class Quantizer(nn.Module): method __init__ (line 26) | def __init__(self, shape=1): method configure (line 32) | def configure( method _quantize (line 54) | def _quantize(self, x, scale, zero, maxq): method find_params (line 60) | def find_params(self, x, weight=False): method quantize (line 145) | def quantize(self, x): method enabled (line 151) | def enabled(self): method ready (line 154) | def ready(self): class GPTQ (line 158) | class GPTQ: method __init__ (line 159) | def __init__(self, layer, observe=False): method add_batch (line 174) | def add_batch(self, inp, out): method print_loss (line 209) | def print_loss(self, name, q_weight, weight_error, timecost): method fasterquant (line 243) | def fasterquant( method free (line 357) | def free(self): function get_wikitext2 (line 366) | def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code): function get_ptb (line 398) | def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code): function get_c4 (line 430) | def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code): function get_ptb_new (line 498) | def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code): function get_c4_new (line 530) | def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code): function get_loaders (line 584) | def get_loaders( function find_layers (line 599) | def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""): function sequential (line 615) | def sequential( function make_quant_linear (line 754) | def make_quant_linear(module, names, bits, groupsize, name=""): function pack (line 780) | def pack(model, quantizers, bits, groupsize): function setdeepattr (line 794) | def setdeepattr(module, full_name, tensor): function getdeepattr (line 802) | def getdeepattr(module, full_name): function load_weights_pre_hook (line 810) | def load_weights_pre_hook(module_name, weights, recursive=False): function load_weights_post_hook (line 842) | def load_weights_post_hook(module_name, weights, recursive=False): function quantize (line 867) | def quantize( FILE: server/text_generation_server/layers/gptq/triton.py function matmul_248_kernel (line 105) | def matmul_248_kernel( function matmul248 (line 204) | def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq): class QuantLinearFunction (line 244) | class QuantLinearFunction(torch.autograd.Function): method forward (line 247) | def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): class QuantLinear (line 252) | class QuantLinear(nn.Module): method __init__ (line 253) | def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsi... method new (line 273) | def new(cls, bits, groupsize, infeatures, outfeatures, bias): method pack (line 294) | def pack(self, linear, scales, zeros, g_idx=None): method forward (line 351) | def forward(self, x): FILE: server/text_generation_server/layers/gptq/utils.py function torch_snr_error (line 5) | def torch_snr_error( FILE: server/text_generation_server/layers/layernorm.py function load_layer_norm (line 11) | def load_layer_norm(cls, prefix, weights, eps): function load_layer_norm_no_bias (line 23) | def load_layer_norm_no_bias(cls, prefix, weights, eps): class FastLayerNorm (line 39) | class FastLayerNorm(nn.LayerNorm): method forward (line 40) | def forward(self, hidden_states, residual=None): method forward (line 78) | def forward(self, hidden_states, residual=None): method forward (line 89) | def forward(self, hidden_states, residual=None): class FastLayerNorm (line 77) | class FastLayerNorm(nn.LayerNorm): method forward (line 40) | def forward(self, hidden_states, residual=None): method forward (line 78) | def forward(self, hidden_states, residual=None): method forward (line 89) | def forward(self, hidden_states, residual=None): class FastLayerNorm (line 88) | class FastLayerNorm(nn.LayerNorm): method forward (line 40) | def forward(self, hidden_states, residual=None): method forward (line 78) | def forward(self, hidden_states, residual=None): method forward (line 89) | def forward(self, hidden_states, residual=None): class FastRMSNorm (line 101) | class FastRMSNorm(nn.Module): method __init__ (line 102) | def __init__(self, weight: torch.Tensor, eps: float): method load (line 109) | def load(cls, prefix, weights, eps=1e-6): method forward (line 113) | def forward(self, hidden_states, residual=None): FILE: server/text_generation_server/layers/linear.py class FastLinear (line 21) | class FastLinear(torch.nn.Module): method __init__ (line 22) | def __init__( method load (line 35) | def load(cls, config, prefix: str, weights, bias: bool): method forward (line 43) | def forward(self, input: torch.Tensor) -> torch.Tensor: class FastLinearROCm (line 47) | class FastLinearROCm(torch.nn.Module): method __init__ (line 48) | def __init__( method load (line 69) | def load(cls, config, prefix: str, weights, bias: bool): method forward (line 77) | def forward(self, inp: torch.Tensor) -> torch.Tensor: function get_linear (line 116) | def get_linear(weight, bias): FILE: server/text_generation_server/layers/lora.py class LoraLinear (line 35) | class LoraLinear(nn.Module): method __init__ (line 36) | def __init__( method forward_layer_type (line 44) | def forward_layer_type( method forward_lora (line 212) | def forward_lora( method collect_lora_a (line 231) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: class TensorParallelMultiAdapterLinear (line 235) | class TensorParallelMultiAdapterLinear(LoraLinear): method __init__ (line 236) | def __init__( method load (line 249) | def load( method forward (line 261) | def forward( method collect_lora_a (line 304) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: class TensorParallelAdapterRowLinear (line 319) | class TensorParallelAdapterRowLinear(LoraLinear): method __init__ (line 320) | def __init__(self, base_layer, layer_id, layer_name, process_group): method load (line 325) | def load(cls, base_layer, layer_id, layer_name, process_group): method forward (line 328) | def forward( method collect_lora_a (line 347) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: FILE: server/text_generation_server/layers/marlin/fp8.py class GPTQMarlinFP8Linear (line 25) | class GPTQMarlinFP8Linear(nn.Module): method __init__ (line 30) | def __init__( method from_unquant (line 60) | def from_unquant(cls, weight, bias, dtype): method from_fp8 (line 65) | def from_fp8( method forward (line 75) | def forward(self, A: torch.Tensor) -> torch.Tensor: function pack_fp8_as_int32 (line 97) | def pack_fp8_as_int32(fp8_tensor: torch.Tensor) -> torch.Tensor: function repack_fp8_for_marlin (line 128) | def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor): FILE: server/text_generation_server/layers/marlin/gptq.py function can_use_gptq_marlin (line 39) | def can_use_gptq_marlin( class GPTQMarlinWeightsLoader (line 55) | class GPTQMarlinWeightsLoader(WeightsLoader): method __init__ (line 60) | def __init__( method get_weights (line 77) | def get_weights(self, weights: Weights, prefix: str): method get_weights_col_packed (line 110) | def get_weights_col_packed( method get_multi_weights_col (line 153) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_weights_row (line 195) | def get_weights_row(self, weights: Weights, prefix: str): method _get_gptq_params (line 237) | def _get_gptq_params(self, weights: Weights): class GPTQMarlinWeight (line 253) | class GPTQMarlinWeight(Weight): method __post_init__ (line 266) | def __post_init__(self): method get_linear (line 272) | def get_linear(self, bias: torch.Tensor): function repack_gptq_for_marlin (line 279) | def repack_gptq_for_marlin( class GPTQMarlinLinear (line 371) | class GPTQMarlinLinear(nn.Module): method __init__ (line 377) | def __init__( method forward (line 422) | def forward(self, A: torch.Tensor) -> torch.Tensor: function awq_to_marlin_zero_points (line 450) | def awq_to_marlin_zero_points( function _check_valid_shape (line 474) | def _check_valid_shape(in_features: int, out_features: int): FILE: server/text_generation_server/layers/marlin/marlin.py class MarlinWeightsLoader (line 20) | class MarlinWeightsLoader(WeightsLoader): method __init__ (line 23) | def __init__(self, *, bits: int, is_marlin_24: bool): method get_weights (line 27) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 58) | def get_weights_col_packed( method get_multi_weights_col (line 89) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],... method get_weights_row (line 128) | def get_weights_row(self, weights: Weights, prefix: str): class MarlinWeight (line 170) | class MarlinWeight(Weight): method __post_init__ (line 182) | def __post_init__(self): method get_linear (line 186) | def get_linear(self, bias: torch.Tensor): class MarlinLinear (line 190) | class MarlinLinear(nn.Module): method __init__ (line 191) | def __init__(self, *, weight: MarlinWeight, bias: Optional[torch.Tenso... method forward (line 223) | def forward(self, A: torch.Tensor) -> torch.Tensor: class GPTQMarlin24Weight (line 252) | class GPTQMarlin24Weight: method __post_init__ (line 268) | def __post_init__(self): method get_linear (line 273) | def get_linear(self, bias: torch.Tensor): class GPTQMarlin24Linear (line 280) | class GPTQMarlin24Linear(nn.Module): method __init__ (line 281) | def __init__(self, *, weight: GPTQMarlin24Weight, bias: Optional[torch... method forward (line 346) | def forward(self, A: torch.Tensor) -> torch.Tensor: FILE: server/text_generation_server/layers/marlin/util.py function _check_marlin_kernels (line 23) | def _check_marlin_kernels(): function get_perms (line 37) | def get_perms() -> Tuple[List[int], List[int]]: function permute_scales (line 47) | def permute_scales(scales: torch.Tensor): function get_pack_factor (line 60) | def get_pack_factor(bits: int) -> int: function pack_cols (line 66) | def pack_cols( function unpack_cols (line 92) | def unpack_cols( function marlin_zero_points (line 124) | def marlin_zero_points( FILE: server/text_generation_server/layers/medusa.py class ResBlock (line 12) | class ResBlock(torch.nn.Module): method __init__ (line 13) | def __init__(self, config, prefix, weights): method forward (line 20) | def forward(self, x): class MedusaModel (line 24) | class MedusaModel(torch.nn.Module): method __init__ (line 25) | def __init__(self, config, medusa_config, weights): method forward (line 34) | def forward(self, x): class MedusaHead (line 41) | class MedusaHead(torch.nn.Module): method __init__ (line 42) | def __init__(self, config, medusa_config, prefix, weights): method forward (line 55) | def forward(self, x): class MedusaHeadV1 (line 62) | class MedusaHeadV1(nn.Module): method __init__ (line 63) | def __init__(self, lm_head, medusa): method load (line 69) | def load(config, prefix: str, weights): method forward (line 97) | def forward( class MedusaHeadV2 (line 109) | class MedusaHeadV2(nn.Module): method __init__ (line 110) | def __init__(self, config, prefix, weights): method forward (line 150) | def forward(self, x): FILE: server/text_generation_server/layers/mlp.py class MLPSpeculatorLayerNorm (line 11) | class MLPSpeculatorLayerNorm(nn.Module): method __init__ (line 27) | def __init__( method forward (line 39) | def forward(self, x): function simple_norm (line 51) | def simple_norm(x: torch.Tensor, eps=1e-06): class MLPSpeculatorModelTied (line 58) | class MLPSpeculatorModelTied(torch.nn.Module): method __init__ (line 59) | def __init__(self, config, prefix, weights): method forward (line 96) | def forward( class MLPSpeculatorModel (line 142) | class MLPSpeculatorModel(torch.nn.Module): method __init__ (line 143) | def __init__(self, config, prefix, weights): method forward (line 192) | def forward( class MLPSpeculatorHead (line 235) | class MLPSpeculatorHead(nn.Module): method __init__ (line 236) | def __init__(self, lm_head, mlp_speculator, scale_input: bool): method forward (line 242) | def forward( method load (line 257) | def load(config, prefix: str, weights): FILE: server/text_generation_server/layers/moe/__init__.py class MoELayer (line 45) | class MoELayer(Protocol): method __init__ (line 46) | def __init__( method forward (line 64) | def forward( class DenseMoELayer (line 69) | class DenseMoELayer(nn.Module): method __init__ (line 77) | def __init__( method forward (line 158) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... class SparseMoELayer (line 197) | class SparseMoELayer(nn.Module): method __init__ (line 204) | def __init__( method forward (line 265) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... method is_supported (line 269) | def is_supported(weights: Weights) -> bool: FILE: server/text_generation_server/layers/moe/fp8.py class FP8SparseMoELayer (line 20) | class FP8SparseMoELayer(nn.Module): method __init__ (line 21) | def __init__( method forward (line 72) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... function _load_expert_weights (line 94) | def _load_expert_weights( function _load_expert_multi_weights_col (line 143) | def _load_expert_multi_weights_col( function _load_expert_weights_row (line 161) | def _load_expert_weights_row( FILE: server/text_generation_server/layers/moe/fused_moe_ipex.py function grouped_topk (line 21) | def grouped_topk( function fused_topk (line 53) | def fused_topk( FILE: server/text_generation_server/layers/moe/gptq_marlin.py function can_use_marlin_moe_gemm (line 29) | def can_use_marlin_moe_gemm( class GPTQMarlinMoEWeight (line 47) | class GPTQMarlinMoEWeight: class GPTQMarlinSparseMoELayer (line 56) | class GPTQMarlinSparseMoELayer(nn.Module): method __init__ (line 61) | def __init__( method forward (line 119) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... function _load_expert_multi_weights_col (line 151) | def _load_expert_multi_weights_col( function _load_expert_weights_row (line 171) | def _load_expert_weights_row( function _pack_weight (line 191) | def _pack_weight( function fused_marlin_moe (line 243) | def fused_marlin_moe( FILE: server/text_generation_server/layers/moe/unquantized.py class UnquantizedSparseMoELayer (line 18) | class UnquantizedSparseMoELayer(nn.Module): method __init__ (line 19) | def __init__( method forward (line 68) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ... function _load_expert_multi_weights_col (line 107) | def _load_expert_multi_weights_col( function _load_expert_weights_row (line 137) | def _load_expert_weights_row( function fused_moe (line 166) | def fused_moe( FILE: server/text_generation_server/layers/rotary.py function _create_inv_freq (line 17) | def _create_inv_freq(dim, base, device): function _get_rope_config (line 24) | def _get_rope_config(config): class PositionRotaryEmbedding (line 34) | class PositionRotaryEmbedding(nn.Module): method __init__ (line 35) | def __init__(self, inv_freq, scaling_factor): method forward (line 46) | def forward( method static (line 83) | def static(cls, config, dim, base, device): method load (line 206) | def load(cls, config, prefix, weights): method _update_cos_sin_cache (line 251) | def _update_cos_sin_cache(self, dtype, device, seqlen): method get_cos_sin (line 270) | def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: t... class SuRotaryEmbedding (line 289) | class SuRotaryEmbedding(PositionRotaryEmbedding): method __init__ (line 290) | def __init__( method _update_cos_sin_cache (line 309) | def _update_cos_sin_cache(self, dtype, device, seqlen): class Phi3LongRoPEScaledRotaryEmbedding (line 336) | class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding): method __init__ (line 337) | def __init__( method _update_cos_sin_cache (line 362) | def _update_cos_sin_cache(self, dtype, device, seqlen): class DynamicPositionRotaryEmbedding (line 393) | class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): method __init__ (line 394) | def __init__(self, dim, max_position_embeddings, base, device, scaling... method _update_cos_sin_cache (line 401) | def _update_cos_sin_cache(self, dtype, device, seqlen): function find_correction_dim (line 427) | def find_correction_dim(num_rotations, dim, base=10000, max_position_emb... function find_correction_range (line 434) | def find_correction_range( function linear_ramp_mask (line 442) | def linear_ramp_mask(min, max, dim): function get_mscale (line 451) | def get_mscale(scale: float = 1.0, mscale: float = 1.0): class YarnPositionRotaryEmbedding (line 457) | class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): method __init__ (line 458) | def __init__( method _update_cos_sin_cache (line 490) | def _update_cos_sin_cache(self, dtype, device, seqlen): function apply_llama3_scaling (line 532) | def apply_llama3_scaling( class RotaryPositionEmbeddingMultimodalSections (line 561) | class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding): method __init__ (line 562) | def __init__(self, inv_freq: torch.Tensor, scaling_factor: float, sect... method _update_cos_sin_cache (line 574) | def _update_cos_sin_cache( method get_cos_sin (line 591) | def get_cos_sin( FILE: server/text_generation_server/layers/speculative.py class SpeculativeHead (line 9) | class SpeculativeHead(torch.nn.Module): method __init__ (line 10) | def __init__(self, lm_head, speculator): method load (line 16) | def load(config, prefix: str, weights): method forward (line 44) | def forward( FILE: server/text_generation_server/layers/tensor_parallel.py class LayerConcat (line 11) | class LayerConcat(torch.nn.Module): method __init__ (line 17) | def __init__(self, layers: Iterable[torch.nn.Module], dim: int = -1): method forward (line 25) | def forward(self, x: torch.Tensor): class SuperLayer (line 30) | class SuperLayer(torch.nn.Module): method __init__ (line 31) | def __init__(self, linear): method forward (line 35) | def forward(self, x): class TensorParallelHead (line 39) | class TensorParallelHead(SuperLayer): method __init__ (line 40) | def __init__(self, linear, process_group, should_gather: bool): method load (line 46) | def load(config, prefix: str, weights): method forward (line 75) | def forward(self, input: torch.Tensor) -> torch.Tensor: class TensorParallelColumnLinear (line 118) | class TensorParallelColumnLinear(SuperLayer): method load_gate_up (line 120) | def load_gate_up(cls, config, prefix: str, weights, bias: bool): method load_qkv (line 131) | def load_qkv( method load (line 154) | def load(cls, config, prefix: str, weights, bias: bool): method load_multi (line 164) | def load_multi(cls, config, prefixes: List[str], weights, bias: bool, ... class TensorParallelRowLinear (line 183) | class TensorParallelRowLinear(SuperLayer): method __init__ (line 184) | def __init__(self, linear, process_group): method load (line 189) | def load(cls, config, prefix: str, weights, bias: bool): method forward (line 202) | def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.T... class TensorParallelEmbedding (line 212) | class TensorParallelEmbedding(torch.nn.Module): method __init__ (line 213) | def __init__(self, prefix: str, weights, reduce=True): method forward (line 235) | def forward(self, input: torch.Tensor) -> torch.Tensor: FILE: server/text_generation_server/models/__init__.py class ModelType (line 217) | class ModelType(enum.Enum): function get_model (line 425) | def get_model( function get_model_with_lora_adapters (line 1802) | def get_model_with_lora_adapters( FILE: server/text_generation_server/models/bloom.py class BloomCausalLMBatch (line 15) | class BloomCausalLMBatch(CausalLMBatch): method from_pb (line 17) | def from_pb( class BLOOMSharded (line 29) | class BLOOMSharded(CausalLM): method batch_type (line 31) | def batch_type(self) -> Type[CausalLMBatch]: method forward (line 34) | def forward( FILE: server/text_generation_server/models/causal_lm.py class CausalLMBatch (line 38) | class CausalLMBatch(Batch): method to_pb (line 73) | def to_pb(self) -> generate_pb2.CachedBatch: method from_pb (line 83) | def from_pb( method filter (line 175) | def filter(self, request_ids: List[int]) -> Optional["CausalLMBatch"]: method concatenate (line 279) | def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch": method __len__ (line 491) | def __len__(self): class CausalLMBatchKeysLast (line 496) | class CausalLMBatchKeysLast(CausalLMBatch): class CausalLM (line 500) | class CausalLM(Model): method __init__ (line 501) | def __init__( method fallback (line 584) | def fallback( method batch_type (line 659) | def batch_type(self) -> Type[CausalLMBatch]: method forward (line 662) | def forward( method generate_token (line 686) | def generate_token( FILE: server/text_generation_server/models/custom_modeling/bloom_modeling.py function _make_causal_mask (line 68) | def _make_causal_mask( function _expand_mask (line 88) | def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor: function build_alibi_tensor (line 99) | def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int) -> ... function dropout_add (line 156) | def dropout_add( function _split_heads (line 178) | def _split_heads( function _merge_heads (line 210) | def _merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torc... class BloomAttention (line 236) | class BloomAttention(nn.Module): method __init__ (line 237) | def __init__(self, prefix, config: BloomConfig, weights): method compute_attention (line 280) | def compute_attention( method forward (line 357) | def forward( class BloomMLP (line 435) | class BloomMLP(nn.Module): method __init__ (line 436) | def __init__(self, prefix, config: BloomConfig, weights): method forward (line 450) | def forward( class BloomBlock (line 474) | class BloomBlock(nn.Module): method __init__ (line 475) | def __init__(self, layer_id: int, config: BloomConfig, weights): method forward (line 500) | def forward( class BloomPreTrainedModel (line 556) | class BloomPreTrainedModel(PreTrainedModel): method _convert_to_standard_cache (line 562) | def _convert_to_standard_cache( method _convert_to_bloom_cache (line 582) | def _convert_to_bloom_cache( class BloomModel (line 601) | class BloomModel(BloomPreTrainedModel): method __init__ (line 602) | def __init__(self, config: BloomConfig, weights): method _prepare_attn_mask (line 635) | def _prepare_attn_mask( method set_input_embeddings (line 664) | def set_input_embeddings(self, new_embeddings: torch.Tensor): method forward (line 667) | def forward( class BloomForCausalLM (line 818) | class BloomForCausalLM(BloomPreTrainedModel): method __init__ (line 819) | def __init__(self, prefix: str, config, weights): method prepare_inputs_for_generation (line 829) | def prepare_inputs_for_generation( method forward (line 860) | def forward( FILE: server/text_generation_server/models/custom_modeling/clip.py class CLIPVisionEmbeddings (line 23) | class CLIPVisionEmbeddings(nn.Module): method __init__ (line 24) | def __init__(self, prefix, config: CLIPVisionConfig, weights): method forward (line 56) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class CLIPTextEmbeddings (line 70) | class CLIPTextEmbeddings(nn.Module): method __init__ (line 71) | def __init__(self, config: CLIPTextConfig): method forward (line 87) | def forward( class CLIPAttention (line 109) | class CLIPAttention(nn.Module): method __init__ (line 112) | def __init__(self, prefix, config, weights): method _shape (line 142) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 149) | def forward( class CLIPMLP (line 234) | class CLIPMLP(nn.Module): method __init__ (line 235) | def __init__(self, prefix, config, weights): method forward (line 246) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class CLIPEncoderLayer (line 253) | class CLIPEncoderLayer(nn.Module): method __init__ (line 254) | def __init__(self, prefix, config: CLIPConfig, weights): method forward (line 268) | def forward( class CLIPPreTrainedModel (line 299) | class CLIPPreTrainedModel(nn.Module): class CLIPEncoder (line 386) | class CLIPEncoder(nn.Module): method __init__ (line 395) | def __init__(self, prefix, config: CLIPConfig, weights): method forward (line 407) | def forward( class CLIPTextTransformer (line 446) | class CLIPTextTransformer(nn.Module): method __init__ (line 447) | def __init__(self, prefix: str, config: CLIPTextConfig, weights=None): method forward (line 461) | def forward( class CLIPTextModel (line 533) | class CLIPTextModel(CLIPPreTrainedModel): method __init__ (line 538) | def __init__(self, prefix, config: CLIPTextConfig): method forward (line 544) | def forward( class CLIPVisionTransformer (line 575) | class CLIPVisionTransformer(nn.Module): method __init__ (line 576) | def __init__(self, prefix, config: CLIPVisionConfig, weights): method forward (line 591) | def forward( class CLIPVisionModel (line 619) | class CLIPVisionModel(CLIPPreTrainedModel): method __init__ (line 624) | def __init__(self, config: CLIPVisionConfig): method get_input_embeddings (line 630) | def get_input_embeddings(self) -> nn.Module: method forward (line 633) | def forward( class CLIPModel (line 665) | class CLIPModel(nn.Module): method __init__ (line 666) | def __init__(self, prefix, config: CLIPConfig, weights): method get_text_features (line 691) | def get_text_features( method get_image_features (line 724) | def get_image_features( method forward (line 760) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py class CohereRotary (line 56) | class CohereRotary(PositionRotaryEmbedding): method forward (line 57) | def forward( class CohereLayerNorm (line 101) | class CohereLayerNorm(nn.Module): method __init__ (line 102) | def __init__(self, prefix, weights, eps): method forward (line 110) | def forward(self, hidden_states): function load_attention (line 156) | def load_attention(config, prefix, weights): function _load_gqa (line 169) | def _load_gqa(config, prefix: str, weights): class FlashCohereAttention (line 201) | class FlashCohereAttention(torch.nn.Module): method __init__ (line 202) | def __init__( method forward (line 262) | def forward( class CohereMLP (line 334) | class CohereMLP(nn.Module): method __init__ (line 335) | def __init__(self, prefix, config, weights): method forward (line 366) | def forward(self, hidden_states): class FlashCohereLayer (line 374) | class FlashCohereLayer(nn.Module): method __init__ (line 375) | def __init__(self, prefix: str, layer_id, config, weights): method forward (line 390) | def forward( class FlashCohereModel (line 427) | class FlashCohereModel(torch.nn.Module): method __init__ (line 428) | def __init__(self, prefix: str, config, weights): method forward (line 458) | def forward( class FlashCohereForCausalLM (line 498) | class FlashCohereForCausalLM(torch.nn.Module): method __init__ (line 499) | def __init__(self, prefix: str, config, weights): method forward (line 522) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py class DbrxAttentionConfig (line 55) | class DbrxAttentionConfig(PretrainedConfig): method __init__ (line 56) | def __init__( class DbrxFFNConfig (line 77) | class DbrxFFNConfig(PretrainedConfig): method __init__ (line 78) | def __init__( class DbrxConfig (line 112) | class DbrxConfig(PretrainedConfig): method __init__ (line 119) | def __init__( method num_key_value_heads (line 172) | def num_key_value_heads(self): function promote_scalar (line 178) | def promote_scalar(x: torch.Tensor) -> torch.Tensor: function load_attention (line 182) | def load_attention(config, prefix, weights): function _load_experts (line 193) | def _load_experts(config, prefix, weights): function _load_experts_quantized (line 224) | def _load_experts_quantized(config, prefix, weights, cls): class DbrxAttention (line 264) | class DbrxAttention(torch.nn.Module): method __init__ (line 265) | def __init__( method forward (line 310) | def forward( class DbrxNormAttentionNorm (line 374) | class DbrxNormAttentionNorm(nn.Module): method __init__ (line 375) | def __init__( method forward (line 394) | def forward( function select_experts (line 429) | def select_experts( function round_up (line 447) | def round_up(x: torch.Tensor, value: int): class BlockSparseMoE (line 451) | class BlockSparseMoE(nn.Module): method __init__ (line 452) | def __init__(self, prefix, config: DbrxConfig, weights): method forward (line 501) | def forward(self, x: torch.Tensor) -> torch.Tensor: class DenseMoE (line 533) | class DenseMoE(nn.Module): method __init__ (line 534) | def __init__(self, prefix, config: DbrxConfig, weights): method forward (line 584) | def forward(self, x: torch.Tensor) -> torch.Tensor: class DbrxLayer (line 631) | class DbrxLayer(nn.Module): method __init__ (line 632) | def __init__(self, prefix: str, layer_id, config, weights): method forward (line 643) | def forward( class DbrxModel (line 675) | class DbrxModel(torch.nn.Module): method __init__ (line 676) | def __init__(self, prefix: str, config, weights): method forward (line 702) | def forward( class FlashDbrxForCausalLM (line 741) | class FlashDbrxForCausalLM(torch.nn.Module): method __init__ (line 742) | def __init__(self, prefix: str, config, weights): method forward (line 757) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py class DeepseekV2Config (line 51) | class DeepseekV2Config(PretrainedConfig): method __init__ (line 52) | def __init__( class DeepseekV2Attention (line 157) | class DeepseekV2Attention(torch.nn.Module): method __init__ (line 158) | def __init__( method forward (line 258) | def forward( class DeepseekV2MLP (line 366) | class DeepseekV2MLP(nn.Module): method __init__ (line 367) | def __init__(self, prefix: str, config, weights, intermediate_size: int): method forward (line 397) | def forward(self, hidden_states: torch.Tensor, reduce: bool = True): class DeepseekV2MoE (line 421) | class DeepseekV2MoE(nn.Module): method __init__ (line 422) | def __init__( method forward (line 464) | def forward(self, x: torch.Tensor) -> torch.Tensor: class DeepseekV2Layer (line 484) | class DeepseekV2Layer(nn.Module): method __init__ (line 485) | def __init__(self, prefix, layer_id, config, weights): method forward (line 523) | def forward( class DeepseekV2Model (line 561) | class DeepseekV2Model(torch.nn.Module): method __init__ (line 562) | def __init__(self, prefix: str, config, weights: Weights): method forward (line 588) | def forward( class FlashDeepseekV2ForCausalLM (line 627) | class FlashDeepseekV2ForCausalLM(torch.nn.Module): method __init__ (line 628) | def __init__(self, prefix: str, config, weights: Weights): method forward (line 640) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py class DeepseekV3Config (line 51) | class DeepseekV3Config(PretrainedConfig): method __init__ (line 52) | def __init__( class DeepseekV3Attention (line 157) | class DeepseekV3Attention(torch.nn.Module): method __init__ (line 158) | def __init__( method forward (line 258) | def forward( class DeepseekV3MLP (line 366) | class DeepseekV3MLP(nn.Module): method __init__ (line 367) | def __init__(self, prefix: str, config, weights, intermediate_size: int): method forward (line 397) | def forward(self, hidden_states: torch.Tensor, reduce: bool = True): class DeepseekV3MoE (line 421) | class DeepseekV3MoE(nn.Module): method __init__ (line 422) | def __init__( method forward (line 473) | def forward(self, x: torch.Tensor) -> torch.Tensor: class DeepseekV3Layer (line 493) | class DeepseekV3Layer(nn.Module): method __init__ (line 494) | def __init__(self, prefix, layer_id, config, weights): method forward (line 532) | def forward( class DeepseekV3Model (line 570) | class DeepseekV3Model(torch.nn.Module): method __init__ (line 571) | def __init__(self, prefix: str, config, weights: Weights): method forward (line 597) | def forward( class FlashDeepseekV3ForCausalLM (line 636) | class FlashDeepseekV3ForCausalLM(torch.nn.Module): method __init__ (line 637) | def __init__(self, prefix: str, config, weights: Weights): method forward (line 649) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py class Gemma2Config (line 50) | class Gemma2Config(PretrainedConfig): method __init__ (line 51) | def __init__( class Gemma2FastRMSNorm (line 106) | class Gemma2FastRMSNorm(FastRMSNorm): method load (line 108) | def load(cls, prefix: str, weights, eps=1e-6): method forward (line 118) | def forward(self, hidden_states, residual=None): function load_attention (line 129) | def load_attention(config, prefix: str, weights): function _load_gqa (line 142) | def _load_gqa(config, prefix: str, weights): class FlashGemma2Attention (line 164) | class FlashGemma2Attention(torch.nn.Module): method __init__ (line 165) | def __init__( method forward (line 230) | def forward( class Gemma2MLP (line 298) | class Gemma2MLP(nn.Module): method __init__ (line 299) | def __init__(self, prefix, config, weights, layer_id): method forward (line 348) | def forward(self, hidden_states, adapter_data): class FlashGemma2Layer (line 356) | class FlashGemma2Layer(nn.Module): method __init__ (line 357) | def __init__( method forward (line 392) | def forward( class FlashGemma2Model (line 434) | class FlashGemma2Model(torch.nn.Module): method __init__ (line 435) | def __init__(self, prefix: str, config, weights, causal: bool): method forward (line 462) | def forward( class FlashGemma2ForCausalLM (line 503) | class FlashGemma2ForCausalLM(torch.nn.Module): method __init__ (line 504) | def __init__(self, prefix: str, config, weights, *, causal: bool = True): method forward (line 533) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py class Gemma3FastRMSNorm (line 62) | class Gemma3FastRMSNorm(FastRMSNorm): method load (line 64) | def load(cls, prefix: str, weights, eps=1e-6): method forward (line 74) | def forward(self, hidden_states, residual=None): function load_attention (line 85) | def load_attention(config, prefix: str, weights): function _load_gqa (line 98) | def _load_gqa(config, prefix: str, weights): class FlashGemma3Attention (line 120) | class FlashGemma3Attention(torch.nn.Module): method __init__ (line 121) | def __init__( method forward (line 203) | def forward( class Gemma3MLP (line 329) | class Gemma3MLP(nn.Module): method __init__ (line 330) | def __init__(self, prefix, config, weights, layer_id): method forward (line 379) | def forward(self, hidden_states, adapter_data): class FlashGemma3Layer (line 387) | class FlashGemma3Layer(nn.Module): method __init__ (line 388) | def __init__( method forward (line 423) | def forward( class FlashGemma3Model (line 467) | class FlashGemma3Model(torch.nn.Module): method __init__ (line 468) | def __init__(self, prefix: str, config, weights, causal: bool): method forward (line 496) | def forward( class FlashGemma3ForCausalLM (line 545) | class FlashGemma3ForCausalLM(torch.nn.Module): method __init__ (line 546) | def __init__(self, prefix: str, config, weights, *, causal: bool = True): method forward (line 576) | def forward( class Gemma3MultimodalInputProjection (line 610) | class Gemma3MultimodalInputProjection(torch.nn.Module): method __init__ (line 611) | def __init__(self, prefix, config, weights): method forward (line 633) | def forward(self, vision_outputs: torch.Tensor): class Gemma3ForConditionalGeneration (line 654) | class Gemma3ForConditionalGeneration(nn.Module): method __init__ (line 655) | def __init__(self, prefix, config, weights): method get_attention_mask (line 705) | def get_attention_mask( method get_vision_embeds (line 766) | def get_vision_embeds( method get_inputs_embeds (line 782) | def get_inputs_embeds( method forward (line 799) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py class GemmaConfig (line 48) | class GemmaConfig(PretrainedConfig): method __init__ (line 49) | def __init__( class GemmaFastRMSNorm (line 104) | class GemmaFastRMSNorm(FastRMSNorm): method load (line 106) | def load(cls, prefix: str, weights, eps=1e-6): method forward (line 116) | def forward(self, hidden_states, residual=None): function load_attention (line 127) | def load_attention(config, prefix: str, weights): function _load_gqa (line 140) | def _load_gqa(config, prefix: str, weights): class FlashGemmaAttention (line 162) | class FlashGemmaAttention(torch.nn.Module): method __init__ (line 163) | def __init__(self, prefix: str, config, weights, causal: bool): method forward (line 202) | def forward( class GemmaMLP (line 264) | class GemmaMLP(nn.Module): method __init__ (line 265) | def __init__(self, prefix: str, config, weights): method forward (line 296) | def forward(self, hidden_states): class FlashGemmaLayer (line 302) | class FlashGemmaLayer(nn.Module): method __init__ (line 303) | def __init__(self, prefix: str, config, weights, causal: bool): method forward (line 319) | def forward( class FlashGemmaModel (line 357) | class FlashGemmaModel(torch.nn.Module): method __init__ (line 358) | def __init__(self, prefix: str, config, weights, causal: bool): method forward (line 383) | def forward( class FlashGemmaForCausalLM (line 422) | class FlashGemmaForCausalLM(torch.nn.Module): method __init__ (line 423) | def __init__(self, prefix: str, config, weights, *, causal: bool = True): method forward (line 450) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py function load_qkv (line 42) | def load_qkv(config, prefix: str, weights, head_size, num_heads): function _load_qkv_gptq (line 57) | def _load_qkv_gptq(config, prefix: str, weights): function _load_qkv (line 88) | def _load_qkv(config, prefix: str, weights, head_size, num_heads): function load_row (line 135) | def load_row(config, prefix: str, weights, bias: bool): function load_col (line 154) | def load_col(config, prefix: str, weights, bias: bool): class FlashGPT2Attention (line 169) | class FlashGPT2Attention(torch.nn.Module): method __init__ (line 170) | def __init__( method forward (line 210) | def forward( class GPT2MLP (line 263) | class GPT2MLP(nn.Module): method __init__ (line 264) | def __init__(self, prefix: str, config, weights): method forward (line 294) | def forward(self, hidden_states): class FlashGPT2Layer (line 300) | class FlashGPT2Layer(nn.Module): method __init__ (line 301) | def __init__(self, prefix: str, config, weights): method forward (line 317) | def forward( class FlashGPT2Model (line 352) | class FlashGPT2Model(torch.nn.Module): method __init__ (line 353) | def __init__(self, prefix: str, config, weights): method forward (line 383) | def forward( class FlashGPT2ForCausalLM (line 416) | class FlashGPT2ForCausalLM(torch.nn.Module): method __init__ (line 417) | def __init__(self, prefix: str, config, weights): method forward (line 436) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py function load_attention (line 49) | def load_attention(config, prefix: str, weights): function load_row (line 59) | def load_row(config, prefix: str, weights, bias: bool): class GPTJRotary (line 72) | class GPTJRotary(PositionRotaryEmbedding): method forward (line 73) | def forward( class FlashGPTJAttention (line 117) | class FlashGPTJAttention(torch.nn.Module): method __init__ (line 118) | def __init__( method forward (line 164) | def forward( class GPTJMLP (line 227) | class GPTJMLP(nn.Module): method __init__ (line 228) | def __init__(self, prefix: str, config, weights): method forward (line 253) | def forward(self, hidden_states): class FlashGPTJLayer (line 259) | class FlashGPTJLayer(nn.Module): method __init__ (line 260) | def __init__(self, prefix: str, config, weights): method forward (line 271) | def forward( class FlashGPTJModel (line 303) | class FlashGPTJModel(torch.nn.Module): method __init__ (line 304) | def __init__(self, prefix: str, config, weights): method forward (line 333) | def forward( class FlashGPTJForCausalLM (line 373) | class FlashGPTJForCausalLM(torch.nn.Module): method __init__ (line 374) | def __init__(self, prefix: str, config, weights): method forward (line 387) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_llama_modeling.py function load_attention (line 72) | def load_attention(config, prefix: str, weights, layer_id): function no_fp8 (line 125) | def no_fp8(weights: Weights): class FlashLlamaAttention (line 137) | class FlashLlamaAttention(torch.nn.Module): method __init__ (line 138) | def __init__( method forward (line 206) | def forward( class Phi3MoE (line 270) | class Phi3MoE(nn.Module): method __init__ (line 271) | def __init__( method forward (line 294) | def forward(self, x, adapter_data) -> torch.Tensor: class LlamaMLP (line 306) | class LlamaMLP(nn.Module): method __init__ (line 307) | def __init__(self, prefix, config, weights, index): method forward (line 379) | def forward(self, hidden_states, adapter_data): class FlashLlamaLayer (line 407) | class FlashLlamaLayer(nn.Module): method __init__ (line 408) | def __init__(self, index, prefix, config, weights): method forward (line 459) | def forward( class FlashLlamaModel (line 503) | class FlashLlamaModel(torch.nn.Module): method __init__ (line 504) | def __init__(self, prefix, config, weights): method forward (line 572) | def forward( class FlashLlamaForCausalLM (line 617) | class FlashLlamaForCausalLM(torch.nn.Module): method __init__ (line 618) | def __init__(self, prefix: str, config, weights, name=None): method forward (line 663) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py class MistralConfig (line 57) | class MistralConfig(PretrainedConfig): method __init__ (line 60) | def __init__( class MistralAttention (line 111) | class MistralAttention(torch.nn.Module): method __init__ (line 112) | def __init__(self, prefix: str, config, weights, layer_id): method forward (line 180) | def forward( class MistralMLP (line 252) | class MistralMLP(nn.Module): method __init__ (line 253) | def __init__(self, prefix: str, config, weights, layer_id): method forward (line 307) | def forward(self, hidden_states, adapter_data): class MistralLayer (line 333) | class MistralLayer(nn.Module): method __init__ (line 334) | def __init__(self, prefix: str, config, weights, layer_id): method forward (line 355) | def forward( class MistralModel (line 397) | class MistralModel(torch.nn.Module): method __init__ (line 398) | def __init__(self, prefix: str, config, weights): method forward (line 425) | def forward( class FlashMistralForCausalLM (line 467) | class FlashMistralForCausalLM(torch.nn.Module): method __init__ (line 468) | def __init__(self, prefix: str, config, weights, name=None): method forward (line 500) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py class MixtralConfig (line 48) | class MixtralConfig(PretrainedConfig): method __init__ (line 51) | def __init__( function promote_scalar (line 106) | def promote_scalar(x: torch.Tensor) -> torch.Tensor: function load_attention (line 110) | def load_attention(config, prefix: str, weights): function _load_gqa (line 123) | def _load_gqa(config, prefix: str, weights): function _load_experts (line 146) | def _load_experts(config, prefix: str, mat, weights): class MixtralAttention (line 182) | class MixtralAttention(torch.nn.Module): method __init__ (line 183) | def __init__( method forward (line 230) | def forward( function select_experts (line 300) | def select_experts(gate_logits: torch.Tensor, top_k: int): function round_up (line 313) | def round_up(x: torch.Tensor, value: int): class MixtralMoE (line 317) | class MixtralMoE(nn.Module): method __init__ (line 318) | def __init__( method forward (line 342) | def forward(self, x: torch.Tensor) -> torch.Tensor: class MixtralLayer (line 354) | class MixtralLayer(nn.Module): method __init__ (line 355) | def __init__(self, prefix: str, layer_id, config, weights): method forward (line 379) | def forward( class MixtralModel (line 419) | class MixtralModel(torch.nn.Module): method __init__ (line 420) | def __init__(self, prefix: str, config, weights): method forward (line 451) | def forward( class FlashMixtralForCausalLM (line 493) | class FlashMixtralForCausalLM(torch.nn.Module): method __init__ (line 494) | def __init__(self, prefix: str, config, weights): method forward (line 510) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_neox_modeling.py class GPTNeoXConfig (line 51) | class GPTNeoXConfig(TransformersGPTNeoXConfig): function load_row (line 57) | def load_row(config, prefix: str, weights, bias: bool): function load_qkv (line 73) | def load_qkv(config, prefix: str, weights, num_heads, head_size, hidden_... class FlashNeoxAttention (line 98) | class FlashNeoxAttention(torch.nn.Module): method __init__ (line 99) | def __init__(self, config, prefix, weights): method forward (line 142) | def forward( class FlashMLP (line 204) | class FlashMLP(nn.Module): method __init__ (line 205) | def __init__(self, config, prefix, weights): method forward (line 226) | def forward(self, hidden_states): class FlashNeoXLayer (line 233) | class FlashNeoXLayer(nn.Module): method __init__ (line 234) | def __init__(self, layer_id, config, weights): method forward (line 257) | def forward( class FlashGPTNeoXPreTrainedModel (line 318) | class FlashGPTNeoXPreTrainedModel(PreTrainedModel): class FlashGPTNeoXModel (line 325) | class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): method __init__ (line 326) | def __init__(self, prefix: str, config, weights): method forward (line 351) | def forward( class FlashGPTNeoXForCausalLM (line 390) | class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): method __init__ (line 391) | def __init__(self, prefix, config, weights): method forward (line 405) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py class PaliGemmaForConditionalGeneration (line 29) | class PaliGemmaForConditionalGeneration(nn.Module): method __init__ (line 30) | def __init__(self, prefix, config, weights): method get_vision_embeds (line 67) | def get_vision_embeds( method get_inputs_embeds (line 83) | def get_inputs_embeds( method forward (line 96) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_phi_modeling.py class PhiConfig (line 30) | class PhiConfig(PretrainedConfig): method __init__ (line 31) | def __init__( function load_attention (line 70) | def load_attention(config, prefix, weights): function _load_gqa (line 83) | def _load_gqa(config, prefix: str, weights): class FlashPhiAttention (line 107) | class FlashPhiAttention(torch.nn.Module): method __init__ (line 108) | def __init__( method forward (line 155) | def forward( class PhiMLP (line 226) | class PhiMLP(nn.Module): method __init__ (line 227) | def __init__(self, prefix, config, weights): method forward (line 255) | def forward(self, hidden_states): class FlashPhiLayer (line 261) | class FlashPhiLayer(nn.Module): method __init__ (line 262) | def __init__(self, prefix: str, layer_id, config, weights): method forward (line 276) | def forward( class FlashPhiModel (line 310) | class FlashPhiModel(torch.nn.Module): method __init__ (line 311) | def __init__(self, prefix: str, config, weights): method forward (line 343) | def forward( class FlashPhiForCausalLM (line 382) | class FlashPhiForCausalLM(torch.nn.Module): method __init__ (line 383) | def __init__(self, prefix: str, config, weights): method forward (line 398) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py class PhiMoEConfig (line 30) | class PhiMoEConfig(PretrainedConfig): method __init__ (line 121) | def __init__( method _rope_scaling_validation (line 191) | def _rope_scaling_validation(self): FILE: server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py function load_attention (line 28) | def load_attention(config, prefix, weights, layer_id): function _load_gqa (line 55) | def _load_gqa(config, prefix: str, weights): class Qwen2Attention (line 68) | class Qwen2Attention(torch.nn.Module): method __init__ (line 69) | def __init__( method forward (line 124) | def forward( class Qwen2MLP (line 196) | class Qwen2MLP(nn.Module): method __init__ (line 197) | def __init__(self, prefix, config, weights, index): method forward (line 246) | def forward(self, hidden_states, adapter_data): class Qwen2Layer (line 254) | class Qwen2Layer(nn.Module): method __init__ (line 255) | def __init__(self, prefix, layer_id, config, weights): method forward (line 273) | def forward( class Qwen2Model (line 313) | class Qwen2Model(torch.nn.Module): method __init__ (line 314) | def __init__(self, prefix: str, config, weights): method forward (line 343) | def forward( class Qwen2ForCausalLM (line 387) | class Qwen2ForCausalLM(torch.nn.Module): method __init__ (line 388) | def __init__(self, prefix: str, config, weights): method forward (line 416) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_rw_modeling.py function load_row (line 25) | def load_row(config, prefix: str, weights, bias: bool): class RWConfig (line 41) | class RWConfig(PretrainedConfig): method __init__ (line 48) | def __init__( class FlashRWAttention (line 128) | class FlashRWAttention(torch.nn.Module): method __init__ (line 129) | def __init__( method forward (line 177) | def forward( class FlashRWLargeAttention (line 240) | class FlashRWLargeAttention(torch.nn.Module): method __init__ (line 241) | def __init__( method forward (line 298) | def forward( class FlashMLP (line 362) | class FlashMLP(nn.Module): method __init__ (line 363) | def __init__(self, config, prefix: str, weights): method forward (line 374) | def forward(self, hidden_states): class FlashRWLayer (line 381) | class FlashRWLayer(nn.Module): method __init__ (line 382) | def __init__( method forward (line 429) | def forward( class FlashRWLayerNorm (line 489) | class FlashRWLayerNorm(nn.Module): method __init__ (line 490) | def __init__(self, config, prefix: str, weights): method forward (line 520) | def forward( class FlashRWLargeLayer (line 534) | class FlashRWLargeLayer(nn.Module): method __init__ (line 535) | def __init__(self, layer_id, prefix: str, config, weights): method forward (line 552) | def forward( class FlashRWPreTrainedModel (line 592) | class FlashRWPreTrainedModel(PreTrainedModel): class FlashRWModel (line 596) | class FlashRWModel(FlashRWPreTrainedModel): method __init__ (line 597) | def __init__(self, prefix: str, config, weights): method forward (line 630) | def forward( class FlashRWForCausalLM (line 669) | class FlashRWForCausalLM(FlashRWPreTrainedModel): method __init__ (line 670) | def __init__(self, prefix: str, config, weights): method forward (line 682) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py function load_multi_mqa (line 27) | def load_multi_mqa( function _load_multi_mqa_gptq (line 44) | def _load_multi_mqa_gptq( function _load_multi_mqa (line 131) | def _load_multi_mqa( function load_col (line 201) | def load_col(config, prefix: str, weights, bias: bool): function load_row (line 214) | def load_row(config, prefix: str, weights, bias: bool): class FlashMQAttention (line 230) | class FlashMQAttention(torch.nn.Module): method __init__ (line 231) | def __init__(self, prefix, config, weights): method forward (line 266) | def forward( class MLP (line 323) | class MLP(nn.Module): method __init__ (line 324) | def __init__(self, prefix, config, weights): method forward (line 345) | def forward(self, hidden_states): class Block (line 352) | class Block(nn.Module): method __init__ (line 353) | def __init__(self, prefix: str, layer_id, config, weights): method forward (line 373) | def forward( class FlashSantacoderModel (line 402) | class FlashSantacoderModel(nn.Module): method __init__ (line 403) | def __init__(self, prefix: str, config, weights): method forward (line 437) | def forward( class FlashSantacoderForCausalLM (line 471) | class FlashSantacoderForCausalLM(nn.Module): method __init__ (line 472) | def __init__(self, prefix, config, weights): method forward (line 486) | def forward( FILE: server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py class Starcoder2Config (line 54) | class Starcoder2Config(PretrainedConfig): method __init__ (line 57) | def __init__( function load_attention (line 114) | def load_attention(config, prefix, weights, layer_id): function _load_gqa (line 141) | def _load_gqa(config, prefix: str, weights): class Starcoder2Attention (line 173) | class Starcoder2Attention(torch.nn.Module): method __init__ (line 174) | def __init__( method forward (line 230) | def forward( class Starcoder2MLP (line 302) | class Starcoder2MLP(nn.Module): method __init__ (line 303) | def __init__(self, prefix, config, weights, index): method forward (line 345) | def forward(self, hidden_states, adapter_data): class Starcoder2GatedMLP (line 351) | class Starcoder2GatedMLP(nn.Module): method __init__ (line 352) | def __init__(self, index, prefix, config, weights): method forward (line 401) | def forward(self, hidden_states, adapter_data): class Starcoder2Layer (line 420) | class Starcoder2Layer(nn.Module): method __init__ (line 421) | def __init__(self, layer_id, config, weights): method forward (line 443) | def forward( class Starcoder2Model (line 485) | class Starcoder2Model(torch.nn.Module): method __init__ (line 486) | def __init__(self, prefix, config, weights): method forward (line 515) | def forward( class FlashStarcoder2ForCausalLM (line 559) | class FlashStarcoder2ForCausalLM(torch.nn.Module): method __init__ (line 560) | def __init__(self, prefix, config, weights): method forward (line 589) | def forward( FILE: server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py class Gemma3TextConfig (line 32) | class Gemma3TextConfig(PretrainedConfig): method __init__ (line 154) | def __init__( class Gemma3Config (line 220) | class Gemma3Config(PretrainedConfig): method __init__ (line 274) | def __init__( FILE: server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py class Gemma3ImageProcessor (line 63) | class Gemma3ImageProcessor(BaseImageProcessor): method __init__ (line 99) | def __init__( method pan_and_scan (line 135) | def pan_and_scan( method _process_images_for_pas (line 226) | def _process_images_for_pas( method preprocess (line 252) | def preprocess( FILE: server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py class Gemma3ImagesKwargs (line 38) | class Gemma3ImagesKwargs(ImagesKwargs): class Gemma3ProcessorKwargs (line 46) | class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): class Gemma3Processor (line 60) | class Gemma3Processor(ProcessorMixin): method __init__ (line 67) | def __init__( method __call__ (line 105) | def __call__( method batch_decode (line 181) | def batch_decode(self, *args, **kwargs): method decode (line 189) | def decode(self, *args, **kwargs): method model_input_names (line 198) | def model_input_names(self): FILE: server/text_generation_server/models/custom_modeling/gemma3/utils.py function is_valid_list_of_images (line 22) | def is_valid_list_of_images(images: List): function make_nested_list_of_images (line 26) | def make_nested_list_of_images( FILE: server/text_generation_server/models/custom_modeling/idefics2.py function repeat_kv (line 39) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class Idefics2VisionEmbeddings (line 53) | class Idefics2VisionEmbeddings(nn.Module): method __init__ (line 64) | def __init__(self, prefix, config, weights): method forward (line 91) | def forward( class Idefics2VisionAttention (line 134) | class Idefics2VisionAttention(nn.Module): method __init__ (line 135) | def __init__(self, prefix, config, weights): method forward (line 164) | def forward( class Idefics2VisionMLP (line 232) | class Idefics2VisionMLP(nn.Module): method __init__ (line 233) | def __init__(self, prefix, config, weights): method forward (line 244) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Idefics2EncoderLayer (line 251) | class Idefics2EncoderLayer(nn.Module): method __init__ (line 252) | def __init__(self, prefix, config, weights): method forward (line 269) | def forward( class Idefics2Encoder (line 291) | class Idefics2Encoder(nn.Module): method __init__ (line 292) | def __init__(self, prefix, config, weights): method forward (line 305) | def forward( class Idefics2VisionTransformer (line 319) | class Idefics2VisionTransformer(nn.Module): method __init__ (line 320) | def __init__(self, prefix, config, weights): method forward (line 335) | def forward( class Idefics2MLP (line 380) | class Idefics2MLP(nn.Module): method __init__ (line 381) | def __init__(self, prefix, config, weights): method forward (line 408) | def forward(self, hidden_states): class Idefics2RMSNorm (line 418) | class Idefics2RMSNorm(nn.Module): method __init__ (line 419) | def __init__(self, prefix, weights, eps): method forward (line 429) | def forward(self, hidden_states): class Idefics2PerceiverAttention (line 437) | class Idefics2PerceiverAttention(nn.Module): method __init__ (line 438) | def __init__(self, prefix, config, weights): method forward (line 472) | def forward( class Idefics2PerceiverLayer (line 544) | class Idefics2PerceiverLayer(nn.Module): method __init__ (line 545) | def __init__(self, prefix, config, weights): method forward (line 572) | def forward( class Idefics2PerceiverResampler (line 605) | class Idefics2PerceiverResampler(nn.Module): method __init__ (line 606) | def __init__(self, prefix, config, weights) -> None: method forward (line 632) | def forward( class Idefics2Connector (line 664) | class Idefics2Connector(nn.Module): method __init__ (line 665) | def __init__(self, prefix, config, weights): method forward (line 674) | def forward(self, image_hidden_states, attention_mask): class Idefics2ForConditionalGeneration (line 682) | class Idefics2ForConditionalGeneration(nn.Module): method __init__ (line 683) | def __init__(self, prefix, config, weights): method _merge_input_ids_with_image_features (line 723) | def _merge_input_ids_with_image_features( method get_vision_embeds (line 736) | def get_vision_embeds( method get_inputs_embeds (line 805) | def get_inputs_embeds( method forward (line 820) | def forward( FILE: server/text_generation_server/models/custom_modeling/idefics3.py function repeat_kv (line 38) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class Idefics3VisionEmbeddings (line 52) | class Idefics3VisionEmbeddings(nn.Module): method __init__ (line 63) | def __init__(self, prefix, config, weights): method forward (line 90) | def forward( class Idefics3VisionAttention (line 133) | class Idefics3VisionAttention(nn.Module): method __init__ (line 134) | def __init__(self, prefix, config, weights): method forward (line 163) | def forward( class Idefics3VisionMLP (line 231) | class Idefics3VisionMLP(nn.Module): method __init__ (line 232) | def __init__(self, prefix, config, weights): method forward (line 243) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Idefics3EncoderLayer (line 250) | class Idefics3EncoderLayer(nn.Module): method __init__ (line 251) | def __init__(self, prefix, config, weights): method forward (line 268) | def forward( class Idefics3Encoder (line 290) | class Idefics3Encoder(nn.Module): method __init__ (line 291) | def __init__(self, prefix, config, weights): method forward (line 304) | def forward( class Idefics3VisionTransformer (line 318) | class Idefics3VisionTransformer(nn.Module): method __init__ (line 319) | def __init__(self, prefix, config, weights): method forward (line 334) | def forward( class Idefics3SimpleMLP (line 379) | class Idefics3SimpleMLP(nn.Module): method __init__ (line 380) | def __init__(self, prefix, config, weights): method forward (line 391) | def forward(self, x): class Idefics3Connector (line 395) | class Idefics3Connector(nn.Module): method __init__ (line 396) | def __init__(self, prefix, config, weights): method pixel_shuffle (line 401) | def pixel_shuffle(self, x, scale_factor=2): method forward (line 417) | def forward(self, image_hidden_states): class Idefics3ForConditionalGeneration (line 423) | class Idefics3ForConditionalGeneration(nn.Module): method __init__ (line 424) | def __init__(self, prefix, config, weights): method _merge_input_ids_with_image_features (line 466) | def _merge_input_ids_with_image_features( method get_vision_embeds (line 479) | def get_vision_embeds( method get_inputs_embeds (line 547) | def get_inputs_embeds( method forward (line 562) | def forward( FILE: server/text_generation_server/models/custom_modeling/idefics_config.py class IdeficsVisionConfig (line 31) | class IdeficsVisionConfig(PretrainedConfig): method __init__ (line 75) | def __init__( class IdeficsPerceiverConfig (line 107) | class IdeficsPerceiverConfig(PretrainedConfig): method __init__ (line 132) | def __init__( class IdeficsConfig (line 152) | class IdeficsConfig(PretrainedConfig): method __init__ (line 228) | def __init__( method to_dict (line 314) | def to_dict(self): FILE: server/text_generation_server/models/custom_modeling/idefics_image_processing.py function convert_to_rgb (line 48) | def convert_to_rgb(image): class IdeficsImageProcessor (line 61) | class IdeficsImageProcessor(BaseImageProcessor): method __init__ (line 81) | def __init__( method preprocess (line 96) | def preprocess( method fetch_images (line 184) | def fetch_images(self, image_url_or_urls: Union[str, List[str]]): method rescale (line 226) | def rescale( method normalize (line 260) | def normalize( FILE: server/text_generation_server/models/custom_modeling/idefics_modeling.py class BaseModelOutputWithPastImage (line 61) | class BaseModelOutputWithPastImage(BaseModelOutputWithPast): class CausalLMOutputWithPastImage (line 66) | class CausalLMOutputWithPastImage(CausalLMOutputWithPast): function expand_inputs_for_generation (line 81) | def expand_inputs_for_generation( function update_model_kwargs_for_generation (line 129) | def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder... function prepare_inputs_for_generation (line 166) | def prepare_inputs_for_generation(input_ids, past=None, **kwargs): function freeze_model (line 201) | def freeze_model(model, module_exceptions=[]): class IdeficsDecoupledPartialTPEmbedding (line 220) | class IdeficsDecoupledPartialTPEmbedding(nn.Module): method __init__ (line 221) | def __init__( method forward (line 235) | def forward(self, input_ids): class IdeficsDecoupledTensorParallelLinear (line 254) | class IdeficsDecoupledTensorParallelLinear(nn.Module): method __init__ (line 263) | def __init__( method forward (line 277) | def forward(self, input: torch.Tensor) -> torch.Tensor: method extra_repr (line 284) | def extra_repr(self) -> str: function _make_causal_mask (line 296) | def _make_causal_mask( function _expand_mask (line 326) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option... class IdeficsRMSNorm (line 342) | class IdeficsRMSNorm(nn.Module): method __init__ (line 343) | def __init__(self, prefix, weights, eps=1e-6): method forward (line 353) | def forward(self, hidden_states, residual=None): class IdeficsMLP (line 445) | class IdeficsMLP(nn.Module): method __init__ (line 446) | def __init__( method forward (line 468) | def forward(self, hidden_states): class IdeficsAttention (line 478) | class IdeficsAttention(nn.Module): method __init__ (line 481) | def __init__( method _shape (line 553) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 560) | def forward( class IdeficsDecoderLayer (line 687) | class IdeficsDecoderLayer(nn.Module): method __init__ (line 688) | def __init__(self, layer_id: int, config: IdeficsConfig, weights): method forward (line 715) | def forward( class IdeficsGatedCrossAttentionLayer (line 774) | class IdeficsGatedCrossAttentionLayer(nn.Module): method __init__ (line 775) | def __init__(self, layer_id, config: IdeficsConfig, weights): method forward (line 813) | def forward( class IdeficsPreTrainedModel (line 908) | class IdeficsPreTrainedModel(PreTrainedModel): class IdeficsModel (line 999) | class IdeficsModel(IdeficsPreTrainedModel): method __init__ (line 1007) | def __init__(self, config: IdeficsConfig, weights): method _prepare_decoder_attention_mask (line 1091) | def _prepare_decoder_attention_mask( method forward (line 1119) | def forward( class IdeficsForVisionText2Text (line 1417) | class IdeficsForVisionText2Text(IdeficsPreTrainedModel): method __init__ (line 1418) | def __init__( method forward (line 1434) | def forward( method prepare_inputs_for_generation (line 1525) | def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): method _expand_inputs_for_generation (line 1533) | def _expand_inputs_for_generation( method _update_model_kwargs_for_generation (line 1540) | def _update_model_kwargs_for_generation( method _reorder_cache (line 1548) | def _reorder_cache(past, beam_idx): FILE: server/text_generation_server/models/custom_modeling/idefics_perceiver.py class IdeficsPerceiverResampler (line 52) | class IdeficsPerceiverResampler(nn.Module): method __init__ (line 53) | def __init__( method forward (line 127) | def forward(self, context: torch.Tensor) -> torch.Tensor: class IdeficsPerceiverAttention (line 140) | class IdeficsPerceiverAttention(nn.Module): method __init__ (line 141) | def __init__( method forward (line 194) | def forward(self, context: torch.Tensor, latents: torch.Tensor) -> tor... class IdeficsMLP (line 242) | class IdeficsMLP(nn.Module): method __init__ (line 243) | def __init__( method forward (line 268) | def forward( FILE: server/text_generation_server/models/custom_modeling/idefics_processing.py function incremental_to_binary_attention_mask (line 41) | def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1): function image_attention_mask_for_packed_input_ids (line 57) | def image_attention_mask_for_packed_input_ids(input_ids, tokenizer): function is_url (line 104) | def is_url(string): function is_image (line 113) | def is_image(string): class IdeficsProcessor (line 119) | class IdeficsProcessor(ProcessorMixin): method __init__ (line 138) | def __init__( method __call__ (line 168) | def __call__( method batch_decode (line 425) | def batch_decode(self, *args, **kwargs): method decode (line 432) | def decode(self, *args, **kwargs): method model_input_names (line 440) | def model_input_names(self): FILE: server/text_generation_server/models/custom_modeling/idefics_vision.py class IdeficsVisionModelOutput (line 41) | class IdeficsVisionModelOutput(ModelOutput): class IdeficsVisionEmbeddings (line 70) | class IdeficsVisionEmbeddings(nn.Module): method __init__ (line 71) | def __init__(self, prefix, config, weights): method forward (line 100) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class IdeficsVisionAttention (line 115) | class IdeficsVisionAttention(nn.Module): method __init__ (line 118) | def __init__(self, prefix, config, weights): method _shape (line 153) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 160) | def forward( class IdeficsVisionMLP (line 252) | class IdeficsVisionMLP(nn.Module): method __init__ (line 253) | def __init__(self, prefix, config, weights): method forward (line 264) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class IdeficsVisionEncoderLayer (line 272) | class IdeficsVisionEncoderLayer(nn.Module): method __init__ (line 273) | def __init__(self, prefix, config, weights): method forward (line 289) | def forward( class IdeficsVisionEncoder (line 331) | class IdeficsVisionEncoder(nn.Module): method __init__ (line 340) | def __init__(self, prefix, config, weights): method forward (line 355) | def forward( class IdeficsVisionTransformer (line 458) | class IdeficsVisionTransformer(nn.Module): method __init__ (line 459) | def __init__(self, prefix, config, weights): method forward (line 479) | def forward( FILE: server/text_generation_server/models/custom_modeling/llava_next.py function get_anyres_image_grid_shape (line 37) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): function unpad_image (line 60) | def unpad_image(tensor, original_size): class LlavaNextMultiModalProjector (line 94) | class LlavaNextMultiModalProjector(nn.Module): method __init__ (line 95) | def __init__(self, prefix, config, weights): method forward (line 106) | def forward(self, image_features): class LlavaNextForConditionalGeneration (line 113) | class LlavaNextForConditionalGeneration(nn.Module): method __init__ (line 114) | def __init__(self, prefix, config, weights): method _merge_input_ids_with_image_features (line 149) | def _merge_input_ids_with_image_features( method get_vision_embeds (line 166) | def get_vision_embeds( method get_inputs_embeds (line 254) | def get_inputs_embeds( method forward (line 271) | def forward( FILE: server/text_generation_server/models/custom_modeling/mamba_modeling.py class InferenceParams (line 25) | class InferenceParams: class MambaConfig (line 36) | class MambaConfig(PretrainedConfig): method __init__ (line 37) | def __init__( class MambaBlock (line 71) | class MambaBlock(nn.Module): method __init__ (line 72) | def __init__(self, prefix, config, weights, layer_id): method forward (line 94) | def forward(self, hidden_states: torch.Tensor, inference_params=None): method step (line 140) | def step(self, hidden_states, conv_state, ssm_state): class ResidualBlock (line 170) | class ResidualBlock(nn.Module): method __init__ (line 171) | def __init__(self, prefix, config, weights, layer_id): method forward (line 180) | def forward( class MambaModel (line 195) | class MambaModel(nn.Module): method __init__ (line 196) | def __init__(self, config, weights): method forward (line 218) | def forward( FILE: server/text_generation_server/models/custom_modeling/mllama.py function _prepare_aspect_ratio_attention_mask (line 46) | def _prepare_aspect_ratio_attention_mask( function _prepare_4d_causal_attention_mask_with_cache_position (line 78) | def _prepare_4d_causal_attention_mask_with_cache_position( function _prepare_cross_attention_mask (line 142) | def _prepare_cross_attention_mask( class MllamaVisionMLP (line 175) | class MllamaVisionMLP(nn.Module): method __init__ (line 176) | def __init__(self, *, prefix, config, weights): method forward (line 187) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class MllamaVisionSdpaAttention (line 194) | class MllamaVisionSdpaAttention(nn.Module): method __init__ (line 195) | def __init__(self, *, prefix, config, weights): method forward (line 216) | def forward( class MllamaVisionEncoderLayer (line 253) | class MllamaVisionEncoderLayer(nn.Module): method __init__ (line 254) | def __init__(self, *, prefix, config, weights, is_gated: bool): method forward (line 285) | def forward( class MllamaVisionEncoder (line 306) | class MllamaVisionEncoder(nn.Module): method __init__ (line 307) | def __init__(self, *, prefix, config, weights, is_gated: bool, num_lay... method forward (line 320) | def forward( class MllamaPrecomputedAspectRatioEmbedding (line 338) | class MllamaPrecomputedAspectRatioEmbedding(nn.Module): method __init__ (line 339) | def __init__(self, *, prefix, config, weights): method forward (line 352) | def forward( class MllamaPrecomputedPositionEmbedding (line 365) | class MllamaPrecomputedPositionEmbedding(nn.Module): method __init__ (line 366) | def __init__(self, *, prefix, config, weights): method forward (line 387) | def forward( class MllamaVisionModel (line 407) | class MllamaVisionModel(nn.Module): method __init__ (line 408) | def __init__(self, *, prefix, config, weights): method apply_class_embedding (line 484) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T... method forward (line 490) | def forward( class MllamaTextCrossAttention (line 622) | class MllamaTextCrossAttention(nn.Module): method __init__ (line 625) | def __init__(self, *, prefix, config, weights, layer_idx): method forward (line 674) | def forward( class MllamaTextMLP (line 778) | class MllamaTextMLP(nn.Module): method __init__ (line 779) | def __init__(self, *, prefix, config, weights): method forward (line 801) | def forward(self, x): class FlashLlamaCrossLayer (line 811) | class FlashLlamaCrossLayer(torch.nn.Module): method __init__ (line 814) | def __init__(self, *, prefix, config, weights, index) -> None: method forward (line 842) | def forward( class MllamaTextRMSNorm (line 889) | class MllamaTextRMSNorm(nn.Module): method __init__ (line 890) | def __init__(self, weight, eps): method load (line 896) | def load(cls, *, prefix, weights, eps): method forward (line 902) | def forward(self, hidden_states): method extra_repr (line 909) | def extra_repr(self): class MllamaForConditionalGeneration (line 913) | class MllamaForConditionalGeneration(nn.Module): method __init__ (line 914) | def __init__(self, prefix, config, weights): method vision_forward (line 935) | def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_... method forward (line 953) | def forward( FILE: server/text_generation_server/models/custom_modeling/mpt_modeling.py function load_col (line 30) | def load_col(config, prefix, weights, bias): function _reset_is_causal (line 81) | def _reset_is_causal( function scaled_multihead_dot_product_attention (line 94) | def scaled_multihead_dot_product_attention( function check_valid_inputs (line 167) | def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bflo... function flash_attn_fn (line 179) | def flash_attn_fn( function triton_flash_attn_fn (line 254) | def triton_flash_attn_fn( class MultiheadAttention (line 325) | class MultiheadAttention(nn.Module): method __init__ (line 332) | def __init__( method forward (line 386) | def forward( class MultiQueryAttention (line 423) | class MultiQueryAttention(nn.Module): method __init__ (line 430) | def __init__(self, config, prefix, weights, verbose=False): method forward (line 479) | def forward( function attn_bias_shape (line 517) | def attn_bias_shape( function build_attn_bias (line 534) | def build_attn_bias( function gen_slopes (line 557) | def gen_slopes(n_heads, alibi_bias_max=8, device=None): function build_alibi_bias (line 567) | def build_alibi_bias( class MPTMLP (line 591) | class MPTMLP(nn.Module): method __init__ (line 592) | def __init__(self, config, prefix, weights): method forward (line 608) | def forward(self, x): class MPTBlock (line 612) | class MPTBlock(nn.Module): method __init__ (line 613) | def __init__(self, config, prefix, weights): method forward (line 640) | def forward( function _cast_if_autocast_enabled (line 663) | def _cast_if_autocast_enabled(tensor): class LPLayerNorm (line 675) | class LPLayerNorm(torch.nn.LayerNorm): method __init__ (line 676) | def __init__( method forward (line 701) | def forward(self, x): function rms_norm (line 722) | def rms_norm(x, weight=None, eps=1e-05): class RMSNorm (line 729) | class RMSNorm(torch.nn.Module): method __init__ (line 730) | def __init__( method forward (line 742) | def forward(self, x): class LPRMSNorm (line 746) | class LPRMSNorm(RMSNorm): method __init__ (line 747) | def __init__( method forward (line 758) | def forward(self, x): class MPTPreTrainedModel (line 779) | class MPTPreTrainedModel(PreTrainedModel): class MPTModel (line 784) | class MPTModel(MPTPreTrainedModel): method __init__ (line 785) | def __init__(self, prefix: str, config, weights): method _attn_bias (line 859) | def _attn_bias( method _apply_prefix_mask (line 917) | def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: tor... method _apply_sequence_id (line 940) | def _apply_sequence_id( method forward (line 956) | def forward( class MPTForCausalLM (line 1088) | class MPTForCausalLM(MPTPreTrainedModel): method __init__ (line 1089) | def __init__(self, prefix: str, config, weights): method forward (line 1115) | def forward( method prepare_inputs_for_generation (line 1168) | def prepare_inputs_for_generation( method _reorder_cache (line 1202) | def _reorder_cache(past_key_values, beam_idx): FILE: server/text_generation_server/models/custom_modeling/neox_modeling.py function make_causal_mask (line 53) | def make_causal_mask( function expand_mask (line 73) | def expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor: function prepare_attn_mask (line 84) | def prepare_attn_mask( class GPTNeoXPreTrainedModel (line 111) | class GPTNeoXPreTrainedModel(PreTrainedModel): class GPTNeoXAttention (line 118) | class GPTNeoXAttention(nn.Module): method __init__ (line 119) | def __init__(self, config, prefix, weights): method forward (line 161) | def forward( method _split_heads (line 239) | def _split_heads(cls, tensor, num_attention_heads, attn_head_size): method _merge_heads (line 252) | def _merge_heads(cls, tensor, num_attention_heads, attn_head_size): method _attn (line 265) | def _attn(self, query, key, value, attention_mask=None, head_mask=None): class RotaryEmbedding (line 310) | class RotaryEmbedding(torch.nn.Module): method __init__ (line 311) | def __init__(self, dim, max_position_embeddings, base=10000, device=No... method rotate_half (line 324) | def rotate_half(x): method _create_cos_sin (line 331) | def _create_cos_sin(inv_freq, max_position_embeddings, dtype, device): method forward (line 340) | def forward(self, q, k, position_ids, seq_len=None): function rotary_forward (line 356) | def rotary_forward(q, k, cos, sin, position_ids): class GPTNeoXMLP (line 371) | class GPTNeoXMLP(nn.Module): method __init__ (line 372) | def __init__(self, config, prefix, weights): method forward (line 387) | def forward(self, hidden_states): class GPTNeoXLayer (line 394) | class GPTNeoXLayer(nn.Module): method __init__ (line 395) | def __init__(self, layer_id, prefix: str, config, weights): method forward (line 415) | def forward( class GPTNeoXModel (line 462) | class GPTNeoXModel(GPTNeoXPreTrainedModel): method __init__ (line 463) | def __init__(self, prefix: str, config, weights): method forward (line 485) | def forward( class GPTNeoxForCausalLM (line 628) | class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel): method __init__ (line 631) | def __init__(self, prefix: str, config, weights): method forward (line 644) | def forward( method prepare_inputs_for_generation (line 744) | def prepare_inputs_for_generation( method _reorder_cache (line 786) | def _reorder_cache(self, past_key_values, beam_idx): FILE: server/text_generation_server/models/custom_modeling/opt_modeling.py function _make_causal_mask (line 43) | def _make_causal_mask( function _expand_mask (line 77) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option... class OPTLearnedPositionalEmbedding (line 93) | class OPTLearnedPositionalEmbedding(nn.Module): method __init__ (line 98) | def __init__(self, prefix: str, weights): method forward (line 107) | def forward( class OPTAttention (line 124) | class OPTAttention(nn.Module): method __init__ (line 127) | def __init__( method _shape (line 175) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 182) | def forward( class OPTDecoderLayer (line 316) | class OPTDecoderLayer(nn.Module): method __init__ (line 317) | def __init__(self, layer_id: int, prefix: str, config: OPTConfig, weig... method forward (line 345) | def forward( class OPTPreTrainedModel (line 429) | class OPTPreTrainedModel(PreTrainedModel): class OPTDecoder (line 433) | class OPTDecoder(OPTPreTrainedModel): method __init__ (line 434) | def __init__(self, prefix: str, config: OPTConfig, weights): method _prepare_decoder_attention_mask (line 492) | def _prepare_decoder_attention_mask( method forward (line 519) | def forward( class OPTModel (line 703) | class OPTModel(OPTPreTrainedModel): method __init__ (line 704) | def __init__(self, prefix: str, config: OPTConfig, weights): method forward (line 709) | def forward( class OPTForCausalLM (line 760) | class OPTForCausalLM(OPTPreTrainedModel): method __init__ (line 761) | def __init__(self, prefix, config, weights): method forward (line 774) | def forward( method prepare_inputs_for_generation (line 829) | def prepare_inputs_for_generation( method _reorder_cache (line 856) | def _reorder_cache(past_key_values, beam_idx): FILE: server/text_generation_server/models/custom_modeling/phi_modeling.py class PhiConfig (line 22) | class PhiConfig(PretrainedConfig): method __init__ (line 23) | def __init__( class RotaryEmbedding (line 67) | class RotaryEmbedding(nn.Module): method __init__ (line 68) | def __init__(self, dim, max_seq_len): method apply_rotary_emb_qkv (line 78) | def apply_rotary_emb_qkv(self, qkv, seqlen_offset): class PhiCausalLMHead (line 115) | class PhiCausalLMHead(nn.Module): method __init__ (line 116) | def __init__(self, config, weights): method forward (line 127) | def forward(self, hidden_states): class PhiMHA (line 134) | class PhiMHA(nn.Module): method __init__ (line 135) | def __init__(self, prefix, config, weights): method forward (line 155) | def forward( class PhiMLP (line 196) | class PhiMLP(nn.Module): method __init__ (line 197) | def __init__(self, prefix, config, weights): method forward (line 215) | def forward(self, hidden_states): class PhiBlock (line 223) | class PhiBlock(nn.Module): method __init__ (line 224) | def __init__(self, layer_id, config, weights): method forward (line 233) | def forward( class PhiModel (line 250) | class PhiModel(nn.Module): method __init__ (line 251) | def __init__(self, prefix: str, config, weights): method forward (line 265) | def forward( class PhiForCausalLM (line 291) | class PhiForCausalLM(torch.nn.Module): method __init__ (line 292) | def __init__(self, prefix: str, config, weights): method forward (line 303) | def forward( FILE: server/text_generation_server/models/custom_modeling/qwen2_5_vl.py class Qwen2_5_VLVideosProcessorKwargs (line 63) | class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): class Qwen2_5_VLProcessorKwargs (line 67) | class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): class Qwen2_5_VLProcessor (line 77) | class Qwen2_5_VLProcessor(ProcessorMixin): method __init__ (line 97) | def __init__( method __call__ (line 112) | def __call__( method batch_decode (line 232) | def batch_decode(self, *args, **kwargs): method decode (line 239) | def decode(self, *args, **kwargs): method post_process_image_text_to_text (line 246) | def post_process_image_text_to_text(self, generated_outputs): method model_input_names (line 265) | def model_input_names(self): class Qwen2_5_VLVisionConfig (line 275) | class Qwen2_5_VLVisionConfig(PretrainedConfig): method __init__ (line 279) | def __init__( class Qwen2_5_VLConfig (line 315) | class Qwen2_5_VLConfig(PretrainedConfig): method __init__ (line 316) | def __init__( function rotate_half (line 379) | def rotate_half(x): function apply_rotary_pos_emb_vision (line 386) | def apply_rotary_pos_emb_vision( class Qwen2_5VLAttention (line 400) | class Qwen2_5VLAttention(nn.Module): method __init__ (line 401) | def __init__(self, *, prefix, config, weights): method forward (line 425) | def forward( class Qwen2_5VLVisionMLP (line 529) | class Qwen2_5VLVisionMLP(nn.Module): method __init__ (line 530) | def __init__(self, *, prefix, config, weights): method forward (line 548) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Qwen2_5VLVisionBlock (line 556) | class Qwen2_5VLVisionBlock(nn.Module): method __init__ (line 557) | def __init__(self, prefix, config, weights): method forward (line 580) | def forward( class Qwen2_5VLPatchMerger (line 592) | class Qwen2_5VLPatchMerger(nn.Module): method __init__ (line 593) | def __init__(self, *, prefix, config, weights): method forward (line 608) | def forward(self, hidden_states) -> torch.Tensor: class Qwen2_5VisionModel (line 617) | class Qwen2_5VisionModel(nn.Module): method __init__ (line 618) | def __init__(self, *, prefix, config, weights): method apply_class_embedding (line 665) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T... method get_window_index (line 671) | def get_window_index(self, grid_thw): method forward (line 718) | def forward( class Qwen2_5VLForConditionalGeneration (line 817) | class Qwen2_5VLForConditionalGeneration(nn.Module): method __init__ (line 818) | def __init__(self, prefix, config, weights): method get_position_ids (line 867) | def get_position_ids( method get_vision_embeds (line 944) | def get_vision_embeds( method get_inputs_embeds (line 954) | def get_inputs_embeds( method forward (line 967) | def forward( FILE: server/text_generation_server/models/custom_modeling/qwen2_vl.py function rotate_half (line 50) | def rotate_half(x): function apply_rotary_pos_emb_vision (line 57) | def apply_rotary_pos_emb_vision( class Qwen2VLAttention (line 71) | class Qwen2VLAttention(nn.Module): method __init__ (line 72) | def __init__(self, *, prefix, config, weights): method forward (line 95) | def forward( class Qwen2VLVisionMLP (line 199) | class Qwen2VLVisionMLP(nn.Module): method __init__ (line 200) | def __init__(self, *, prefix, config, weights): method forward (line 210) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Qwen2VLVisionBlock (line 217) | class Qwen2VLVisionBlock(nn.Module): method __init__ (line 218) | def __init__(self, prefix, config, weights): method forward (line 241) | def forward( class Qwen2VLPatchMerger (line 252) | class Qwen2VLPatchMerger(nn.Module): method __init__ (line 253) | def __init__(self, *, prefix, config, weights): method forward (line 268) | def forward(self, hidden_states) -> torch.Tensor: class Qwen2VisionModel (line 277) | class Qwen2VisionModel(nn.Module): method __init__ (line 278) | def __init__(self, *, prefix, config, weights): method apply_class_embedding (line 320) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T... method forward (line 326) | def forward( class Qwen2VLForConditionalGeneration (line 393) | class Qwen2VLForConditionalGeneration(nn.Module): method __init__ (line 394) | def __init__(self, prefix, config, weights): method get_position_ids (line 448) | def get_position_ids( method get_vision_embeds (line 522) | def get_vision_embeds( method get_inputs_embeds (line 532) | def get_inputs_embeds( method forward (line 545) | def forward( FILE: server/text_generation_server/models/custom_modeling/siglip.py class SiglipVisionEmbeddings (line 21) | class SiglipVisionEmbeddings(nn.Module): method __init__ (line 22) | def __init__(self, prefix, config: SiglipVisionConfig, weights): method forward (line 52) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class SiglipAttention (line 62) | class SiglipAttention(nn.Module): method __init__ (line 65) | def __init__(self, prefix, config, weights): method _shape (line 95) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 102) | def forward( class SiglipMLP (line 163) | class SiglipMLP(nn.Module): method __init__ (line 164) | def __init__(self, prefix, config, weights): method forward (line 175) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class SiglipEncoderLayer (line 182) | class SiglipEncoderLayer(nn.Module): method __init__ (line 183) | def __init__(self, prefix, config: SiglipConfig, weights): method forward (line 197) | def forward( class SiglipMultiheadAttentionPoolingHead (line 216) | class SiglipMultiheadAttentionPoolingHead(nn.Module): method __init__ (line 219) | def __init__(self, prefix, config: SiglipVisionConfig, weights): method forward (line 229) | def forward(self, hidden_state): function _trunc_normal_ (line 242) | def _trunc_normal_(tensor, mean, std, a, b): function trunc_normal_tf_ (line 278) | def trunc_normal_tf_( function variance_scaling_ (line 308) | def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="no... function lecun_normal_ (line 333) | def lecun_normal_(tensor): function default_flax_embed_init (line 337) | def default_flax_embed_init(tensor): class SiglipEncoder (line 341) | class SiglipEncoder(nn.Module): method __init__ (line 350) | def __init__(self, prefix, config: SiglipConfig, weights): method forward (line 362) | def forward( class SiglipVisionTransformer (line 377) | class SiglipVisionTransformer(nn.Module): method __init__ (line 378) | def __init__(self, prefix, config: SiglipVisionConfig, weights): method forward (line 389) | def forward( FILE: server/text_generation_server/models/custom_modeling/t5_modeling.py class PartialTPEmbedding (line 58) | class PartialTPEmbedding(nn.Module): method __init__ (line 59) | def __init__(self, prefix: str, weights): method forward (line 64) | def forward(self, input: torch.Tensor) -> torch.Tensor: function layer_norm (line 69) | def layer_norm(hidden_states, weight, epsilon): class T5LayerNorm (line 85) | class T5LayerNorm(nn.Module): method __init__ (line 86) | def __init__(self, prefix, weights, eps=1e-6): method forward (line 95) | def forward(self, hidden_states): class T5DenseActDense (line 117) | class T5DenseActDense(nn.Module): method __init__ (line 118) | def __init__(self, config: T5Config, prefix, weights): method forward (line 146) | def forward(self, hidden_states): class T5DenseGatedActDense (line 159) | class T5DenseGatedActDense(nn.Module): method __init__ (line 160) | def __init__(self, config: T5Config, prefix, weights): method forward (line 190) | def forward(self, hidden_states): class T5LayerFF (line 204) | class T5LayerFF(nn.Module): method __init__ (line 205) | def __init__(self, config: T5Config, prefix, weights): method forward (line 223) | def forward(self, hidden_states): class T5Attention (line 230) | class T5Attention(nn.Module): method __init__ (line 231) | def __init__( method _relative_position_bucket (line 274) | def _relative_position_bucket( method compute_bias (line 328) | def compute_bias(self, query_length, key_length, device=None): method forward (line 355) | def forward( class T5LayerSelfAttention (line 504) | class T5LayerSelfAttention(nn.Module): method __init__ (line 505) | def __init__(self, config, prefix, weights, has_relative_attention_bia... method forward (line 520) | def forward( class T5LayerCrossAttention (line 547) | class T5LayerCrossAttention(nn.Module): method __init__ (line 548) | def __init__(self, config, prefix, weights): method forward (line 563) | def forward( class T5Block (line 594) | class T5Block(nn.Module): method __init__ (line 595) | def __init__(self, config, prefix, weights, has_relative_attention_bia... method forward (line 621) | def forward( class T5PreTrainedModel (line 746) | class T5PreTrainedModel(PreTrainedModel): method _shift_right (line 754) | def _shift_right(self, input_ids): class T5Stack (line 786) | class T5Stack(T5PreTrainedModel): method __init__ (line 787) | def __init__(self, config, prefix, weights, embed_tokens): method forward (line 811) | def forward( class T5ForConditionalGeneration (line 1015) | class T5ForConditionalGeneration(T5PreTrainedModel): method __init__ (line 1016) | def __init__(self, config: T5Config, weights): method forward (line 1056) | def forward( method prepare_inputs_for_generation (line 1167) | def prepare_inputs_for_generation( method prepare_decoder_input_ids_from_labels (line 1196) | def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): method _reorder_cache (line 1199) | def _reorder_cache(self, past_key_values, beam_idx): FILE: server/text_generation_server/models/custom_modeling/vlm.py function load_text_model (line 1) | def load_text_model(prefix, config, weights, name=None): function load_vision_model (line 43) | def load_vision_model(prefix, config, weights): FILE: server/text_generation_server/models/flash_causal_lm.py function small_power_of_2 (line 87) | def small_power_of_2(n: int): function init_cpu_threads_env (line 91) | def init_cpu_threads_env(rank_id: int, world_size: int): class FlashCausalLMBatch (line 126) | class FlashCausalLMBatch(Batch): method to_pb (line 210) | def to_pb(self) -> generate_pb2.CachedBatch: method batch_tokenized_inputs (line 224) | def batch_tokenized_inputs( method from_tokenized (line 244) | def from_tokenized( method from_pb (line 473) | def from_pb( method filter (line 485) | def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": method concatenate (line 691) | def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCau... method prepare_for_prefill (line 948) | def prepare_for_prefill(self): method __len__ (line 1177) | def __len__(self): class FlashCausalLM (line 1193) | class FlashCausalLM(Model): method __init__ (line 1194) | def __init__( method batch_type (line 1344) | def batch_type(self) -> Type[FlashCausalLMBatch]: method init_kv_cache (line 1347) | def init_kv_cache( method cuda_graph_warmup (line 1369) | def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): method warmup (line 1507) | def warmup( method tunableop_warmup (line 1686) | def tunableop_warmup(self, seqlen: int, max_bt: int): method forward (line 1726) | def forward( method generate_token (line 1891) | def generate_token( method _forward_context (line 2432) | def _forward_context( FILE: server/text_generation_server/models/galactica.py function _insert_split_marker (line 29) | def _insert_split_marker(m: re.Match): function escape_custom_split_sequence (line 46) | def escape_custom_split_sequence(text): class GalacticaCausalLMBatch (line 63) | class GalacticaCausalLMBatch(CausalLMBatch): method from_pb (line 65) | def from_pb( FILE: server/text_generation_server/models/globals.py function set_adapter_to_index (line 70) | def set_adapter_to_index(adapter_to_index: Dict[str, int]): function get_adapter_to_index (line 75) | def get_adapter_to_index(): FILE: server/text_generation_server/models/idefics_causal_lm.py class IdeficsCausalLMBatch (line 44) | class IdeficsCausalLMBatch(Batch): method to_pb (line 80) | def to_pb(self) -> generate_pb2.CachedBatch: method from_pb (line 90) | def from_pb( method from_pb_processor (line 100) | def from_pb_processor( method filter (line 231) | def filter(self, request_ids: List[int]) -> Optional["IdeficsCausalLMB... method concatenate (line 347) | def concatenate( method __len__ (line 584) | def __len__(self): class IdeficsCausalLM (line 588) | class IdeficsCausalLM(Model): method __init__ (line 589) | def __init__( method batch_type (line 672) | def batch_type(self) -> Type[IdeficsCausalLMBatch]: method forward (line 675) | def forward( method generate_token (line 708) | def generate_token( FILE: server/text_generation_server/models/mamba.py function new_inference_params (line 36) | def new_inference_params( class MambaBatch (line 78) | class MambaBatch(Batch): method to_pb (line 113) | def to_pb(self) -> generate_pb2.CachedBatch: method from_pb (line 123) | def from_pb( method filter (line 199) | def filter(self, request_ids: List[int]) -> Optional["MambaBatch"]: method concatenate (line 281) | def concatenate(cls, batches: List["MambaBatch"]) -> "MambaBatch": method __len__ (line 402) | def __len__(self): class Mamba (line 406) | class Mamba(Model): method __init__ (line 407) | def __init__( method batch_type (line 475) | def batch_type(self) -> Type[MambaBatch]: method warmup (line 478) | def warmup( method cuda_graph_warmup (line 501) | def cuda_graph_warmup(self, batch_size: int): method tunableop_warmup (line 544) | def tunableop_warmup(self, batch_size: int, seqlen: int): method forward (line 568) | def forward( method generate_token (line 616) | def generate_token(self, batch) -> Tuple[List[Any], Optional[Any], Tup... FILE: server/text_generation_server/models/metadata_kernels.py function has_triton (line 18) | def has_triton(): function block_tables_to_padded (line 30) | def block_tables_to_padded( function block_tables_to_ragged (line 51) | def block_tables_to_ragged( function copy_next_input_ids_inplace (line 99) | def copy_next_input_ids_inplace( function prepare_position_slot_ids (line 126) | def prepare_position_slot_ids( function slots_filtering (line 145) | def slots_filtering( function triton_slots_filtering (line 164) | def triton_slots_filtering( function triton_block_tables_to_padded (line 193) | def triton_block_tables_to_padded( function triton_block_tables_to_ragged (line 224) | def triton_block_tables_to_ragged( function triton_copy_next_input_ids_inplace (line 255) | def triton_copy_next_input_ids_inplace( function triton_prepare_position_slot_ids (line 311) | def triton_prepare_position_slot_ids( FILE: server/text_generation_server/models/mllama_causal_lm.py class MllamaCausalLMBatch (line 26) | class MllamaCausalLMBatch(VlmCausalLMBatch): method prepare_for_prefill (line 32) | def prepare_for_prefill(self): method concatenate (line 37) | def concatenate(cls, batches): method filter (line 60) | def filter(self, request_ids: List[int]): method batch_tokenized_inputs (line 92) | def batch_tokenized_inputs( method from_pb_processor (line 158) | def from_pb_processor( class MllamaCausalLM (line 202) | class MllamaCausalLM(VlmCausalLM): method set_inputs_embeds (line 203) | def set_inputs_embeds(self, batch): method cuda_graph_warmup (line 207) | def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): method forward (line 210) | def forward( FILE: server/text_generation_server/models/model.py class Model (line 29) | class Model(ABC): method __init__ (line 30) | def __init__( method info (line 105) | def info(self) -> InfoResponse: method batch_type (line 123) | def batch_type(self) -> Type[B]: method generate_token (line 127) | def generate_token( method warmup (line 132) | def warmup( method decode_token (line 144) | def decode_token( method check_initialized (line 173) | def check_initialized(self): FILE: server/text_generation_server/models/seq2seq_lm.py class Seq2SeqLMBatch (line 36) | class Seq2SeqLMBatch(Batch): method to_pb (line 76) | def to_pb(self) -> generate_pb2.CachedBatch: method from_pb (line 87) | def from_pb( method filter (line 181) | def filter(self, request_ids: List[int]) -> Optional["Seq2SeqLMBatch"]: method concatenate (line 296) | def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBat... method __len__ (line 538) | def __len__(self): class Seq2SeqLM (line 542) | class Seq2SeqLM(Model): method __init__ (line 543) | def __init__( method fallback (line 621) | def fallback( method batch_type (line 685) | def batch_type(self) -> Type[Seq2SeqLMBatch]: method forward (line 688) | def forward( method generate_token (line 726) | def generate_token( FILE: server/text_generation_server/models/transformers_flash_causal_lm.py function tgi_flash_attention_forward (line 20) | def tgi_flash_attention_forward( class TransformersFlashCausalLM (line 96) | class TransformersFlashCausalLM(FlashCausalLM): method __init__ (line 97) | def __init__( method fallback (line 237) | def fallback( method _model_forward (line 255) | def _model_forward( FILE: server/text_generation_server/models/transformers_flash_vlm.py function tgi_flash_attention_forward (line 37) | def tgi_flash_attention_forward( class TransformersFlashVlmCausalLM (line 150) | class TransformersFlashVlmCausalLM(VlmCausalLM): method __init__ (line 151) | def __init__( method get_position_ids (line 319) | def get_position_ids(self, input_ids, image_grid_thw, position_ids): method pre_process_inputs (line 322) | def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): method post_process_outputs (line 328) | def post_process_outputs(self, logits, lm_head_indices): method fallback (line 332) | def fallback( method _model_forward (line 358) | def _model_forward( class TransformersQwen2VlmCausalLM (line 418) | class TransformersQwen2VlmCausalLM(TransformersFlashVlmCausalLM): method get_position_ids (line 419) | def get_position_ids(self, input_ids: torch.Tensor, image_grid_thw: to... method post_process_outputs (line 492) | def post_process_outputs(self, logits, lm_head_indices): method pre_process_inputs (line 495) | def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): class TransformersGemma3VlmCausalLM (line 501) | class TransformersGemma3VlmCausalLM(TransformersFlashVlmCausalLM): method get_attention_mask (line 502) | def get_attention_mask(self, input_ids, cu_seqlen_prefill): method pre_process_inputs (line 557) | def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): class TransformersLlama4VlmCausalLM (line 573) | class TransformersLlama4VlmCausalLM(TransformersFlashVlmCausalLM): method pre_process_inputs (line 574) | def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): method get_vision_embeds (line 580) | def get_vision_embeds( method get_inputs_embeds (line 598) | def get_inputs_embeds(self, input_ids, vision_embeds=None): FILE: server/text_generation_server/models/types.py class Batch (line 13) | class Batch(ABC): method to_pb (line 15) | def to_pb(self) -> generate_pb2.CachedBatch: method from_pb (line 20) | def from_pb( method filter (line 30) | def filter(self, request_ids: List[int]) -> "Batch": method concatenate (line 35) | def concatenate(cls, batches: List["Batch"]) -> "Batch": method __len__ (line 39) | def __len__(self): class GeneratedText (line 44) | class GeneratedText: method to_pb (line 50) | def to_pb(self) -> generate_pb2.GeneratedText: class Tokens (line 60) | class Tokens: method to_pb (line 66) | def to_pb(self) -> generate_pb2.Tokens: method __len__ (line 74) | def __len__(self): method __add__ (line 77) | def __add__(self, other: "Tokens") -> "Tokens": class Generation (line 87) | class Generation: method to_pb (line 95) | def to_pb(self) -> generate_pb2.Generation: FILE: server/text_generation_server/models/vlm_causal_lm.py function prompt_split_image_llama4 (line 33) | def prompt_split_image_llama4(aspect_ratio, num_patches_per_chunk): function _prompt_split_image (line 61) | def _prompt_split_image( function get_anyres_image_grid_shape (line 90) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): function image_text_replacement (line 113) | def image_text_replacement(processor, image_input, config) -> str: function image_text_replacement_fixup (line 186) | def image_text_replacement_fixup(config, text: str) -> str: function preprocess_text (line 194) | def preprocess_text(config, text: str) -> str: function preprocess_image (line 200) | def preprocess_image(config, img): function get_unpadded_features (line 215) | def get_unpadded_features( function get_number_of_features (line 242) | def get_number_of_features(height: int, width: int, config) -> int: function scatter_image_embeds (line 269) | def scatter_image_embeds( function gather_image_embeds (line 283) | def gather_image_embeds( class ImagePositions (line 293) | class ImagePositions: class VlmCausalLMBatch (line 301) | class VlmCausalLMBatch(FlashCausalLMBatch): method concatenate (line 315) | def concatenate(cls, batches): method filter (line 348) | def filter(self, request_ids: List[int]): method batch_tokenized_inputs (line 379) | def batch_tokenized_inputs( method get_image_positions (line 457) | def get_image_positions( method from_pb_processor (line 528) | def from_pb_processor( method prepare_for_prefill (line 551) | def prepare_for_prefill(self): method update_encoder_cache (line 619) | def update_encoder_cache(self, encoder_outputs, request_id, img_pos): method gather_vision_embeds (line 624) | def gather_vision_embeds(self): method free_encoder_cache (line 687) | def free_encoder_cache(self): class VlmCausalLM (line 696) | class VlmCausalLM(FlashCausalLM): method __init__ (line 697) | def __init__( method batch_type (line 729) | def batch_type(self) -> Type[VlmCausalLMBatch]: method cuda_graph_warmup (line 732) | def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): method get_vision_embeds (line 875) | def get_vision_embeds( method get_inputs_embeds (line 890) | def get_inputs_embeds( method encode_images (line 900) | def encode_images(self, batch): method set_inputs_embeds (line 939) | def set_inputs_embeds(self, batch): method forward (line 953) | def forward( FILE: server/text_generation_server/server.py class SignalHandler (line 41) | class SignalHandler: method __init__ (line 44) | def __init__(self): method set_keep_processing (line 48) | def set_keep_processing(self, value: bool): method exit_gracefully (line 51) | def exit_gracefully(self, signum, frame): class TextGenerationService (line 56) | class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServi... method __init__ (line 57) | def __init__( method Info (line 73) | async def Info(self, request, context): method Health (line 76) | async def Health(self, request, context): method ServiceDiscovery (line 81) | async def ServiceDiscovery(self, request, context): method ClearCache (line 84) | async def ClearCache(self, request, context): method FilterBatch (line 91) | async def FilterBatch(self, request, context): method Warmup (line 100) | async def Warmup(self, request, context): method Prefill (line 151) | async def Prefill(self, request, context): method Decode (line 193) | async def Decode(self, request, context): function serve (line 229) | def serve( FILE: server/text_generation_server/tracing.py class UDSOpenTelemetryAioServerInterceptor (line 16) | class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterce... method __init__ (line 17) | def __init__(self): method _start_span (line 20) | def _start_span(self, handler_call_details, context, set_status_on_exc... function setup_tracing (line 57) | def setup_tracing(otlp_service_name: str, otlp_endpoint: str): FILE: server/text_generation_server/utils/adapter.py class AdapterInfo (line 28) | class AdapterInfo: class AdapterParameters (line 35) | class AdapterParameters: class AdapterSource (line 44) | class AdapterSource: function parse_lora_adapters (line 50) | def parse_lora_adapters(lora_adapters: Optional[str]) -> List[AdapterInfo]: function load_and_merge_adapters (line 71) | def load_and_merge_adapters( class AdapterParametersContainer (line 99) | class AdapterParametersContainer: method __hash__ (line 103) | def __hash__(self) -> int: function _load_and_merge (line 108) | def _load_and_merge( function check_architectures (line 149) | def check_architectures( function load_module_map (line 188) | def load_module_map( function get_attn_weights (line 236) | def get_attn_weights(i, layer): function get_mlp_weights (line 259) | def get_mlp_weights(i, layer): function build_layer_weight_lookup (line 303) | def build_layer_weight_lookup(model): FILE: server/text_generation_server/utils/chunks.py function concat_text_chunks (line 8) | def concat_text_chunks(chunks: Iterable[generate_pb2.InputChunk]) -> str: FILE: server/text_generation_server/utils/convert.py function _remove_duplicate_names (line 12) | def _remove_duplicate_names( function convert_file (line 62) | def convert_file(pt_file: Path, sf_file: Path, discard_names: List[str]): function convert_files (line 96) | def convert_files(pt_files: List[Path], sf_files: List[Path], discard_na... FILE: server/text_generation_server/utils/dist.py class FakeBarrier (line 16) | class FakeBarrier: method wait (line 17) | def wait(self): class FakeGroup (line 21) | class FakeGroup(ProcessGroup): method __init__ (line 22) | def __init__(self, rank, size): method allreduce (line 27) | def allreduce(self, *args, **kwargs): method allgather (line 30) | def allgather(self, inputs, local_tensor, **kwargs): method barrier (line 38) | def barrier(self, *args, **kwargs): method size (line 41) | def size(self): method rank (line 44) | def rank(self): function initialize_torch_distributed (line 48) | def initialize_torch_distributed(): FILE: server/text_generation_server/utils/hub.py function _cached_weight_files (line 21) | def _cached_weight_files( function _weight_hub_files_from_model_info (line 32) | def _weight_hub_files_from_model_info( function _weight_files_from_dir (line 46) | def _weight_files_from_dir(d: Path, extension: str) -> List[str]: function _get_cached_revision_directory (line 62) | def _get_cached_revision_directory( function weight_hub_files (line 97) | def weight_hub_files( function try_to_load_from_cache (line 119) | def try_to_load_from_cache( function weight_files (line 133) | def weight_files( function download_weights (line 188) | def download_weights( FILE: server/text_generation_server/utils/import_utils.py function is_ipex_available (line 9) | def is_ipex_available(): function get_cuda_free_memory (line 13) | def get_cuda_free_memory(device, memory_fraction): function get_xpu_free_memory (line 20) | def get_xpu_free_memory(device, memory_fraction): function get_cpu_free_memory (line 29) | def get_cpu_free_memory(device, memory_fraction): function noop (line 38) | def noop(*args, **kwargs): FILE: server/text_generation_server/utils/kernels.py function load_kernel (line 9) | def load_kernel(*, module: str, repo_id: str): FILE: server/text_generation_server/utils/log.py function log_once (line 6) | def log_once(log, msg: str, master=True): function log_master (line 13) | def log_master(log, msg: str): FILE: server/text_generation_server/utils/logits_process.py class StaticWarper (line 25) | class StaticWarper: method __init__ (line 26) | def __init__( method __call__ (line 50) | def __call__(self, scores): function static_warper (line 79) | def static_warper( class HeterogeneousRepetitionPenaltyLogitsProcessor (line 90) | class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor): method __init__ (line 102) | def __init__(self, penalty: List[float], dtype: torch.dtype, device: t... method __call__ (line 108) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 119) | def filter(self, indices): class FrequencyPenaltyLogitsProcessor (line 127) | class FrequencyPenaltyLogitsProcessor(LogitsProcessor): method __init__ (line 136) | def __init__(self, penalty: float): method __call__ (line 139) | def __call__( class HeterogeneousFrequencyPenaltyLogitsProcessor (line 151) | class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor): method __init__ (line 161) | def __init__(self, penalty: List[float], dtype: torch.dtype, device: t... method __call__ (line 167) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 182) | def filter(self, indices): class HeterogeneousTemperatureLogitsWarper (line 190) | class HeterogeneousTemperatureLogitsWarper: method __init__ (line 201) | def __init__( method __call__ (line 209) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 213) | def filter(self, indices): class HeterogeneousTopPLogitsWarper (line 221) | class HeterogeneousTopPLogitsWarper(LogitsProcessor): method __init__ (line 237) | def __init__( method __call__ (line 252) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 272) | def filter(self, indices): class HeterogeneousTopKLogitsWarper (line 280) | class HeterogeneousTopKLogitsWarper(LogitsProcessor): method __init__ (line 295) | def __init__( method __call__ (line 323) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 344) | def filter(self, indices): class HeterogeneousTypicalLogitsWarper (line 361) | class HeterogeneousTypicalLogitsWarper(LogitsProcessor): method __init__ (line 377) | def __init__( method __call__ (line 399) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 435) | def filter(self, indices): class HeterogeneousProcessorWrapper (line 451) | class HeterogeneousProcessorWrapper(LogitsProcessor): method __init__ (line 459) | def __init__( method __call__ (line 465) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t... method filter (line 470) | def filter(self, indices): class GrammarLogitProcessor (line 482) | class GrammarLogitProcessor(LogitsProcessor): method __init__ (line 486) | def __init__( method __call__ (line 499) | def __call__( method advance (line 513) | def advance(self, next_token_id, fsm_grammar_state): method _advance (line 519) | def _advance(next_token_id, fsm_grammar_state, fsm): method _cached_compile_fsm (line 527) | def _cached_compile_fsm( method _cached_adapt_tokenizer (line 547) | def _cached_adapt_tokenizer(tokenizer): class HeterogeneousGrammarLogitProcessor (line 575) | class HeterogeneousGrammarLogitProcessor(LogitsProcessor): method __init__ (line 576) | def __init__(self, tokenizer, device, grammars, grammar_types): method __call__ (line 589) | def __call__( method advance_batch (line 605) | def advance_batch(self, next_token_ids, fsm_grammar_states): method advance_at_index (line 613) | def advance_at_index(self, next_token_id, fsm_grammar_state, index): method filter (line 620) | def filter(self, indices): FILE: server/text_generation_server/utils/merges/strategies.py class AdapterParameters (line 17) | class AdapterParameters: method __init__ (line 18) | def __init__( function _apply_weights (line 28) | def _apply_weights( class MergeStrategy (line 44) | class MergeStrategy(ABC): method merge (line 45) | def merge( class LinearMerge (line 51) | class LinearMerge(MergeStrategy): method __init__ (line 52) | def __init__(self, **kwargs): method merge (line 55) | def merge( class TiesMerge (line 62) | class TiesMerge(MergeStrategy): method __init__ (line 63) | def __init__(self, density: float, majority_sign_method: str = "total"... method merge (line 67) | def merge( class DareLinearMerge (line 86) | class DareLinearMerge(MergeStrategy): method __init__ (line 87) | def __init__(self, density: float, **kwargs): method merge (line 90) | def merge( class DareTiesMerge (line 102) | class DareTiesMerge(MergeStrategy): method __init__ (line 103) | def __init__(self, density: float, majority_sign_method: str = "total"... method merge (line 107) | def merge( function merge_adapters (line 136) | def merge_adapters( function _validate_lora_configs (line 193) | def _validate_lora_configs(lora_configs: List["LoraConfig"]): function _merge_lora_configs (line 207) | def _merge_lora_configs(lora_configs: List["LoraConfig"]) -> "LoraConfig": FILE: server/text_generation_server/utils/merges/utils.py function magnitude_based_pruning (line 23) | def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> tor... function random_pruning (line 39) | def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) ... function prune (line 56) | def prune( function calculate_majority_sign_mask (line 83) | def calculate_majority_sign_mask( function disjoint_merge (line 105) | def disjoint_merge(task_tensors, majority_sign_mask): FILE: server/text_generation_server/utils/peft.py function download_and_unload_peft (line 10) | def download_and_unload_peft(model_id, revision, trust_remote_code): function download_peft (line 48) | def download_peft( FILE: server/text_generation_server/utils/prefill_chunking.py function set_support_chunking (line 7) | def set_support_chunking(support_chunking: bool): function get_support_chunking (line 12) | def get_support_chunking() -> bool: function set_max_prefill_tokens (line 17) | def set_max_prefill_tokens(max_prefill_tokens: int): function get_max_prefill_tokens (line 22) | def get_max_prefill_tokens() -> int: FILE: server/text_generation_server/utils/quantization.py class _QuantizerConfig (line 16) | class _QuantizerConfig: class _FP8QuantizerConfig (line 28) | class _FP8QuantizerConfig: function _get_config_json (line 32) | def _get_config_json(model_id: str, revision: Optional[str], filename: s... function _get_quantizer_config (line 47) | def _get_quantizer_config(model_id, revision): function get_loader (line 124) | def get_loader( FILE: server/text_generation_server/utils/segments.py function find_segments (line 11) | def find_segments( class SegmentConcatBuilder (line 31) | class SegmentConcatBuilder: method __init__ (line 32) | def __init__(self): method concat (line 36) | def concat(self, adapter_segments: torch.Tensor, segment_indices: List... method build (line 61) | def build(self) -> Tuple[torch.Tensor, List[int]]: FILE: server/text_generation_server/utils/speculate.py function get_speculate (line 4) | def get_speculate() -> int: function set_speculate (line 9) | def set_speculate(speculate: int): FILE: server/text_generation_server/utils/tokens.py class NextTokenChooser (line 24) | class NextTokenChooser: method __init__ (line 25) | def __init__( method __call__ (line 81) | def __call__(self, input_ids, scores): method advance_grammar (line 100) | def advance_grammar(self, next_id: int): method from_pb (line 108) | def from_pb( class StopSequenceCriteria (line 131) | class StopSequenceCriteria: method __init__ (line 132) | def __init__(self, stop_sequence: str): method __call__ (line 136) | def __call__(self, output: str) -> bool: class StoppingCriteria (line 142) | class StoppingCriteria: method __init__ (line 143) | def __init__( method __call__ (line 167) | def __call__(self, last_token: int, last_output: str) -> Tuple[bool, O... method from_pb (line 191) | def from_pb( function create_n_gram_speculation (line 209) | def create_n_gram_speculation( class HeterogeneousNextTokenChooser (line 233) | class HeterogeneousNextTokenChooser: method __init__ (line 234) | def __init__( method __call__ (line 326) | def __call__( method advance_grammar (line 415) | def advance_grammar(self, next_ids: List[int]): method advance_grammar_single (line 423) | def advance_grammar_single(self, grammar_state_index: int, next_id: int): method filter (line 434) | def filter(self, indices): method from_pb (line 477) | def from_pb( class Sampling (line 506) | class Sampling: method __init__ (line 507) | def __init__(self, seed: int, device: str = "cpu"): method __call__ (line 512) | def __call__(self, logits): class Greedy (line 520) | class Greedy: method __call__ (line 521) | def __call__(self, logits): class HeterogeneousSampling (line 525) | class HeterogeneousSampling: method __init__ (line 530) | def __init__(self, do_sample: List[bool], seeds: List[int], device: to... method __call__ (line 543) | def __call__(self, logits): method filter (line 553) | def filter(self, indices): function batch_top_tokens (line 567) | def batch_top_tokens( FILE: server/text_generation_server/utils/watermark.py class WatermarkLogitsProcessor (line 26) | class WatermarkLogitsProcessor(LogitsProcessor): method __init__ (line 27) | def __init__( method _seed_rng (line 40) | def _seed_rng(self, input_ids: Union[List[int], torch.LongTensor]): method _get_greenlist_ids (line 55) | def _get_greenlist_ids( method _calc_greenlist_mask (line 70) | def _calc_greenlist_mask( method _bias_greenlist_logits (line 79) | def _bias_greenlist_logits( method __call__ (line 85) | def __call__( FILE: server/text_generation_server/utils/weights.py class WeightsLoader (line 13) | class WeightsLoader(ABC): method get_weights (line 25) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 32) | def get_weights_col_packed( method get_weights_col (line 52) | def get_weights_col(self, weights: "Weights", prefix: str): method get_multi_weights_col (line 60) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_weights_row (line 68) | def get_weights_row(self, weights: "Weights", prefix: str): class Weight (line 76) | class Weight(ABC): method get_linear (line 81) | def get_linear(self, bias: torch.Tensor): class UnquantizedWeight (line 87) | class UnquantizedWeight(Weight): method get_linear (line 90) | def get_linear(self, bias: torch.Tensor): class DefaultWeightsLoader (line 99) | class DefaultWeightsLoader(WeightsLoader): method __init__ (line 102) | def __init__(self, weight_class: Type[UnquantizedWeight]): method get_weights (line 114) | def get_weights(self, weights: "Weights", prefix: str): method get_weights_col_packed (line 117) | def get_weights_col_packed( method get_multi_weights_col (line 129) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str... method get_weights_row (line 133) | def get_weights_row(self, weights: "Weights", prefix: str): class Weights (line 139) | class Weights: method __init__ (line 140) | def __init__( method _get_handle (line 170) | def _get_handle(self, filename): method get_filename (line 177) | def get_filename(self, tensor_name: str) -> (str, str): method _get_slice (line 194) | def _get_slice(self, tensor_name: str): method has_tensor (line 200) | def has_tensor(self, tensor_name: str): method get_shape (line 207) | def get_shape(self, tensor_name: str): method get_tensor (line 210) | def get_tensor( method get_partial_sharded (line 235) | def get_partial_sharded( method get_sharded (line 268) | def get_sharded(self, tensor_name: str, dim: int, to_device=True, to_d... method get_packed_sharded (line 281) | def get_packed_sharded( method get_weights (line 347) | def get_weights(self, prefix: str): method get_weights_col_packed_qkv (line 350) | def get_weights_col_packed_qkv( method get_weights_col_packed_gate_up (line 360) | def get_weights_col_packed_gate_up(self, prefix: str): method get_weights_col_packed (line 363) | def get_weights_col_packed(self, prefix: str, block_sizes: Union[int, ... method get_weights_col (line 373) | def get_weights_col(self, prefix: str): method get_multi_weights_col (line 376) | def get_multi_weights_col(self, prefixes: List[str], dim: int): method get_tensor_shard (line 379) | def get_tensor_shard(self, var, dim): method get_weights_row (line 395) | def get_weights_row(self, prefix: str): method use_loader (line 399) | def use_loader(self, weights_loader: WeightsLoader): method loader (line 413) | def loader(self): function _blocks_to_block_sizes (line 417) | def _blocks_to_block_sizes(total_size: int, blocks: Union[int, List[int]... FILE: update_doc.py function check_cli (line 32) | def check_cli(check: bool): function check_supported_models (line 85) | def check_supported_models(check: bool): function get_openapi_schema (line 126) | def get_openapi_schema(): function check_openapi (line 138) | def check_openapi(check: bool): function main (line 191) | def main():