SYMBOL INDEX (5123 symbols across 423 files)

FILE: backends/client/build.rs
  function main (line 3) | fn main() -> Result<(), Box<dyn std::error::Error>> {

FILE: backends/client/src/lib.rs
  type Health (line 15) | pub trait Health {
    method device_health (line 17) | async fn device_health(&self) -> Result<()>;
    method model_health (line 21) | async fn model_health(&self) -> Result<()>;
  type ShardInfo (line 25) | pub struct ShardInfo {
  type ClientError (line 34) | pub enum ClientError {
    method from (line 44) | fn from(err: Status) -> Self {
    method from (line 52) | fn from(err: transport::Error) -> Self {
  method from (line 61) | fn from(chunk: Chunk) -> Self {
  type ChunksToString (line 68) | pub trait ChunksToString {
    method chunks_to_string (line 70) | fn chunks_to_string(&self) -> String;
    method chunks_to_string (line 74) | fn chunks_to_string(&self) -> String {
  type Result (line 91) | pub type Result<T> = std::result::Result<T, ClientError>;

FILE: backends/client/src/v2/client.rs
  type Client (line 16) | pub struct Client {
    method connect (line 22) | pub async fn connect(uri: Uri) -> Result<Self> {
    method connect_uds (line 31) | pub async fn connect_uds(path: String) -> Result<Self> {
    method service_discovery (line 46) | pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
    method info (line 66) | pub async fn info(&mut self) -> Result<InfoResponse> {
    method health (line 74) | pub async fn health(&mut self) -> Result<HealthResponse> {
    method clear_cache (line 82) | pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<(...
    method filter_batch (line 90) | pub async fn filter_batch(
    method warmup (line 108) | pub async fn warmup(
    method prefill (line 189) | pub async fn prefill(
    method decode (line 207) | pub async fn decode(
  type PrefillTimings (line 226) | pub struct PrefillTimings {
    method new (line 233) | fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
  type DecodeTimings (line 242) | pub struct DecodeTimings {
    method new (line 250) | fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_...

FILE: backends/client/src/v2/sharded_client.rs
  type ShardedClient (line 18) | pub struct ShardedClient {
    method new (line 23) | fn new(clients: Vec<Client>) -> Self {
    method from_master_client (line 29) | async fn from_master_client(mut master_client: Client) -> Result<Self> {
    method connect (line 38) | pub async fn connect(uri: Uri) -> Result<Self> {
    method connect_uds (line 44) | pub async fn connect_uds(path: String) -> Result<Self> {
    method info (line 51) | pub async fn info(&mut self) -> Result<ShardInfo> {
    method health (line 62) | pub async fn health(&mut self) -> Result<HealthResponse> {
    method clear_cache (line 73) | pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<(...
    method filter_batch (line 84) | pub async fn filter_batch(
    method warmup (line 102) | pub async fn warmup(
    method prefill (line 134) | pub async fn prefill(
    method decode (line 167) | pub async fn decode(
  method from (line 197) | fn from(value: InfoResponse) -> Self {
  method device_health (line 210) | async fn device_health(&self) -> Result<()> {
  method model_health (line 215) | async fn model_health(&self) -> Result<()> {

FILE: backends/client/src/v3/client.rs
  type Client (line 16) | pub struct Client {
    method connect (line 22) | pub async fn connect(uri: Uri) -> Result<Self> {
    method connect_uds (line 31) | pub async fn connect_uds(path: String) -> Result<Self> {
    method service_discovery (line 46) | pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
    method info (line 66) | pub async fn info(&mut self) -> Result<InfoResponse> {
    method health (line 74) | pub async fn health(&mut self) -> Result<HealthResponse> {
    method clear_cache (line 82) | pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<(...
    method filter_batch (line 90) | pub async fn filter_batch(
    method warmup (line 108) | pub async fn warmup(
    method prefill (line 230) | pub async fn prefill(
    method decode (line 253) | pub async fn decode(
  type PrefillTimings (line 272) | pub struct PrefillTimings {
    method new (line 279) | fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
  type DecodeTimings (line 288) | pub struct DecodeTimings {
    method new (line 296) | fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_...

FILE: backends/client/src/v3/sharded_client.rs
  type ShardedClient (line 18) | pub struct ShardedClient {
    method new (line 23) | fn new(clients: Vec<Client>) -> Self {
    method from_master_client (line 29) | async fn from_master_client(mut master_client: Client) -> Result<Self> {
    method connect (line 38) | pub async fn connect(uri: Uri) -> Result<Self> {
    method connect_uds (line 44) | pub async fn connect_uds(path: String) -> Result<Self> {
    method info (line 51) | pub async fn info(&mut self) -> Result<ShardInfo> {
    method health (line 62) | pub async fn health(&mut self) -> Result<HealthResponse> {
    method clear_cache (line 73) | pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<(...
    method filter_batch (line 84) | pub async fn filter_batch(
    method warmup (line 102) | pub async fn warmup(
    method prefill (line 142) | pub async fn prefill(
    method decode (line 176) | pub async fn decode(
  method from (line 206) | fn from(value: InfoResponse) -> Self {
  method device_health (line 219) | async fn device_health(&self) -> Result<()> {
  method model_health (line 224) | async fn model_health(&self) -> Result<()> {

FILE: backends/gaudi/server/text_generation_server/adapters/config.py
  class ModuleMap (line 15) | class ModuleMap:
  class AdapterConfig (line 21) | class AdapterConfig(ABC):
    method map_weights_for_model (line 25) | def map_weights_for_model(

FILE: backends/gaudi/server/text_generation_server/adapters/lora.py
  function get_start_stop_idxs_for_rank (line 30) | def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
  function shard_on_dim (line 37) | def shard_on_dim(
  function shard_lora_weights (line 56) | def shard_lora_weights(
  class LoraConfig (line 74) | class LoraConfig(AdapterConfig):
    method map_weights_for_model (line 81) | def map_weights_for_model(
    method load (line 103) | def load(cls, adapter_id: str, api_token: str) -> "LoraConfig":
  class LoraWeights (line 117) | class LoraWeights(AdapterWeights):
    method __init__ (line 120) | def __init__(
    method weights_a (line 142) | def weights_a(self) -> torch.Tensor:
    method weights_b (line 148) | def weights_b(self) -> torch.Tensor:
    method weights_a_t (line 154) | def weights_a_t(self) -> torch.Tensor:
    method weights_b_t (line 160) | def weights_b_t(self) -> torch.Tensor:
    method _transpose_weights (line 165) | def _transpose_weights(self):
    method get_batch_types (line 173) | def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]:
    method prepare_weights (line 190) | def prepare_weights(
  class RankSegments (line 256) | class RankSegments:
  class BatchLoraWeights (line 273) | class BatchLoraWeights(BatchAdapterWeights):
    method has_adapter (line 280) | def has_adapter(self, adapter_index: int) -> bool:
    method can_vectorize (line 283) | def can_vectorize(self, pg: ProcessGroup) -> bool:
    method load (line 290) | def load(
  function get_scaling_factor (line 457) | def get_scaling_factor(
  function _convert_lora (line 468) | def _convert_lora(v: AdapterWeights) -> AdapterWeights:

FILE: backends/gaudi/server/text_generation_server/adapters/weights.py
  class AdapterBatchMetadata (line 14) | class AdapterBatchMetadata:
  class AdapterWeights (line 30) | class AdapterWeights(ABC):
    method get_batch_types (line 32) | def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]:
    method speculative_tokens (line 36) | def speculative_tokens(self) -> int:
  class BatchAdapterWeights (line 40) | class BatchAdapterWeights(ABC):
    method has_adapter (line 42) | def has_adapter(self, adapter_index: int) -> bool:
    method load (line 46) | def load(
  class LayerAdapterWeights (line 56) | class LayerAdapterWeights:
    method __init__ (line 59) | def __init__(self):
    method add_adapter (line 62) | def add_adapter(self, adapter_idx: int, weights: AdapterWeights):
    method remove_adapter (line 65) | def remove_adapter(self, adapter_idx: int):
    method is_empty (line 70) | def is_empty(self) -> bool:
    method get_data (line 73) | def get_data(
  class AdapterBatchData (line 98) | class AdapterBatchData:
    method from_meta (line 107) | def from_meta(
    method ranks (line 122) | def ranks(self) -> Set[int]:
    method layer_names (line 134) | def layer_names(self) -> Set[str]:
    method adapter_keys (line 137) | def adapter_keys(self) -> Set[str]:
    method max_rank (line 144) | def max_rank(self) -> int:

FILE: backends/gaudi/server/text_generation_server/cache.py
  class Cache (line 10) | class Cache:
    method __init__ (line 11) | def __init__(self):
    method pop (line 14) | def pop(self, batch_id: int) -> Optional[B]:
    method set (line 17) | def set(self, entry: B):
    method delete (line 21) | def delete(self, batch_id: int):
    method clear (line 28) | def clear(self):
    method __len__ (line 33) | def __len__(self):

FILE: backends/gaudi/server/text_generation_server/cli.py
  class Quantization (line 16) | class Quantization(str, Enum):
  class Dtype (line 23) | class Dtype(str, Enum):
  class KVCacheDtype (line 28) | class KVCacheDtype(str, Enum):
  function serve (line 34) | def serve(
  function download_weights (line 132) | def download_weights(
  function quantize (line 336) | def quantize(

FILE: backends/gaudi/server/text_generation_server/interceptor.py
  class ExceptionInterceptor (line 15) | class ExceptionInterceptor(AsyncServerInterceptor):
    method intercept (line 16) | async def intercept(

FILE: backends/gaudi/server/text_generation_server/layers/attention/common.py
  class HPUPagedAttentionMetadata (line 11) | class HPUPagedAttentionMetadata:
  function subtuple (line 27) | def subtuple(
  function trim_attn_metadata (line 47) | def trim_attn_metadata(metadata: HPUPagedAttentionMetadata) -> object:
  class Seqlen (line 89) | class Seqlen:
    method __init__ (line 93) | def __init__(
    method clamp (line 99) | def clamp(self, max):
    method make_sliding_window_bias (line 103) | def make_sliding_window_bias(
  function _async_h2d_tensor_copy (line 146) | def _async_h2d_tensor_copy(source, device="hpu"):
  function trim_seqlen_metadata (line 157) | def trim_seqlen_metadata(metadata: Seqlen) -> object:

FILE: backends/gaudi/server/text_generation_server/layers/attention/hpu.py
  class FP8Matmul (line 16) | class FP8Matmul(torch.nn.Module):
    method __init__ (line 18) | def __init__(self, scale_other):
    method quant_input (line 23) | def quant_input(self, x, scale):
    method matmul_fp8 (line 28) | def matmul_fp8(
    method forward (line 44) | def forward(self, input, other):
  class FetchFromCache (line 57) | class FetchFromCache(torch.nn.Module):
    method __init__ (line 59) | def __init__(self, scale_inv):
    method forward (line 63) | def forward(self, cache, blocks):
  function attention (line 73) | def attention(
  function set_block_mapping (line 110) | def set_block_mapping(hpu_attention_meta: HPUPagedAttentionMetadata, bat...
  function paged_attention (line 134) | def paged_attention(
  function paged_attention_mla (line 185) | def paged_attention_mla(

FILE: backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py
  class KVScales (line 11) | class KVScales:
    method __post_init__ (line 26) | def __post_init__(self):
  class KVCache (line 34) | class KVCache:
    method __init__ (line 41) | def __init__(
    method dtype (line 69) | def dtype(self):
    method key (line 74) | def key(self):
    method value (line 80) | def value(self):
    method store (line 85) | def store(
  class KVCompressCache (line 110) | class KVCompressCache(KVCache):
    method __init__ (line 117) | def __init__(
    method dtype (line 137) | def dtype(self):
    method key (line 142) | def key(self):
    method value (line 148) | def value(self):
    method store (line 153) | def store(
  function paged_reshape_and_cache (line 170) | def paged_reshape_and_cache(
  function get_kv_scales (line 190) | def get_kv_scales(weights: Weights, prefix: str) -> KVScales:

FILE: backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py
  function pack (line 9) | def pack(imatrix: torch.Tensor, direction: str = "column"):
  function unpack (line 35) | def unpack(qmatrix: torch.Tensor, direction: str = "column"):
  function apply_order (line 61) | def apply_order(
  function fast_awq_to_gptq (line 83) | def fast_awq_to_gptq(qweight, qzeros):

FILE: backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py
  function error_raiser_hpu (line 12) | def error_raiser_hpu(*args, **kwargs):
  function unpack_awq (line 22) | def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int):
  function reverse_awq_order (line 45) | def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits...
  function unpack_weight_and_zeros (line 62) | def unpack_weight_and_zeros(qweight, qzeros, bits):
  function pack_tensor (line 75) | def pack_tensor(input, bits=4):
  class WQLinear (line 93) | class WQLinear(nn.Module):
    method __init__ (line 94) | def __init__(
    method _preprocessing (line 117) | def _preprocessing(self):
    method forward (line 126) | def forward(self, x):

FILE: backends/gaudi/server/text_generation_server/layers/bnb.py
  class BNBWeight (line 10) | class BNBWeight(UnquantizedWeight):
    method get_linear (line 13) | def get_linear(self, bias: torch.Tensor):
  class Linear8bitLt (line 17) | class Linear8bitLt(torch.nn.Module):
    method __init__ (line 18) | def __init__(
    method init_8bit_state (line 49) | def init_8bit_state(self):
    method forward (line 55) | def forward(self, x: torch.Tensor):
  class BNBFP4Weight (line 76) | class BNBFP4Weight(UnquantizedWeight):
    method get_linear (line 79) | def get_linear(self, bias: torch.Tensor):
  class BNBNF4Weight (line 84) | class BNBNF4Weight(UnquantizedWeight):
    method get_linear (line 87) | def get_linear(self, bias: torch.Tensor):
  class Linear4bit (line 91) | class Linear4bit(torch.nn.Module):
    method __init__ (line 92) | def __init__(self, weight, bias, quant_type):
    method forward (line 104) | def forward(self, x: torch.Tensor):

FILE: backends/gaudi/server/text_generation_server/layers/compressed_tensors/loader.py
  class CompressedTensorsLoader (line 29) | class CompressedTensorsLoader(WeightsLoader):
    method __init__ (line 32) | def __init__(self, config: Dict[str, Any]):
    method get_weights (line 69) | def get_weights(self, weights: Weights, prefix: str):
    method get_weights_col_packed (line 73) | def get_weights_col_packed(
    method get_multi_weights_col (line 82) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_multi_weights (line 86) | def get_multi_weights(self, weights: Weights, prefixes: List[str], dim...
    method get_weights_row (line 90) | def get_weights_row(self, weights: Weights, prefix: str):
    method _get_target_loaders (line 94) | def _get_target_loaders(
    method _create_loader_for_group (line 125) | def _create_loader_for_group(
    method _lookup_loader (line 154) | def _lookup_loader(self, prefix: str) -> WeightsLoader:

FILE: backends/gaudi/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
  class W8ANFpLoader (line 14) | class W8ANFpLoader(WeightsLoader):
    method __init__ (line 19) | def __init__(
    method __str__ (line 41) | def __str__(self) -> str:
    method get_weights (line 49) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 81) | def get_weights_col_packed(
    method get_multi_weights_col (line 130) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_multi_weights (line 177) | def get_multi_weights(self, weights: "Weights", prefixes: List[str], d...
    method get_weights_row (line 227) | def get_weights_row(self, weights: "Weights", prefix: str):

FILE: backends/gaudi/server/text_generation_server/layers/conv.py
  function load_conv2d (line 6) | def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_...
  function load_conv2d_no_bias (line 23) | def load_conv2d_no_bias(

FILE: backends/gaudi/server/text_generation_server/layers/exl2.py
  class Exl2Weight (line 9) | class Exl2Weight(Weight):
    method __post_init__ (line 20) | def __post_init__(self):
    method device (line 25) | def device(self) -> torch.device:
    method get_linear (line 28) | def get_linear(self, bias: torch.Tensor):
  class Exl2WeightsLoader (line 34) | class Exl2WeightsLoader(WeightsLoader):
    method get_weights (line 37) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 61) | def get_weights_col_packed(
    method get_weights_col (line 69) | def get_weights_col(self, weights: Weights, prefix: str):
    method get_multi_weights_col (line 73) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_weights_row (line 76) | def get_weights_row(self, weights: Weights, prefix: str):

FILE: backends/gaudi/server/text_generation_server/layers/fp8.py
  function pad_weight (line 22) | def pad_weight(weight, block_size):
  function unpad_weight (line 37) | def unpad_weight(weight, original_M, original_N, keep_first_dim=False):
  function pad_block_fp8_weight_naive (line 47) | def pad_block_fp8_weight_naive(weight, weight_scale, block_size):
  function dynamic_quant (line 63) | def dynamic_quant(data, single_scale=False):
  function dequant_block_fp8_weight_naive (line 75) | def dequant_block_fp8_weight_naive(
  function apply_block_fp8_linear_hpu_dynamic (line 132) | def apply_block_fp8_linear_hpu_dynamic(
  function get_fp8_linear (line 162) | def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
  function normalize_e4m3fn_to_native_float8 (line 170) | def normalize_e4m3fn_to_native_float8(
  function per_tensor_dequantize (line 178) | def per_tensor_dequantize(
  function requantize_with_max_scale (line 194) | def requantize_with_max_scale(
  function fp8_quantize (line 220) | def fp8_quantize(
  class HybridFP8UnquantLoader (line 245) | class HybridFP8UnquantLoader(WeightsLoader):
    method __init__ (line 248) | def __init__(
    method get_weights (line 258) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 299) | def get_weights_col_packed(
    method get_multi_weights_col (line 352) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_multi_weights (line 414) | def get_multi_weights(self, weights: "Weights", prefixes: List[str], d...
    method get_weights_row (line 476) | def get_weights_row(self, weights: "Weights", prefix: str):
  class Fp8Weight (line 524) | class Fp8Weight(Weight):
    method get_linear (line 533) | def get_linear(self, bias: torch.Tensor):
  class Fp8Linear (line 552) | class Fp8Linear(torch.nn.Module):
    method __init__ (line 555) | def __init__(
    method from_unquant (line 577) | def from_unquant(cls, weight, bias, dtype):
    method from_fp8 (line 589) | def from_fp8(
    method forward (line 627) | def forward(self, input: torch.Tensor) -> torch.Tensor:
  function _load_scalar_or_matrix_scale (line 650) | def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: t...

FILE: backends/gaudi/server/text_generation_server/layers/gptq/__init__.py
  class GPTQWeight (line 19) | class GPTQWeight(Weight):
    method __post_init__ (line 29) | def __post_init__(self):
    method device (line 34) | def device(self) -> torch.device:
    method get_linear (line 37) | def get_linear(self, bias: torch.Tensor):
  class GPTQWeightsLoader (line 66) | class GPTQWeightsLoader(WeightsLoader):
    method __init__ (line 71) | def __init__(
    method is_layer_skipped_quantization (line 90) | def is_layer_skipped_quantization(
    method get_weights (line 95) | def get_weights(self, weights: Weights, prefix: str):
    method get_weights_col_packed (line 157) | def get_weights_col_packed(
    method get_multi_weights_col (line 217) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_multi_weights (line 279) | def get_multi_weights(self, weights: Weights, prefixes: List[str], dim...
    method get_weights_row (line 336) | def get_weights_row(self, weights: Weights, prefix: str):
    method _get_gptq_params (line 426) | def _get_gptq_params(self, weights: Weights):

FILE: backends/gaudi/server/text_generation_server/layers/gptq/hpu.py
  function error_raiser_hpu (line 12) | def error_raiser_hpu(*args, **kwargs):
  function pack_tensor (line 20) | def pack_tensor(input, bits=4):
  class QuantLinear (line 34) | class QuantLinear(nn.Module):
    method __init__ (line 35) | def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsi...
    method unpack_zeros_from_cuda_old_format (line 58) | def unpack_zeros_from_cuda_old_format(self):
    method unpack_weight_from_cuda_old_format (line 71) | def unpack_weight_from_cuda_old_format(self):
    method _preprocessing (line 80) | def _preprocessing(self):
    method new (line 119) | def new(cls, bits, groupsize, infeatures, outfeatures, bias):
    method pack (line 140) | def pack(self, linear, scales, zeros, g_idx=None):
    method forward (line 197) | def forward(self, x):

FILE: backends/gaudi/server/text_generation_server/layers/gptq/quantize.py
  class Quantizer (line 25) | class Quantizer(nn.Module):
    method __init__ (line 26) | def __init__(self, shape=1):
    method configure (line 32) | def configure(
    method _quantize (line 54) | def _quantize(self, x, scale, zero, maxq):
    method find_params (line 60) | def find_params(self, x, weight=False):
    method quantize (line 145) | def quantize(self, x):
    method enabled (line 151) | def enabled(self):
    method ready (line 154) | def ready(self):
  class GPTQ (line 158) | class GPTQ:
    method __init__ (line 159) | def __init__(self, layer, observe=False):
    method add_batch (line 174) | def add_batch(self, inp, out):
    method print_loss (line 209) | def print_loss(self, name, q_weight, weight_error, timecost):
    method fasterquant (line 243) | def fasterquant(
    method free (line 357) | def free(self):
  function get_wikitext2 (line 366) | def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_ptb (line 398) | def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_c4 (line 430) | def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_ptb_new (line 498) | def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_c4_new (line 530) | def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_loaders (line 584) | def get_loaders(
  function find_layers (line 599) | def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
  function sequential (line 615) | def sequential(
  function make_quant_linear (line 754) | def make_quant_linear(module, names, bits, groupsize, name=""):
  function pack (line 780) | def pack(model, quantizers, bits, groupsize):
  function setdeepattr (line 794) | def setdeepattr(module, full_name, tensor):
  function getdeepattr (line 802) | def getdeepattr(module, full_name):
  function load_weights_pre_hook (line 810) | def load_weights_pre_hook(module_name, weights, recursive=False):
  function load_weights_post_hook (line 842) | def load_weights_post_hook(module_name, weights, recursive=False):
  function quantize (line 867) | def quantize(

FILE: backends/gaudi/server/text_generation_server/layers/gptq/utils.py
  function torch_snr_error (line 5) | def torch_snr_error(

FILE: backends/gaudi/server/text_generation_server/layers/layernorm.py
  function load_layer_norm (line 8) | def load_layer_norm(cls, prefix, weights, eps):
  function load_layer_norm_no_bias (line 20) | def load_layer_norm_no_bias(cls, prefix, weights, eps):
  class FastLayerNorm (line 34) | class FastLayerNorm(nn.LayerNorm):
    method forward (line 35) | def forward(self, hidden_states, residual=None):
  class FastRMSNorm (line 43) | class FastRMSNorm(nn.Module):
    method __init__ (line 44) | def __init__(self, weight: torch.Tensor, eps: float):
    method load (line 51) | def load(cls, prefix, weights, eps=1e-6):
    method forward (line 55) | def forward(self, hidden_states, residual=None):

FILE: backends/gaudi/server/text_generation_server/layers/linear.py
  class FastLinear (line 5) | class FastLinear(torch.nn.Module):
    method __init__ (line 6) | def __init__(
    method load (line 19) | def load(cls, config, prefix: str, weights, bias: bool):
    method forward (line 27) | def forward(self, input: torch.Tensor) -> torch.Tensor:
  function get_linear (line 31) | def get_linear(weight, bias):

FILE: backends/gaudi/server/text_generation_server/layers/lora.py
  class LoraLinear (line 22) | class LoraLinear(nn.Module):
    method __init__ (line 23) | def __init__(
    method forward_layer_type (line 31) | def forward_layer_type(
    method forward_lora (line 135) | def forward_lora(
    method collect_lora_a (line 154) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
  class TensorParallelMultiAdapterLinear (line 158) | class TensorParallelMultiAdapterLinear(LoraLinear):
    method __init__ (line 159) | def __init__(
    method load (line 172) | def load(
    method forward (line 184) | def forward(
    method collect_lora_a (line 227) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
  class TensorParallelAdapterRowLinear (line 242) | class TensorParallelAdapterRowLinear(LoraLinear):
    method __init__ (line 243) | def __init__(self, base_layer, layer_id, layer_name, process_group):
    method load (line 248) | def load(cls, base_layer, layer_id, layer_name, process_group):
    method forward (line 251) | def forward(
    method collect_lora_a (line 270) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:

FILE: backends/gaudi/server/text_generation_server/layers/medusa.py
  class ResBlock (line 12) | class ResBlock(torch.nn.Module):
    method __init__ (line 13) | def __init__(self, config, prefix, weights):
    method forward (line 20) | def forward(self, x):
  class MedusaModel (line 24) | class MedusaModel(torch.nn.Module):
    method __init__ (line 25) | def __init__(self, config, medusa_config, weights):
    method forward (line 34) | def forward(self, x):
  class MedusaHead (line 41) | class MedusaHead(torch.nn.Module):
    method __init__ (line 42) | def __init__(self, config, medusa_config, prefix, weights):
    method forward (line 55) | def forward(self, x):
  class MedusaHeadV1 (line 62) | class MedusaHeadV1(nn.Module):
    method __init__ (line 63) | def __init__(self, lm_head, medusa):
    method load (line 69) | def load(config, prefix: str, weights):
    method forward (line 97) | def forward(
  class MedusaHeadV2 (line 109) | class MedusaHeadV2(nn.Module):
    method __init__ (line 110) | def __init__(self, config, prefix, weights):
    method forward (line 150) | def forward(self, x):

FILE: backends/gaudi/server/text_generation_server/layers/mlp.py
  class MLPSpeculatorLayerNorm (line 11) | class MLPSpeculatorLayerNorm(nn.Module):
    method __init__ (line 27) | def __init__(
    method forward (line 39) | def forward(self, x):
  function simple_norm (line 51) | def simple_norm(x: torch.Tensor, eps=1e-06):
  class MLPSpeculatorModelTied (line 58) | class MLPSpeculatorModelTied(torch.nn.Module):
    method __init__ (line 59) | def __init__(self, config, prefix, weights):
    method forward (line 96) | def forward(
  class MLPSpeculatorModel (line 142) | class MLPSpeculatorModel(torch.nn.Module):
    method __init__ (line 143) | def __init__(self, config, prefix, weights):
    method forward (line 192) | def forward(
  class MLPSpeculatorHead (line 235) | class MLPSpeculatorHead(nn.Module):
    method __init__ (line 236) | def __init__(self, lm_head, mlp_speculator, scale_input: bool):
    method forward (line 242) | def forward(
    method load (line 257) | def load(config, prefix: str, weights):

FILE: backends/gaudi/server/text_generation_server/layers/moe/__init__.py
  class MoELayer (line 30) | class MoELayer(Protocol):
    method __init__ (line 31) | def __init__(
    method forward (line 49) | def forward(
  class DenseMoELayer (line 54) | class DenseMoELayer(nn.Module):
    method __init__ (line 62) | def __init__(
    method forward (line 143) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
  class SparseMoELayer (line 182) | class SparseMoELayer(nn.Module):
    method __init__ (line 189) | def __init__(
    method forward (line 242) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
    method is_supported (line 246) | def is_supported(weights: Weights) -> bool:

FILE: backends/gaudi/server/text_generation_server/layers/moe/fp8.py
  class FP8SparseMoELayer (line 20) | class FP8SparseMoELayer(nn.Module):
    method __init__ (line 21) | def __init__(
    method forward (line 105) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
  function _load_expert_weights (line 168) | def _load_expert_weights(
  function _load_expert_multi_weights_col (line 218) | def _load_expert_multi_weights_col(
  function _load_expert_weights_row (line 248) | def _load_expert_weights_row(

FILE: backends/gaudi/server/text_generation_server/layers/moe/fused_moe.py
  function grouped_topk (line 21) | def grouped_topk(
  function fused_topk (line 83) | def fused_topk(
  function select_experts (line 98) | def select_experts(

FILE: backends/gaudi/server/text_generation_server/layers/moe/unquantized.py
  class UnquantizedSparseMoELayer (line 13) | class UnquantizedSparseMoELayer(nn.Module):
    method __init__ (line 14) | def __init__(
    method forward (line 83) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
  function _load_expert_multi_weights_col (line 103) | def _load_expert_multi_weights_col(
  function _load_expert_weights_row (line 144) | def _load_expert_weights_row(

FILE: backends/gaudi/server/text_generation_server/layers/rotary.py
  function _create_inv_freq (line 11) | def _create_inv_freq(dim, base, device):
  function _get_rope_config (line 18) | def _get_rope_config(config):
  class PositionRotaryEmbedding (line 28) | class PositionRotaryEmbedding(nn.Module):
    method __init__ (line 29) | def __init__(self, inv_freq, scaling_factor, max_position_embeddings):
    method forward (line 43) | def forward(
    method static (line 76) | def static(cls, config, dim, base, device):
    method load (line 208) | def load(cls, config, prefix, weights):
    method _update_cos_sin_cache (line 253) | def _update_cos_sin_cache(self, dtype, device, seqlen):
    method get_cos_sin (line 272) | def get_cos_sin(self, position_ids: torch.Tensor):
  class SuRotaryEmbedding (line 281) | class SuRotaryEmbedding(PositionRotaryEmbedding):
    method __init__ (line 282) | def __init__(
    method _update_cos_sin_cache (line 305) | def _update_cos_sin_cache(self, dtype, device, seqlen):
  class Phi3LongRoPEScaledRotaryEmbedding (line 332) | class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding):
    method __init__ (line 333) | def __init__(
    method _update_cos_sin_cache (line 361) | def _update_cos_sin_cache(self, dtype, device, seqlen):
  class DynamicPositionRotaryEmbedding (line 392) | class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
    method __init__ (line 393) | def __init__(self, dim, max_position_embeddings, base, device, scaling...
    method _update_cos_sin_cache (line 400) | def _update_cos_sin_cache(self, dtype, device, seqlen):
  function find_correction_dim (line 426) | def find_correction_dim(num_rotations, dim, base=10000, max_position_emb...
  function find_correction_range (line 433) | def find_correction_range(
  function linear_ramp_mask (line 441) | def linear_ramp_mask(min, max, dim):
  function get_mscale (line 450) | def get_mscale(scale: float = 1.0, mscale: float = 1.0):
  class YarnPositionRotaryEmbedding (line 456) | class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
    method __init__ (line 457) | def __init__(
    method _update_cos_sin_cache (line 489) | def _update_cos_sin_cache(self, dtype, device, seqlen):
  function apply_llama3_scaling (line 531) | def apply_llama3_scaling(
  class RotaryPositionEmbeddingMultimodalSections (line 560) | class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding):
    method __init__ (line 561) | def __init__(
    method _update_cos_sin_cache (line 579) | def _update_cos_sin_cache(
    method get_cos_sin (line 596) | def get_cos_sin(

FILE: backends/gaudi/server/text_generation_server/layers/speculative.py
  class SpeculativeHead (line 9) | class SpeculativeHead(torch.nn.Module):
    method __init__ (line 10) | def __init__(self, lm_head, speculator):
    method load (line 16) | def load(config, prefix: str, weights):
    method forward (line 44) | def forward(

FILE: backends/gaudi/server/text_generation_server/layers/tensor_parallel.py
  class LayerConcat (line 9) | class LayerConcat(torch.nn.Module):
    method __init__ (line 15) | def __init__(self, layers: Iterable[torch.nn.Module], dim: int = -1):
    method forward (line 23) | def forward(self, x: torch.Tensor):
  class SuperLayer (line 28) | class SuperLayer(torch.nn.Module):
    method __init__ (line 29) | def __init__(self, linear):
    method forward (line 33) | def forward(self, x):
  class TensorParallelHead (line 37) | class TensorParallelHead(SuperLayer):
    method __init__ (line 38) | def __init__(self, linear, process_group, should_gather: bool):
    method load (line 44) | def load(config, prefix: str, weights):
    method forward (line 73) | def forward(self, input: torch.Tensor) -> torch.Tensor:
  class TensorParallelColumnLinear (line 111) | class TensorParallelColumnLinear(SuperLayer):
    method load_gate_up (line 113) | def load_gate_up(cls, config, prefix: str, weights, bias: bool):
    method load_qkv (line 124) | def load_qkv(
    method load (line 147) | def load(cls, config, prefix: str, weights, bias: bool):
    method load_multi (line 157) | def load_multi(cls, config, prefixes: List[str], weights, bias: bool, ...
  class TensorParallelRowLinear (line 176) | class TensorParallelRowLinear(SuperLayer):
    method __init__ (line 177) | def __init__(self, linear, process_group):
    method load (line 182) | def load(cls, config, prefix: str, weights, bias: bool):
    method forward (line 195) | def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.T...
  class TensorParallelEmbedding (line 206) | class TensorParallelEmbedding(torch.nn.Module):
    method __init__ (line 207) | def __init__(self, prefix: str, weights, reduce=True):
    method forward (line 229) | def forward(self, input: torch.Tensor) -> torch.Tensor:

FILE: backends/gaudi/server/text_generation_server/models/__init__.py
  class ModelType (line 163) | class ModelType(enum.Enum):
  function get_model (line 359) | def get_model(
  function get_model_with_lora_adapters (line 949) | def get_model_with_lora_adapters(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py
  function _make_causal_mask (line 68) | def _make_causal_mask(
  function _expand_mask (line 88) | def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
  function build_alibi_tensor (line 99) | def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int) -> ...
  function dropout_add (line 156) | def dropout_add(
  function _split_heads (line 178) | def _split_heads(
  function _merge_heads (line 210) | def _merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torc...
  class BloomAttention (line 236) | class BloomAttention(nn.Module):
    method __init__ (line 237) | def __init__(self, prefix, config: BloomConfig, weights):
    method compute_attention (line 280) | def compute_attention(
    method forward (line 357) | def forward(
  class BloomMLP (line 435) | class BloomMLP(nn.Module):
    method __init__ (line 436) | def __init__(self, prefix, config: BloomConfig, weights):
    method forward (line 450) | def forward(
  class BloomBlock (line 474) | class BloomBlock(nn.Module):
    method __init__ (line 475) | def __init__(self, layer_id: int, config: BloomConfig, weights):
    method forward (line 500) | def forward(
  class BloomPreTrainedModel (line 556) | class BloomPreTrainedModel(PreTrainedModel):
    method _convert_to_standard_cache (line 562) | def _convert_to_standard_cache(
    method _convert_to_bloom_cache (line 582) | def _convert_to_bloom_cache(
  class BloomModel (line 601) | class BloomModel(BloomPreTrainedModel):
    method __init__ (line 602) | def __init__(self, config: BloomConfig, weights):
    method _prepare_attn_mask (line 635) | def _prepare_attn_mask(
    method set_input_embeddings (line 664) | def set_input_embeddings(self, new_embeddings: torch.Tensor):
    method forward (line 667) | def forward(
  class BloomForCausalLM (line 818) | class BloomForCausalLM(BloomPreTrainedModel):
    method __init__ (line 819) | def __init__(self, prefix: str, config, weights):
    method prepare_inputs_for_generation (line 829) | def prepare_inputs_for_generation(
    method forward (line 860) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/clip.py
  class CLIPVisionEmbeddings (line 23) | class CLIPVisionEmbeddings(nn.Module):
    method __init__ (line 24) | def __init__(self, prefix, config: CLIPVisionConfig, weights):
    method forward (line 56) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
  class CLIPTextEmbeddings (line 70) | class CLIPTextEmbeddings(nn.Module):
    method __init__ (line 71) | def __init__(self, config: CLIPTextConfig):
    method forward (line 87) | def forward(
  class CLIPAttention (line 109) | class CLIPAttention(nn.Module):
    method __init__ (line 112) | def __init__(self, prefix, config, weights):
    method _shape (line 142) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 149) | def forward(
  class CLIPMLP (line 234) | class CLIPMLP(nn.Module):
    method __init__ (line 235) | def __init__(self, prefix, config, weights):
    method forward (line 246) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class CLIPEncoderLayer (line 253) | class CLIPEncoderLayer(nn.Module):
    method __init__ (line 254) | def __init__(self, prefix, config: CLIPConfig, weights):
    method forward (line 268) | def forward(
  class CLIPPreTrainedModel (line 299) | class CLIPPreTrainedModel(nn.Module):
  class CLIPEncoder (line 386) | class CLIPEncoder(nn.Module):
    method __init__ (line 395) | def __init__(self, prefix, config: CLIPConfig, weights):
    method forward (line 407) | def forward(
  class CLIPTextTransformer (line 446) | class CLIPTextTransformer(nn.Module):
    method __init__ (line 447) | def __init__(self, prefix: str, config: CLIPTextConfig, weights=None):
    method forward (line 461) | def forward(
  class CLIPTextModel (line 533) | class CLIPTextModel(CLIPPreTrainedModel):
    method __init__ (line 538) | def __init__(self, prefix, config: CLIPTextConfig):
    method forward (line 544) | def forward(
  class CLIPVisionTransformer (line 575) | class CLIPVisionTransformer(nn.Module):
    method __init__ (line 576) | def __init__(self, prefix, config: CLIPVisionConfig, weights):
    method forward (line 591) | def forward(
  class CLIPVisionModel (line 619) | class CLIPVisionModel(CLIPPreTrainedModel):
    method __init__ (line 624) | def __init__(self, config: CLIPVisionConfig):
    method get_input_embeddings (line 630) | def get_input_embeddings(self) -> nn.Module:
    method forward (line 633) | def forward(
  class CLIPModel (line 665) | class CLIPModel(nn.Module):
    method __init__ (line 666) | def __init__(self, prefix, config: CLIPConfig, weights):
    method get_text_features (line 691) | def get_text_features(
    method get_image_features (line 724) | def get_image_features(
    method forward (line 760) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
  class CohereRotary (line 58) | class CohereRotary(PositionRotaryEmbedding):
    method forward (line 59) | def forward(
  class CohereLayerNorm (line 88) | class CohereLayerNorm(nn.Module):
    method __init__ (line 89) | def __init__(self, prefix, weights, eps):
    method forward (line 97) | def forward(self, hidden_states):
  function load_attention (line 112) | def load_attention(config, prefix, weights):
  function _load_gqa (line 125) | def _load_gqa(config, prefix: str, weights):
  class FlashCohereAttention (line 157) | class FlashCohereAttention(torch.nn.Module):
    method __init__ (line 158) | def __init__(
    method forward (line 214) | def forward(
  class CohereMLP (line 283) | class CohereMLP(nn.Module):
    method __init__ (line 284) | def __init__(self, prefix, config, weights):
    method forward (line 315) | def forward(self, hidden_states):
  class FlashCohereLayer (line 323) | class FlashCohereLayer(nn.Module):
    method __init__ (line 324) | def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
    method forward (line 342) | def forward(
  class FlashCohereModel (line 377) | class FlashCohereModel(torch.nn.Module):
    method __init__ (line 378) | def __init__(self, prefix: str, config, weights):
    method forward (line 415) | def forward(
  class FlashCohereForCausalLM (line 459) | class FlashCohereForCausalLM(torch.nn.Module):
    method __init__ (line 460) | def __init__(self, prefix: str, config, weights):
    method forward (line 483) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
  class DbrxAttentionConfig (line 51) | class DbrxAttentionConfig(PretrainedConfig):
    method __init__ (line 52) | def __init__(
  class DbrxFFNConfig (line 73) | class DbrxFFNConfig(PretrainedConfig):
    method __init__ (line 74) | def __init__(
  class DbrxConfig (line 108) | class DbrxConfig(PretrainedConfig):
    method __init__ (line 115) | def __init__(
    method num_key_value_heads (line 168) | def num_key_value_heads(self):
  function promote_scalar (line 174) | def promote_scalar(x: torch.Tensor) -> torch.Tensor:
  function load_attention (line 178) | def load_attention(config, prefix, weights):
  function _load_experts (line 189) | def _load_experts(config, prefix, weights):
  function _load_experts_quantized (line 220) | def _load_experts_quantized(config, prefix, weights, cls):
  class DbrxAttention (line 260) | class DbrxAttention(torch.nn.Module):
    method __init__ (line 261) | def __init__(
    method forward (line 302) | def forward(
  class DbrxNormAttentionNorm (line 363) | class DbrxNormAttentionNorm(nn.Module):
    method __init__ (line 364) | def __init__(
    method forward (line 387) | def forward(
  function select_experts (line 420) | def select_experts(
  function round_up (line 438) | def round_up(x: torch.Tensor, value: int):
  class BlockSparseMoE (line 442) | class BlockSparseMoE(nn.Module):
    method __init__ (line 443) | def __init__(self, prefix, config: DbrxConfig, weights):
    method forward (line 493) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class DenseMoE (line 505) | class DenseMoE(nn.Module):
    method __init__ (line 506) | def __init__(self, prefix, config: DbrxConfig, weights):
    method forward (line 556) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class DbrxLayer (line 603) | class DbrxLayer(nn.Module):
    method __init__ (line 604) | def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
    method forward (line 618) | def forward(
  class DbrxModel (line 648) | class DbrxModel(torch.nn.Module):
    method __init__ (line 649) | def __init__(self, prefix: str, config, weights):
    method forward (line 682) | def forward(
  class FlashDbrxForCausalLM (line 725) | class FlashDbrxForCausalLM(torch.nn.Module):
    method __init__ (line 726) | def __init__(self, prefix: str, config, weights):
    method forward (line 741) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
  function get_and_maybe_dequant_weights (line 48) | def get_and_maybe_dequant_weights(layer: torch.nn.Module) -> torch.Tensor:
  class DeepseekV2Config (line 60) | class DeepseekV2Config(PretrainedConfig):
    method __init__ (line 61) | def __init__(
  class DeepseekV2Attention (line 166) | class DeepseekV2Attention(torch.nn.Module):
    method __init__ (line 167) | def __init__(
    method _q_proj_and_k_up_proj (line 277) | def _q_proj_and_k_up_proj(self, x):
    method _v_up_proj_and_o_proj (line 292) | def _v_up_proj_and_o_proj(self, x):
    method forward (line 301) | def forward(
  class DeepseekV2MLP (line 422) | class DeepseekV2MLP(nn.Module):
    method __init__ (line 423) | def __init__(self, prefix: str, config, weights, intermediate_size: int):
    method forward (line 453) | def forward(self, hidden_states: torch.Tensor, reduce: bool = True):
  class DeepseekV2MoE (line 461) | class DeepseekV2MoE(nn.Module):
    method __init__ (line 462) | def __init__(
    method forward (line 504) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class DeepseekV2Layer (line 524) | class DeepseekV2Layer(nn.Module):
    method __init__ (line 525) | def __init__(self, prefix, layer_id, config, weights, rotary_emb):
    method forward (line 564) | def forward(
  class DeepseekV2Model (line 600) | class DeepseekV2Model(torch.nn.Module):
    method __init__ (line 601) | def __init__(self, prefix: str, config, weights: Weights):
    method forward (line 634) | def forward(
  class FlashDeepseekV2ForCausalLM (line 678) | class FlashDeepseekV2ForCausalLM(torch.nn.Module):
    method __init__ (line 679) | def __init__(self, prefix: str, config, weights: Weights):
    method forward (line 691) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py
  function get_and_maybe_dequant_weights (line 48) | def get_and_maybe_dequant_weights(layer: torch.nn.Module) -> torch.Tensor:
  class DeepseekV3Config (line 60) | class DeepseekV3Config(PretrainedConfig):
    method __init__ (line 61) | def __init__(
  class DeepseekV3Attention (line 166) | class DeepseekV3Attention(torch.nn.Module):
    method __init__ (line 167) | def __init__(
    method _q_proj_and_k_up_proj (line 276) | def _q_proj_and_k_up_proj(self, x):
    method _v_up_proj_and_o_proj (line 291) | def _v_up_proj_and_o_proj(self, x):
    method forward (line 300) | def forward(
  class DeepseekV3MLP (line 421) | class DeepseekV3MLP(nn.Module):
    method __init__ (line 422) | def __init__(self, prefix: str, config, weights, intermediate_size: int):
    method forward (line 452) | def forward(self, hidden_states: torch.Tensor, reduce: bool = True):
  class DeepseekV3MoE (line 460) | class DeepseekV3MoE(nn.Module):
    method __init__ (line 461) | def __init__(
    method forward (line 512) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class DeepseekV3Layer (line 532) | class DeepseekV3Layer(nn.Module):
    method __init__ (line 533) | def __init__(self, prefix, layer_id, config, weights, rotary_emb):
    method forward (line 572) | def forward(
  class DeepseekV3Model (line 608) | class DeepseekV3Model(torch.nn.Module):
    method __init__ (line 609) | def __init__(self, prefix: str, config, weights: Weights):
    method forward (line 642) | def forward(
  class FlashDeepseekV3ForCausalLM (line 686) | class FlashDeepseekV3ForCausalLM(torch.nn.Module):
    method __init__ (line 687) | def __init__(self, prefix: str, config, weights: Weights):
    method forward (line 699) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
  class Gemma2Config (line 53) | class Gemma2Config(PretrainedConfig):
    method __init__ (line 54) | def __init__(
  class Gemma2FastRMSNorm (line 109) | class Gemma2FastRMSNorm(FastRMSNorm):
    method load (line 111) | def load(cls, prefix: str, weights, eps=1e-6):
    method forward (line 121) | def forward(self, hidden_states, residual=None):
  function load_attention (line 132) | def load_attention(config, prefix: str, weights):
  function _load_gqa (line 145) | def _load_gqa(config, prefix: str, weights):
  class FlashGemma2Attention (line 167) | class FlashGemma2Attention(torch.nn.Module):
    method __init__ (line 168) | def __init__(
    method forward (line 234) | def forward(
  class Gemma2MLP (line 299) | class Gemma2MLP(nn.Module):
    method __init__ (line 300) | def __init__(self, prefix, config, weights, layer_id):
    method forward (line 349) | def forward(self, hidden_states, adapter_data):
  class FlashGemma2Layer (line 357) | class FlashGemma2Layer(nn.Module):
    method __init__ (line 358) | def __init__(
    method forward (line 401) | def forward(
  class FlashGemma2Model (line 441) | class FlashGemma2Model(torch.nn.Module):
    method __init__ (line 442) | def __init__(self, prefix: str, config, weights, causal: bool):
    method forward (line 477) | def forward(
  class FlashGemma2ForCausalLM (line 524) | class FlashGemma2ForCausalLM(torch.nn.Module):
    method __init__ (line 525) | def __init__(self, prefix: str, config, weights, *, causal: bool = True):
    method forward (line 554) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
  class Gemma3FastRMSNorm (line 62) | class Gemma3FastRMSNorm(FastRMSNorm):
    method load (line 64) | def load(cls, prefix: str, weights, eps=1e-6):
    method forward (line 74) | def forward(self, hidden_states, residual=None):
  function load_attention (line 85) | def load_attention(config, prefix: str, weights):
  function _load_gqa (line 98) | def _load_gqa(config, prefix: str, weights):
  class FlashGemma3Attention (line 120) | class FlashGemma3Attention(torch.nn.Module):
    method __init__ (line 121) | def __init__(
    method forward (line 198) | def forward(
  class Gemma3MLP (line 275) | class Gemma3MLP(nn.Module):
    method __init__ (line 276) | def __init__(self, prefix, config, weights, layer_id):
    method forward (line 325) | def forward(self, hidden_states, adapter_data):
  class FlashGemma3Layer (line 333) | class FlashGemma3Layer(nn.Module):
    method __init__ (line 334) | def __init__(
    method forward (line 379) | def forward(
  class FlashGemma3Model (line 419) | class FlashGemma3Model(torch.nn.Module):
    method __init__ (line 420) | def __init__(self, prefix: str, config, weights, causal: bool):
    method forward (line 464) | def forward(
  class FlashGemma3ForCausalLM (line 514) | class FlashGemma3ForCausalLM(torch.nn.Module):
    method __init__ (line 515) | def __init__(self, prefix: str, config, weights, *, causal: bool = True):
    method forward (line 545) | def forward(
  class Gemma3MultimodalInputProjection (line 576) | class Gemma3MultimodalInputProjection(torch.nn.Module):
    method __init__ (line 577) | def __init__(self, prefix, config, weights):
    method forward (line 599) | def forward(self, vision_outputs: torch.Tensor):
  class Gemma3ForConditionalGeneration (line 620) | class Gemma3ForConditionalGeneration(nn.Module):
    method __init__ (line 621) | def __init__(self, prefix, config, weights):
    method get_vision_embeds (line 671) | def get_vision_embeds(
    method get_inputs_embeds (line 687) | def get_inputs_embeds(
    method forward (line 704) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
  class GemmaConfig (line 51) | class GemmaConfig(PretrainedConfig):
    method __init__ (line 52) | def __init__(
  class GemmaFastRMSNorm (line 107) | class GemmaFastRMSNorm(FastRMSNorm):
    method load (line 109) | def load(cls, prefix: str, weights, eps=1e-6):
    method forward (line 119) | def forward(self, hidden_states, residual=None):
  function load_attention (line 130) | def load_attention(config, prefix: str, weights):
  function _load_gqa (line 143) | def _load_gqa(config, prefix: str, weights):
  class FlashGemmaAttention (line 165) | class FlashGemmaAttention(torch.nn.Module):
    method __init__ (line 166) | def __init__(self, prefix: str, config, weights, causal: bool, rotary_...
    method forward (line 198) | def forward(
  class GemmaMLP (line 257) | class GemmaMLP(nn.Module):
    method __init__ (line 258) | def __init__(self, prefix: str, config, weights):
    method forward (line 289) | def forward(self, hidden_states):
  class FlashGemmaLayer (line 295) | class FlashGemmaLayer(nn.Module):
    method __init__ (line 296) | def __init__(self, prefix: str, config, weights, causal: bool, rotary_...
    method forward (line 316) | def forward(
  class FlashGemmaModel (line 352) | class FlashGemmaModel(torch.nn.Module):
    method __init__ (line 353) | def __init__(self, prefix: str, config, weights, causal: bool):
    method forward (line 386) | def forward(
  class FlashGemmaForCausalLM (line 431) | class FlashGemmaForCausalLM(torch.nn.Module):
    method __init__ (line 432) | def __init__(self, prefix: str, config, weights, *, causal: bool = True):
    method forward (line 459) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py
  function load_qkv (line 45) | def load_qkv(config, prefix: str, weights, head_size, num_heads):
  function _load_qkv_gptq (line 56) | def _load_qkv_gptq(config, prefix: str, weights):
  function _load_qkv (line 87) | def _load_qkv(config, prefix: str, weights, head_size, num_heads):
  function load_row (line 134) | def load_row(config, prefix: str, weights, bias: bool):
  function load_col (line 153) | def load_col(config, prefix: str, weights, bias: bool):
  class FlashGPT2Attention (line 168) | class FlashGPT2Attention(torch.nn.Module):
    method __init__ (line 169) | def __init__(
    method forward (line 209) | def forward(
  class GPT2MLP (line 259) | class GPT2MLP(nn.Module):
    method __init__ (line 260) | def __init__(self, prefix: str, config, weights):
    method forward (line 290) | def forward(self, hidden_states):
  class FlashGPT2Layer (line 296) | class FlashGPT2Layer(nn.Module):
    method __init__ (line 297) | def __init__(self, prefix: str, config, weights):
    method forward (line 313) | def forward(
  class FlashGPT2Model (line 346) | class FlashGPT2Model(torch.nn.Module):
    method __init__ (line 347) | def __init__(self, prefix: str, config, weights):
    method forward (line 377) | def forward(
  class FlashGPT2ForCausalLM (line 416) | class FlashGPT2ForCausalLM(torch.nn.Module):
    method __init__ (line 417) | def __init__(self, prefix: str, config, weights):
    method forward (line 436) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py
  function load_attention (line 55) | def load_attention(config, prefix: str, weights):
  function load_row (line 65) | def load_row(config, prefix: str, weights, bias: bool):
  class GPTJRotary (line 78) | class GPTJRotary(PositionRotaryEmbedding):
    method forward (line 79) | def forward(
  class FlashGPTJAttention (line 107) | class FlashGPTJAttention(torch.nn.Module):
    method __init__ (line 108) | def __init__(
    method forward (line 149) | def forward(
  class GPTJMLP (line 209) | class GPTJMLP(nn.Module):
    method __init__ (line 210) | def __init__(self, prefix: str, config, weights):
    method forward (line 235) | def forward(self, hidden_states):
  class FlashGPTJLayer (line 241) | class FlashGPTJLayer(nn.Module):
    method __init__ (line 242) | def __init__(self, prefix: str, config, weights, rotary_emb):
    method forward (line 256) | def forward(
  class FlashGPTJModel (line 286) | class FlashGPTJModel(torch.nn.Module):
    method __init__ (line 287) | def __init__(self, prefix: str, config, weights):
    method forward (line 323) | def forward(
  class FlashGPTJForCausalLM (line 367) | class FlashGPTJForCausalLM(torch.nn.Module):
    method __init__ (line 368) | def __init__(self, prefix: str, config, weights):
    method forward (line 381) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama4_modeling.py
  function reshape_for_broadcast (line 55) | def reshape_for_broadcast(freqs: torch.Tensor, target):
  function apply_rotary_emb (line 61) | def apply_rotary_emb(
  function repeat_kv (line 94) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Llama4TextExperts (line 108) | class Llama4TextExperts(nn.Module):
    method __init__ (line 109) | def __init__(self, prefix, config, weights):
    method forward (line 127) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Llama4TextMLP (line 156) | class Llama4TextMLP(nn.Module):
    method __init__ (line 157) | def __init__(self, prefix, config, weights):
    method forward (line 180) | def forward(self, x):
  class Llama4TextL2Norm (line 186) | class Llama4TextL2Norm(torch.nn.Module):
    method __init__ (line 187) | def __init__(self, eps: float = 1e-6):
    method _norm (line 191) | def _norm(self, x):
    method forward (line 194) | def forward(self, x):
    method extra_repr (line 197) | def extra_repr(self):
  class Llama4TextMoe (line 201) | class Llama4TextMoe(nn.Module):
    method __init__ (line 202) | def __init__(
    method forward (line 223) | def forward(self, hidden_states, adapter_data):
  class Llama4TextRotaryEmbedding (line 265) | class Llama4TextRotaryEmbedding(nn.Module):
    method __init__ (line 266) | def __init__(self, config, device=None):
    method forward (line 281) | def forward(self, x, position_ids):
  class Llama4TextAttention (line 302) | class Llama4TextAttention(FlashLlamaAttention):
    method __init__ (line 305) | def __init__(self, prefix, config, weights, layer_idx):
    method forward (line 325) | def forward(
  class Llama4TextDecoderLayer (line 435) | class Llama4TextDecoderLayer(nn.Module):
    method __init__ (line 436) | def __init__(self, prefix, config, weights, layer_idx):
    method forward (line 460) | def forward(
  class Llama4TextModel (line 507) | class Llama4TextModel(nn.Module):
    method __init__ (line 509) | def __init__(self, prefix, config, weights):
    method forward (line 540) | def forward(
    method _update_causal_mask (line 600) | def _update_causal_mask(
    method create_chunked_attention_mask (line 735) | def create_chunked_attention_mask(
    method _prepare_4d_causal_attention_mask_with_cache_position (line 761) | def _prepare_4d_causal_attention_mask_with_cache_position(
  class Llama4ForCausalLM (line 826) | class Llama4ForCausalLM(nn.Module):
    method __init__ (line 827) | def __init__(self, prefix, config, weights):
    method forward (line 839) | def forward(
  class Llama4VisionMLP2 (line 873) | class Llama4VisionMLP2(torch.nn.Module):
    method __init__ (line 874) | def __init__(self, prefix, config, weights):
    method forward (line 887) | def forward(self, hidden_states):
  class Llama4MultiModalProjector (line 897) | class Llama4MultiModalProjector(nn.Module):
    method __init__ (line 898) | def __init__(self, prefix, config, weights):
    method forward (line 904) | def forward(self, image_features):
  function pixel_shuffle (line 909) | def pixel_shuffle(input_tensor, shuffle_ratio):
  class Llama4VisionPixelShuffleMLP (line 932) | class Llama4VisionPixelShuffleMLP(nn.Module):
    method __init__ (line 933) | def __init__(self, prefix, config, weights):
    method forward (line 944) | def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
  function vision_reshape_for_broadcast (line 950) | def vision_reshape_for_broadcast(freqs_ci: torch.Tensor, query: torch.Te...
  class Llama4VisionAttention (line 956) | class Llama4VisionAttention(nn.Module):
    method __init__ (line 957) | def __init__(self, prefix, config, weights):
    method forward (line 981) | def forward(
  class Llama4VisionMLP (line 1027) | class Llama4VisionMLP(nn.Module):
    method __init__ (line 1028) | def __init__(self, prefix, config, weights):
    method forward (line 1039) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Llama4VisionEncoderLayer (line 1046) | class Llama4VisionEncoderLayer(nn.Module):
    method __init__ (line 1047) | def __init__(self, prefix, config, weights):
    method forward (line 1065) | def forward(
  class Llama4VisionEncoder (line 1093) | class Llama4VisionEncoder(nn.Module):
    method __init__ (line 1102) | def __init__(self, prefix, config, weights):
    method forward (line 1116) | def forward(
  class Llama4UnfoldConvolution (line 1135) | class Llama4UnfoldConvolution(nn.Module):
    method __init__ (line 1136) | def __init__(self, prefix, config, weights):
    method forward (line 1146) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Llama4VisionRotaryEmbedding (line 1153) | class Llama4VisionRotaryEmbedding(nn.Module):
    method __init__ (line 1154) | def __init__(self, config, weights):
    method forward (line 1192) | def forward(self, hidden_states):
  class Llama4VisionModel (line 1199) | class Llama4VisionModel(nn.Module):
    method __init__ (line 1201) | def __init__(self, prefix, config, weights):
    method forward (line 1243) | def forward(
  class Llama4ForConditionalGeneration (line 1298) | class Llama4ForConditionalGeneration(nn.Module):
    method __init__ (line 1300) | def __init__(self, prefix: str, config, weights):
    method get_image_features (line 1328) | def get_image_features(
    method get_vision_embeds (line 1359) | def get_vision_embeds(
    method get_inputs_embeds (line 1376) | def get_inputs_embeds(
    method forward (line 1411) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
  function load_attention (line 64) | def load_attention(config, prefix: str, weights, layer_id):
  function no_fp8 (line 117) | def no_fp8(weights: Weights):
  class FlashLlamaAttention (line 129) | class FlashLlamaAttention(torch.nn.Module):
    method __init__ (line 130) | def __init__(
    method forward (line 189) | def forward(
  class Phi3MoE (line 250) | class Phi3MoE(nn.Module):
    method __init__ (line 251) | def __init__(
    method forward (line 274) | def forward(self, x, adapter_data) -> torch.Tensor:
  class LlamaMLP (line 286) | class LlamaMLP(nn.Module):
    method __init__ (line 287) | def __init__(self, prefix, config, weights, index):
    method forward (line 359) | def forward(self, hidden_states, adapter_data):
  class FlashLlamaLayer (line 367) | class FlashLlamaLayer(nn.Module):
    method __init__ (line 368) | def __init__(self, index, prefix, config, weights, rotary_emb):
    method forward (line 420) | def forward(
  class FlashLlamaModel (line 462) | class FlashLlamaModel(torch.nn.Module):
    method __init__ (line 463) | def __init__(self, prefix, config, weights):
    method forward (line 545) | def forward(
  class FlashLlamaForCausalLM (line 594) | class FlashLlamaForCausalLM(torch.nn.Module):
    method __init__ (line 595) | def __init__(self, prefix: str, config, weights, name=None):
    method forward (line 640) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py
  function get_anyres_image_grid_shape (line 37) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
  function unpad_image (line 60) | def unpad_image(tensor, original_size):
  class LlavaNextMultiModalProjector (line 94) | class LlavaNextMultiModalProjector(nn.Module):
    method __init__ (line 95) | def __init__(self, prefix, config, weights):
    method forward (line 106) | def forward(self, image_features):
  class FlashLlavaNextForConditionalGeneration (line 113) | class FlashLlavaNextForConditionalGeneration(nn.Module):
    method __init__ (line 114) | def __init__(self, prefix, config, weights):
    method _merge_input_ids_with_image_features (line 149) | def _merge_input_ids_with_image_features(
    method get_vision_embeds (line 166) | def get_vision_embeds(
    method get_inputs_embeds (line 254) | def get_inputs_embeds(
    method forward (line 271) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
  class MistralConfig (line 52) | class MistralConfig(PretrainedConfig):
    method __init__ (line 55) | def __init__(
  class MistralAttention (line 106) | class MistralAttention(torch.nn.Module):
    method __init__ (line 107) | def __init__(self, prefix: str, config, weights, layer_id, rotary_emb):
    method forward (line 172) | def forward(
  class MistralMLP (line 235) | class MistralMLP(nn.Module):
    method __init__ (line 236) | def __init__(self, prefix: str, config, weights, layer_id):
    method forward (line 290) | def forward(self, hidden_states, adapter_data):
  class MistralLayer (line 298) | class MistralLayer(nn.Module):
    method __init__ (line 299) | def __init__(self, prefix: str, config, weights, layer_id, rotary_emb):
    method forward (line 321) | def forward(
  class MistralModel (line 359) | class MistralModel(torch.nn.Module):
    method __init__ (line 360) | def __init__(self, prefix: str, config, weights):
    method forward (line 401) | def forward(
  class FlashMistralForCausalLM (line 445) | class FlashMistralForCausalLM(torch.nn.Module):
    method __init__ (line 446) | def __init__(self, prefix: str, config, weights, name=None):
    method forward (line 478) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
  class MixtralConfig (line 51) | class MixtralConfig(PretrainedConfig):
    method __init__ (line 54) | def __init__(
  function promote_scalar (line 109) | def promote_scalar(x: torch.Tensor) -> torch.Tensor:
  function load_attention (line 113) | def load_attention(config, prefix: str, weights):
  function _load_gqa (line 126) | def _load_gqa(config, prefix: str, weights):
  function _load_experts (line 149) | def _load_experts(config, prefix: str, mat, weights):
  class MixtralAttention (line 185) | class MixtralAttention(torch.nn.Module):
    method __init__ (line 186) | def __init__(
    method forward (line 228) | def forward(
  function select_experts (line 288) | def select_experts(gate_logits: torch.Tensor, top_k: int):
  function round_up (line 301) | def round_up(x: torch.Tensor, value: int):
  class MixtralMoE (line 305) | class MixtralMoE(nn.Module):
    method __init__ (line 306) | def __init__(
    method forward (line 330) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class MixtralLayer (line 342) | class MixtralLayer(nn.Module):
    method __init__ (line 343) | def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
    method forward (line 370) | def forward(
  class MixtralModel (line 406) | class MixtralModel(torch.nn.Module):
    method __init__ (line 407) | def __init__(self, prefix: str, config, weights):
    method forward (line 445) | def forward(
  class FlashMixtralForCausalLM (line 489) | class FlashMixtralForCausalLM(torch.nn.Module):
    method __init__ (line 490) | def __init__(self, prefix: str, config, weights):
    method forward (line 506) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py
  function _prepare_aspect_ratio_attention_mask (line 44) | def _prepare_aspect_ratio_attention_mask(
  function _prepare_4d_causal_attention_mask_with_cache_position (line 76) | def _prepare_4d_causal_attention_mask_with_cache_position(
  function _prepare_cross_attention_mask (line 140) | def _prepare_cross_attention_mask(
  class MllamaVisionMLP (line 173) | class MllamaVisionMLP(nn.Module):
    method __init__ (line 174) | def __init__(self, *, prefix, config, weights):
    method forward (line 185) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class MllamaVisionSdpaAttention (line 192) | class MllamaVisionSdpaAttention(nn.Module):
    method __init__ (line 193) | def __init__(self, *, prefix, config, weights):
    method forward (line 214) | def forward(
  class MllamaVisionEncoderLayer (line 260) | class MllamaVisionEncoderLayer(nn.Module):
    method __init__ (line 261) | def __init__(self, *, prefix, config, weights, is_gated: bool):
    method forward (line 292) | def forward(
  class MllamaVisionEncoder (line 313) | class MllamaVisionEncoder(nn.Module):
    method __init__ (line 314) | def __init__(self, *, prefix, config, weights, is_gated: bool, num_lay...
    method forward (line 327) | def forward(
  class MllamaPrecomputedAspectRatioEmbedding (line 350) | class MllamaPrecomputedAspectRatioEmbedding(nn.Module):
    method __init__ (line 351) | def __init__(self, *, prefix, config, weights):
    method forward (line 364) | def forward(
  class MllamaPrecomputedPositionEmbedding (line 377) | class MllamaPrecomputedPositionEmbedding(nn.Module):
    method __init__ (line 378) | def __init__(self, *, prefix, config, weights):
    method forward (line 399) | def forward(
  class MllamaVisionModel (line 419) | class MllamaVisionModel(nn.Module):
    method __init__ (line 420) | def __init__(self, *, prefix, config, weights):
    method apply_class_embedding (line 496) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T...
    method forward (line 502) | def forward(
  class MllamaTextCrossAttention (line 634) | class MllamaTextCrossAttention(nn.Module):
    method __init__ (line 637) | def __init__(self, *, prefix, config, weights, layer_idx):
    method forward (line 686) | def forward(
  class MllamaTextMLP (line 744) | class MllamaTextMLP(nn.Module):
    method __init__ (line 745) | def __init__(self, *, prefix, config, weights):
    method forward (line 767) | def forward(self, x):
  class FlashLlamaCrossLayer (line 777) | class FlashLlamaCrossLayer(torch.nn.Module):
    method __init__ (line 780) | def __init__(self, *, prefix, config, weights, index) -> None:
    method forward (line 808) | def forward(
  class MllamaTextRMSNorm (line 852) | class MllamaTextRMSNorm(nn.Module):
    method __init__ (line 853) | def __init__(self, weight, eps):
    method load (line 859) | def load(cls, *, prefix, weights, eps):
    method forward (line 865) | def forward(self, hidden_states):
    method extra_repr (line 872) | def extra_repr(self):
  class FlashMllamaForConditionalGeneration (line 876) | class FlashMllamaForConditionalGeneration(nn.Module):
    method __init__ (line 877) | def __init__(self, prefix, config, weights):
    method vision_forward (line 898) | def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_...
    method forward (line 916) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
  class GPTNeoXConfig (line 54) | class GPTNeoXConfig(TransformersGPTNeoXConfig):
  function load_row (line 60) | def load_row(config, prefix: str, weights, bias: bool):
  function load_qkv (line 76) | def load_qkv(config, prefix: str, weights, num_heads, head_size, hidden_...
  class FlashNeoxAttention (line 101) | class FlashNeoxAttention(torch.nn.Module):
    method __init__ (line 102) | def __init__(self, config, prefix, weights, rotary_emb):
    method forward (line 138) | def forward(
  class FlashMLP (line 197) | class FlashMLP(nn.Module):
    method __init__ (line 198) | def __init__(self, config, prefix, weights):
    method forward (line 219) | def forward(self, hidden_states):
  class FlashNeoXLayer (line 226) | class FlashNeoXLayer(nn.Module):
    method __init__ (line 227) | def __init__(self, layer_id, config, weights, rotary_emb):
    method forward (line 253) | def forward(
  class FlashGPTNeoXPreTrainedModel (line 311) | class FlashGPTNeoXPreTrainedModel(PreTrainedModel):
  class FlashGPTNeoXModel (line 318) | class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
    method __init__ (line 319) | def __init__(self, prefix: str, config, weights):
    method forward (line 353) | def forward(
  class FlashGPTNeoXForCausalLM (line 397) | class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel):
    method __init__ (line 398) | def __init__(self, prefix, config, weights):
    method forward (line 412) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
  class PaliGemmaForConditionalGeneration (line 29) | class PaliGemmaForConditionalGeneration(nn.Module):
    method __init__ (line 30) | def __init__(self, prefix, config, weights):
    method get_vision_embeds (line 67) | def get_vision_embeds(
    method get_inputs_embeds (line 83) | def get_inputs_embeds(
    method forward (line 96) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
  class PhiConfig (line 33) | class PhiConfig(PretrainedConfig):
    method __init__ (line 34) | def __init__(
  function load_attention (line 73) | def load_attention(config, prefix, weights):
  function _load_gqa (line 86) | def _load_gqa(config, prefix: str, weights):
  class FlashPhiAttention (line 110) | class FlashPhiAttention(torch.nn.Module):
    method __init__ (line 111) | def __init__(
    method forward (line 153) | def forward(
  class PhiMLP (line 221) | class PhiMLP(nn.Module):
    method __init__ (line 222) | def __init__(self, prefix, config, weights):
    method forward (line 250) | def forward(self, hidden_states):
  class FlashPhiLayer (line 256) | class FlashPhiLayer(nn.Module):
    method __init__ (line 257) | def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
    method forward (line 274) | def forward(
  class FlashPhiModel (line 306) | class FlashPhiModel(torch.nn.Module):
    method __init__ (line 307) | def __init__(self, prefix: str, config, weights):
    method forward (line 350) | def forward(
  class FlashPhiForCausalLM (line 394) | class FlashPhiForCausalLM(torch.nn.Module):
    method __init__ (line 395) | def __init__(self, prefix: str, config, weights):
    method forward (line 410) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py
  class PhiMoEConfig (line 29) | class PhiMoEConfig(PretrainedConfig):
    method __init__ (line 120) | def __init__(
    method _rope_scaling_validation (line 190) | def _rope_scaling_validation(self):

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
  function load_attention (line 29) | def load_attention(config, prefix, weights):
  function _load_gqa (line 42) | def _load_gqa(config, prefix: str, weights):
  class Qwen2Attention (line 55) | class Qwen2Attention(torch.nn.Module):
    method __init__ (line 56) | def __init__(
    method forward (line 101) | def forward(
  class Qwen2MLP (line 161) | class Qwen2MLP(nn.Module):
    method __init__ (line 162) | def __init__(self, prefix, config, weights):
    method forward (line 193) | def forward(self, hidden_states):
  class Qwen2Layer (line 199) | class Qwen2Layer(nn.Module):
    method __init__ (line 200) | def __init__(self, prefix, layer_id, config, weights, rotary_emb):
    method forward (line 219) | def forward(
  class Qwen2Model (line 253) | class Qwen2Model(torch.nn.Module):
    method __init__ (line 254) | def __init__(self, prefix: str, config, weights):
    method forward (line 292) | def forward(
  class Qwen2ForCausalLM (line 336) | class Qwen2ForCausalLM(torch.nn.Module):
    method __init__ (line 337) | def __init__(self, prefix: str, config, weights):
    method forward (line 365) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_modeling.py
  class Qwen3Attention (line 41) | class Qwen3Attention(nn.Module):
    method __init__ (line 44) | def __init__(self, config, prefix, weights, layer_idx, rotary_emb):
    method forward (line 112) | def forward(
  class Qwen3DecoderLayer (line 177) | class Qwen3DecoderLayer(nn.Module):
    method __init__ (line 178) | def __init__(self, config, prefix, weights, layer_idx: int, rotary_emb):
    method forward (line 198) | def forward(
  class Qwen3Model (line 235) | class Qwen3Model(nn.Module):
    method __init__ (line 236) | def __init__(self, config, prefix: str, weights):
    method forward (line 267) | def forward(
  class Qwen3ForCausalLM (line 314) | class Qwen3ForCausalLM(nn.Module):
    method __init__ (line 316) | def __init__(self, prefix: str, config, weights):
    method forward (line 336) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
  function rotate_half (line 47) | def rotate_half(x):
  function apply_rotary_pos_emb (line 54) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
  class Qwen3MoeAttention (line 81) | class Qwen3MoeAttention(nn.Module):
    method __init__ (line 84) | def __init__(self, config, prefix, weights, layer_idx, rotary_emb):
    method forward (line 143) | def forward(
  class Qwen3MoE (line 202) | class Qwen3MoE(nn.Module):
    method __init__ (line 203) | def __init__(self, prefix, config, moe_layer_cls: Type[MoELayer], weig...
    method forward (line 226) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class Qwen3MoeMLP (line 237) | class Qwen3MoeMLP(nn.Module):
    method __init__ (line 238) | def __init__(self, prefix, config, weights, intermediate_size=None):
    method forward (line 267) | def forward(self, x):
  class Qwen3MoeSparseMoeBlock (line 273) | class Qwen3MoeSparseMoeBlock(nn.Module):
    method __init__ (line 274) | def __init__(self, prefix, config, weights):
    method forward (line 295) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Qwen3MoeDecoderLayer (line 343) | class Qwen3MoeDecoderLayer(nn.Module):
    method __init__ (line 344) | def __init__(self, config, prefix, weights, layer_idx: int, rotary_emb):
    method forward (line 387) | def forward(
  class Qwen3MoeModel (line 428) | class Qwen3MoeModel(nn.Module):
    method __init__ (line 429) | def __init__(self, config, prefix: str, weights):
    method forward (line 460) | def forward(
  class Qwen3MoeForCausalLM (line 502) | class Qwen3MoeForCausalLM(nn.Module):
    method __init__ (line 504) | def __init__(self, prefix: str, config, weights):
    method forward (line 524) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
  function load_row (line 28) | def load_row(config, prefix: str, weights, bias: bool):
  class RWConfig (line 44) | class RWConfig(PretrainedConfig):
    method __init__ (line 51) | def __init__(
  class FlashRWAttention (line 131) | class FlashRWAttention(torch.nn.Module):
    method __init__ (line 132) | def __init__(
    method forward (line 176) | def forward(
  class FlashRWLargeAttention (line 236) | class FlashRWLargeAttention(torch.nn.Module):
    method __init__ (line 237) | def __init__(
    method forward (line 290) | def forward(
  class FlashMLP (line 351) | class FlashMLP(nn.Module):
    method __init__ (line 352) | def __init__(self, config, prefix: str, weights):
    method forward (line 363) | def forward(self, hidden_states):
  class FlashRWLayer (line 370) | class FlashRWLayer(nn.Module):
    method __init__ (line 371) | def __init__(
    method forward (line 420) | def forward(
  class FlashRWLayerNorm (line 477) | class FlashRWLayerNorm(nn.Module):
    method __init__ (line 478) | def __init__(self, config, prefix: str, weights):
    method forward (line 508) | def forward(
  class FlashRWLargeLayer (line 522) | class FlashRWLargeLayer(nn.Module):
    method __init__ (line 523) | def __init__(self, layer_id, prefix: str, config, weights, rotary_emb):
    method forward (line 541) | def forward(
  class FlashRWPreTrainedModel (line 579) | class FlashRWPreTrainedModel(PreTrainedModel):
  class FlashRWModel (line 583) | class FlashRWModel(FlashRWPreTrainedModel):
    method __init__ (line 584) | def __init__(self, prefix: str, config, weights):
    method forward (line 623) | def forward(
  class FlashRWForCausalLM (line 667) | class FlashRWForCausalLM(FlashRWPreTrainedModel):
    method __init__ (line 668) | def __init__(self, prefix: str, config, weights):
    method forward (line 680) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
  function load_multi_mqa (line 30) | def load_multi_mqa(
  function _load_multi_mqa_gptq (line 43) | def _load_multi_mqa_gptq(
  function _load_multi_mqa (line 130) | def _load_multi_mqa(
  function load_col (line 200) | def load_col(config, prefix: str, weights, bias: bool):
  function load_row (line 213) | def load_row(config, prefix: str, weights, bias: bool):
  class FlashMQAttention (line 229) | class FlashMQAttention(torch.nn.Module):
    method __init__ (line 230) | def __init__(self, prefix, config, weights):
    method forward (line 265) | def forward(
  class MLP (line 319) | class MLP(nn.Module):
    method __init__ (line 320) | def __init__(self, prefix, config, weights):
    method forward (line 341) | def forward(self, hidden_states):
  class Block (line 348) | class Block(nn.Module):
    method __init__ (line 349) | def __init__(self, prefix: str, layer_id, config, weights):
    method forward (line 369) | def forward(
  class FlashSantacoderModel (line 396) | class FlashSantacoderModel(nn.Module):
    method __init__ (line 397) | def __init__(self, prefix: str, config, weights):
    method forward (line 431) | def forward(
  class FlashSantacoderForCausalLM (line 472) | class FlashSantacoderForCausalLM(nn.Module):
    method __init__ (line 473) | def __init__(self, prefix, config, weights):
    method forward (line 487) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
  class Starcoder2Config (line 57) | class Starcoder2Config(PretrainedConfig):
    method __init__ (line 60) | def __init__(
  function load_attention (line 117) | def load_attention(config, prefix, weights, layer_id):
  function _load_gqa (line 144) | def _load_gqa(config, prefix: str, weights):
  class Starcoder2Attention (line 176) | class Starcoder2Attention(torch.nn.Module):
    method __init__ (line 177) | def __init__(
    method forward (line 228) | def forward(
  class Starcoder2MLP (line 291) | class Starcoder2MLP(nn.Module):
    method __init__ (line 292) | def __init__(self, prefix, config, weights, index):
    method forward (line 334) | def forward(self, hidden_states, adapter_data):
  class Starcoder2GatedMLP (line 340) | class Starcoder2GatedMLP(nn.Module):
    method __init__ (line 341) | def __init__(self, index, prefix, config, weights):
    method forward (line 390) | def forward(self, hidden_states, adapter_data):
  class Starcoder2Layer (line 409) | class Starcoder2Layer(nn.Module):
    method __init__ (line 410) | def __init__(self, layer_id, config, weights, rotary_emb):
    method forward (line 436) | def forward(
  class Starcoder2Model (line 474) | class Starcoder2Model(torch.nn.Module):
    method __init__ (line 475) | def __init__(self, prefix, config, weights):
    method forward (line 511) | def forward(
  class FlashStarcoder2ForCausalLM (line 557) | class FlashStarcoder2ForCausalLM(torch.nn.Module):
    method __init__ (line 558) | def __init__(self, prefix, config, weights):
    method forward (line 587) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py
  function repeat_kv (line 39) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Idefics2VisionEmbeddings (line 53) | class Idefics2VisionEmbeddings(nn.Module):
    method __init__ (line 64) | def __init__(self, prefix, config, weights):
    method forward (line 91) | def forward(
  class Idefics2VisionAttention (line 134) | class Idefics2VisionAttention(nn.Module):
    method __init__ (line 135) | def __init__(self, prefix, config, weights):
    method forward (line 164) | def forward(
  class Idefics2VisionMLP (line 232) | class Idefics2VisionMLP(nn.Module):
    method __init__ (line 233) | def __init__(self, prefix, config, weights):
    method forward (line 244) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Idefics2EncoderLayer (line 251) | class Idefics2EncoderLayer(nn.Module):
    method __init__ (line 252) | def __init__(self, prefix, config, weights):
    method forward (line 269) | def forward(
  class Idefics2Encoder (line 291) | class Idefics2Encoder(nn.Module):
    method __init__ (line 292) | def __init__(self, prefix, config, weights):
    method forward (line 305) | def forward(
  class Idefics2VisionTransformer (line 319) | class Idefics2VisionTransformer(nn.Module):
    method __init__ (line 320) | def __init__(self, prefix, config, weights):
    method forward (line 335) | def forward(
  class Idefics2MLP (line 380) | class Idefics2MLP(nn.Module):
    method __init__ (line 381) | def __init__(self, prefix, config, weights):
    method forward (line 408) | def forward(self, hidden_states):
  class Idefics2RMSNorm (line 418) | class Idefics2RMSNorm(nn.Module):
    method __init__ (line 419) | def __init__(self, prefix, weights, eps):
    method forward (line 429) | def forward(self, hidden_states):
  class Idefics2PerceiverAttention (line 437) | class Idefics2PerceiverAttention(nn.Module):
    method __init__ (line 438) | def __init__(self, prefix, config, weights):
    method forward (line 472) | def forward(
  class Idefics2PerceiverLayer (line 544) | class Idefics2PerceiverLayer(nn.Module):
    method __init__ (line 545) | def __init__(self, prefix, config, weights):
    method forward (line 572) | def forward(
  class Idefics2PerceiverResampler (line 605) | class Idefics2PerceiverResampler(nn.Module):
    method __init__ (line 606) | def __init__(self, prefix, config, weights) -> None:
    method forward (line 632) | def forward(
  class Idefics2Connector (line 664) | class Idefics2Connector(nn.Module):
    method __init__ (line 665) | def __init__(self, prefix, config, weights):
    method forward (line 674) | def forward(self, image_hidden_states, attention_mask):
  class Idefics2ForConditionalGeneration (line 682) | class Idefics2ForConditionalGeneration(nn.Module):
    method __init__ (line 683) | def __init__(self, prefix, config, weights):
    method _merge_input_ids_with_image_features (line 723) | def _merge_input_ids_with_image_features(
    method get_vision_embeds (line 737) | def get_vision_embeds(
    method get_inputs_embeds (line 820) | def get_inputs_embeds(
    method forward (line 835) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py
  function repeat_kv (line 38) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Idefics3VisionEmbeddings (line 52) | class Idefics3VisionEmbeddings(nn.Module):
    method __init__ (line 63) | def __init__(self, prefix, config, weights):
    method forward (line 90) | def forward(
  class Idefics3VisionAttention (line 133) | class Idefics3VisionAttention(nn.Module):
    method __init__ (line 134) | def __init__(self, prefix, config, weights):
    method forward (line 163) | def forward(
  class Idefics3VisionMLP (line 231) | class Idefics3VisionMLP(nn.Module):
    method __init__ (line 232) | def __init__(self, prefix, config, weights):
    method forward (line 243) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Idefics3EncoderLayer (line 250) | class Idefics3EncoderLayer(nn.Module):
    method __init__ (line 251) | def __init__(self, prefix, config, weights):
    method forward (line 268) | def forward(
  class Idefics3Encoder (line 290) | class Idefics3Encoder(nn.Module):
    method __init__ (line 291) | def __init__(self, prefix, config, weights):
    method forward (line 304) | def forward(
  class Idefics3VisionTransformer (line 318) | class Idefics3VisionTransformer(nn.Module):
    method __init__ (line 319) | def __init__(self, prefix, config, weights):
    method forward (line 334) | def forward(
  class Idefics3SimpleMLP (line 379) | class Idefics3SimpleMLP(nn.Module):
    method __init__ (line 380) | def __init__(self, prefix, config, weights):
    method forward (line 391) | def forward(self, x):
  class Idefics3Connector (line 395) | class Idefics3Connector(nn.Module):
    method __init__ (line 396) | def __init__(self, prefix, config, weights):
    method pixel_shuffle (line 401) | def pixel_shuffle(self, x, scale_factor=2):
    method forward (line 417) | def forward(self, image_hidden_states):
  class Idefics3ForConditionalGeneration (line 423) | class Idefics3ForConditionalGeneration(nn.Module):
    method __init__ (line 424) | def __init__(self, prefix, config, weights):
    method _merge_input_ids_with_image_features (line 466) | def _merge_input_ids_with_image_features(
    method get_vision_embeds (line 480) | def get_vision_embeds(
    method get_inputs_embeds (line 563) | def get_inputs_embeds(
    method forward (line 578) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py
  class InferenceParams (line 25) | class InferenceParams:
  class MambaConfig (line 36) | class MambaConfig(PretrainedConfig):
    method __init__ (line 37) | def __init__(
  class MambaBlock (line 71) | class MambaBlock(nn.Module):
    method __init__ (line 72) | def __init__(self, prefix, config, weights, layer_id):
    method forward (line 94) | def forward(self, hidden_states: torch.Tensor, inference_params=None):
    method step (line 140) | def step(self, hidden_states, conv_state, ssm_state):
  class ResidualBlock (line 170) | class ResidualBlock(nn.Module):
    method __init__ (line 171) | def __init__(self, prefix, config, weights, layer_id):
    method forward (line 180) | def forward(
  class MambaModel (line 195) | class MambaModel(nn.Module):
    method __init__ (line 196) | def __init__(self, config, weights):
    method forward (line 218) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
  class Qwen2_5_VLVideosProcessorKwargs (line 68) | class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
  class Qwen2_5_VLProcessorKwargs (line 72) | class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
  class Qwen2_5_VLProcessor (line 82) | class Qwen2_5_VLProcessor(ProcessorMixin):
    method __init__ (line 102) | def __init__(
    method __call__ (line 117) | def __call__(
    method batch_decode (line 237) | def batch_decode(self, *args, **kwargs):
    method decode (line 244) | def decode(self, *args, **kwargs):
    method post_process_image_text_to_text (line 251) | def post_process_image_text_to_text(self, generated_outputs):
    method model_input_names (line 270) | def model_input_names(self):
  class Qwen2_5_VLVisionConfig (line 280) | class Qwen2_5_VLVisionConfig(PretrainedConfig):
    method __init__ (line 284) | def __init__(
  class Qwen2_5_VLConfig (line 320) | class Qwen2_5_VLConfig(PretrainedConfig):
    method __init__ (line 322) | def __init__(
  class Qwen2_5VLAttention (line 384) | class Qwen2_5VLAttention(nn.Module):
    method __init__ (line 385) | def __init__(self, *, prefix, config, weights):
    method forward (line 409) | def forward(
  class Qwen2_5VLVisionMLP (line 478) | class Qwen2_5VLVisionMLP(nn.Module):
    method __init__ (line 479) | def __init__(self, *, prefix, config, weights):
    method forward (line 497) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Qwen2_5VLVisionBlock (line 505) | class Qwen2_5VLVisionBlock(nn.Module):
    method __init__ (line 506) | def __init__(self, prefix, config, weights):
    method forward (line 529) | def forward(self, hidden_states, cu_seqlens, cos, sin, max_seqlen) -> ...
  class Qwen2_5VLPatchMerger (line 539) | class Qwen2_5VLPatchMerger(nn.Module):
    method __init__ (line 540) | def __init__(self, *, prefix, config, weights):
    method forward (line 555) | def forward(self, hidden_states) -> torch.Tensor:
  class Qwen2_5VisionModel (line 564) | class Qwen2_5VisionModel(nn.Module):
    method __init__ (line 565) | def __init__(self, *, prefix, config, weights):
    method apply_class_embedding (line 612) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T...
    method get_window_index (line 618) | def get_window_index(self, grid_thw):
    method forward (line 665) | def forward(
  class Qwen2_5VLForConditionalGeneration (line 774) | class Qwen2_5VLForConditionalGeneration(nn.Module):
    method __init__ (line 775) | def __init__(self, prefix, config, weights):
    method get_position_ids (line 824) | def get_position_ids(
    method get_vision_embeds (line 898) | def get_vision_embeds(
    method get_inputs_embeds (line 908) | def get_inputs_embeds(
    method forward (line 922) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_vl.py
  class Qwen2VLAttention (line 54) | class Qwen2VLAttention(nn.Module):
    method __init__ (line 55) | def __init__(self, *, prefix, config, weights):
    method forward (line 78) | def forward(
  class Qwen2VLVisionMLP (line 147) | class Qwen2VLVisionMLP(nn.Module):
    method __init__ (line 148) | def __init__(self, *, prefix, config, weights):
    method forward (line 158) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Qwen2VLVisionBlock (line 165) | class Qwen2VLVisionBlock(nn.Module):
    method __init__ (line 166) | def __init__(self, prefix, config, weights):
    method forward (line 189) | def forward(self, hidden_states, cu_seqlens, cos, sin, max_seqlen) -> ...
  class Qwen2VLPatchMerger (line 198) | class Qwen2VLPatchMerger(nn.Module):
    method __init__ (line 199) | def __init__(self, *, prefix, config, weights):
    method forward (line 214) | def forward(self, hidden_states) -> torch.Tensor:
  class Qwen2VisionModel (line 223) | class Qwen2VisionModel(nn.Module):
    method __init__ (line 224) | def __init__(self, *, prefix, config, weights):
    method apply_class_embedding (line 266) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T...
    method forward (line 272) | def forward(
  class Qwen2VLForConditionalGeneration (line 349) | class Qwen2VLForConditionalGeneration(nn.Module):
    method __init__ (line 350) | def __init__(self, prefix, config, weights):
    method get_position_ids (line 404) | def get_position_ids(
    method get_vision_embeds (line 478) | def get_vision_embeds(
    method get_inputs_embeds (line 488) | def get_inputs_embeds(
    method forward (line 502) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/siglip.py
  class SiglipVisionEmbeddings (line 21) | class SiglipVisionEmbeddings(nn.Module):
    method __init__ (line 22) | def __init__(self, prefix, config: SiglipVisionConfig, weights):
    method forward (line 52) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
  class SiglipAttention (line 62) | class SiglipAttention(nn.Module):
    method __init__ (line 65) | def __init__(self, prefix, config, weights):
    method _shape (line 95) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 102) | def forward(
  class SiglipMLP (line 163) | class SiglipMLP(nn.Module):
    method __init__ (line 164) | def __init__(self, prefix, config, weights):
    method forward (line 175) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class SiglipEncoderLayer (line 182) | class SiglipEncoderLayer(nn.Module):
    method __init__ (line 183) | def __init__(self, prefix, config: SiglipConfig, weights):
    method forward (line 197) | def forward(
  class SiglipMultiheadAttentionPoolingHead (line 216) | class SiglipMultiheadAttentionPoolingHead(nn.Module):
    method __init__ (line 219) | def __init__(self, prefix, config: SiglipVisionConfig, weights):
    method forward (line 229) | def forward(self, hidden_state):
  function _trunc_normal_ (line 242) | def _trunc_normal_(tensor, mean, std, a, b):
  function trunc_normal_tf_ (line 278) | def trunc_normal_tf_(
  function variance_scaling_ (line 308) | def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="no...
  function lecun_normal_ (line 333) | def lecun_normal_(tensor):
  function default_flax_embed_init (line 337) | def default_flax_embed_init(tensor):
  class SiglipEncoder (line 341) | class SiglipEncoder(nn.Module):
    method __init__ (line 350) | def __init__(self, prefix, config: SiglipConfig, weights):
    method forward (line 362) | def forward(
  class SiglipVisionTransformer (line 377) | class SiglipVisionTransformer(nn.Module):
    method __init__ (line 378) | def __init__(self, prefix, config: SiglipVisionConfig, weights):
    method forward (line 389) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py
  function load_text_model (line 1) | def load_text_model(prefix, config, weights, name=None):
  function load_vision_model (line 42) | def load_vision_model(prefix, config, weights):

FILE: backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
  function generate_block_metadata (line 84) | def generate_block_metadata(
  class FlashCausalLMBatch (line 171) | class FlashCausalLMBatch(Batch):
    method to_pb (line 261) | def to_pb(self) -> generate_pb2.CachedBatch:
    method batch_tokenized_inputs (line 275) | def batch_tokenized_inputs(
    method from_tokenized (line 295) | def from_tokenized(
    method from_pb (line 495) | def from_pb(
    method filter (line 507) | def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch":
    method concatenate (line 699) | def concatenate(
    method prepare_for_decode (line 980) | def prepare_for_decode(
    method prepare_for_prefill (line 1097) | def prepare_for_prefill(
    method __len__ (line 1422) | def __len__(self):
  class FlashCausalLM (line 1438) | class FlashCausalLM(Model):
    method __init__ (line 1439) | def __init__(
    method batch_type (line 1592) | def batch_type(self) -> Type[FlashCausalLMBatch]:
    method max_past (line 1595) | def max_past(self) -> int:
    method init_kv_cache (line 1598) | def init_kv_cache(
    method warmup (line 1631) | def warmup(
    method log_warmup (line 1766) | def log_warmup(self, prefilling, i, max_i, batch_size, seq_len):
    method use_graphs (line 1782) | def use_graphs(self, prefill, seq_len, batch_size):
    method align_workers (line 1791) | def align_workers(self, value, op):
    method warmup_hpu_graph (line 1798) | def warmup_hpu_graph(self, batch):
    method warmup_prefill (line 1908) | def warmup_prefill(
    method warmup_decode (line 1964) | def warmup_decode(self, batch_size: int, block_num: int, batch: FlashC...
    method forward (line 2063) | def forward(
    method generate_token (line 2179) | def generate_token(

FILE: backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
  function prompt_split_image_llama4 (line 44) | def prompt_split_image_llama4(aspect_ratio, num_patches_per_chunk):
  function _prompt_split_image (line 72) | def _prompt_split_image(
  function get_anyres_image_grid_shape (line 101) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
  function image_text_replacement (line 124) | def image_text_replacement(processor, image_input, config) -> str:
  function image_text_replacement_fixup (line 197) | def image_text_replacement_fixup(config, text: str) -> str:
  function preprocess_text (line 205) | def preprocess_text(config, text: str) -> str:
  function preprocess_image (line 211) | def preprocess_image(config, img):
  function get_unpadded_features (line 226) | def get_unpadded_features(
  function get_number_of_features (line 253) | def get_number_of_features(height: int, width: int, config) -> int:
  function scatter_image_embeds (line 280) | def scatter_image_embeds(
  function gather_image_embeds (line 294) | def gather_image_embeds(
  class ImagePositions (line 304) | class ImagePositions:
  class FlashVlmCausalLMBatch (line 312) | class FlashVlmCausalLMBatch(FlashCausalLMBatch):
    method concatenate (line 326) | def concatenate(cls, batches, padded_total_bs: int = 0):
    method filter (line 356) | def filter(self, request_ids: List[int]):
    method batch_tokenized_inputs (line 386) | def batch_tokenized_inputs(
    method get_image_positions (line 464) | def get_image_positions(
    method from_pb_processor (line 535) | def from_pb_processor(
    method prepare_for_prefill (line 558) | def prepare_for_prefill(
    method update_encoder_cache (line 628) | def update_encoder_cache(self, encoder_outputs, request_id, img_pos):
    method gather_vision_embeds (line 633) | def gather_vision_embeds(self):
    method free_encoder_cache (line 696) | def free_encoder_cache(self):
  class FlashVlmCausalLM (line 703) | class FlashVlmCausalLM(FlashCausalLM):
    method __init__ (line 704) | def __init__(
    method batch_type (line 736) | def batch_type(self) -> Type[FlashVlmCausalLMBatch]:
    method max_past (line 739) | def max_past(self) -> Optional[int]:
    method warmup_decode (line 742) | def warmup_decode(
    method warmup_hpu_graph (line 844) | def warmup_hpu_graph(self, batch: FlashVlmCausalLMBatch):
    method get_vision_embeds (line 908) | def get_vision_embeds(
    method get_inputs_embeds (line 923) | def get_inputs_embeds(
    method encode_images (line 933) | def encode_images(self, batch):
    method set_inputs_embeds (line 972) | def set_inputs_embeds(self, batch):
    method forward (line 986) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/globals.py
  function set_model_id (line 35) | def set_model_id(model_id: str):
  function set_adapter_to_index (line 45) | def set_adapter_to_index(adapter_to_index: Dict[str, int]):
  function get_adapter_to_index (line 50) | def get_adapter_to_index():

FILE: backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
  class FlashMllamaCausalLMBatch (line 45) | class FlashMllamaCausalLMBatch(FlashVlmCausalLMBatch):
    method prepare_for_prefill (line 51) | def prepare_for_prefill(
    method concatenate (line 60) | def concatenate(cls, batches, padded_total_bs: int = 0):
    method filter (line 83) | def filter(self, request_ids: List[int]):
    method batch_tokenized_inputs (line 115) | def batch_tokenized_inputs(
    method from_pb_processor (line 181) | def from_pb_processor(
  function generate_cross_attention_states (line 225) | def generate_cross_attention_states(
  class FlashMllamaCausalLM (line 240) | class FlashMllamaCausalLM(FlashVlmCausalLM):
    method set_inputs_embeds (line 241) | def set_inputs_embeds(self, batch):
    method warmup_decode (line 245) | def warmup_decode(
    method warmup_prefill (line 316) | def warmup_prefill(
    method warmup_hpu_graph (line 378) | def warmup_hpu_graph(self, batch: FlashMllamaCausalLMBatch):
    method forward (line 489) | def forward(

FILE: backends/gaudi/server/text_generation_server/models/model.py
  class Model (line 22) | class Model(ABC):
    method __init__ (line 23) | def __init__(
    method info (line 74) | def info(self) -> InfoResponse:
    method batch_type (line 89) | def batch_type(self) -> Type[B]:
    method generate_token (line 93) | def generate_token(
    method warmup (line 98) | def warmup(
    method decode_token (line 104) | def decode_token(
    method check_initialized (line 134) | def check_initialized(self):

FILE: backends/gaudi/server/text_generation_server/models/seq2seq_lm.py
  class Seq2SeqLMBatch (line 35) | class Seq2SeqLMBatch(Batch):
    method to_pb (line 75) | def to_pb(self) -> generate_pb2.CachedBatch:
    method from_pb (line 85) | def from_pb(
    method filter (line 179) | def filter(self, request_ids: List[int]) -> Optional["Seq2SeqLMBatch"]:
    method concatenate (line 294) | def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBat...
    method __len__ (line 536) | def __len__(self):
  class Seq2SeqLM (line 540) | class Seq2SeqLM(Model):
    method __init__ (line 541) | def __init__(
    method fallback (line 609) | def fallback(
    method batch_type (line 671) | def batch_type(self) -> Type[Seq2SeqLMBatch]:
    method forward (line 674) | def forward(
    method generate_token (line 712) | def generate_token(

FILE: backends/gaudi/server/text_generation_server/models/types.py
  class Batch (line 13) | class Batch(ABC):
    method to_pb (line 15) | def to_pb(self) -> generate_pb2.CachedBatch:
    method from_pb (line 20) | def from_pb(
    method filter (line 30) | def filter(self, request_ids: List[int]) -> "Batch":
    method concatenate (line 35) | def concatenate(cls, batches: List["Batch"]) -> "Batch":
    method __len__ (line 39) | def __len__(self):
  class GeneratedText (line 44) | class GeneratedText:
    method to_pb (line 50) | def to_pb(self) -> generate_pb2.GeneratedText:
  class Tokens (line 60) | class Tokens:
    method to_pb (line 66) | def to_pb(self) -> generate_pb2.Tokens:
    method __len__ (line 74) | def __len__(self):
  class Generation (line 79) | class Generation:
    method to_pb (line 87) | def to_pb(self) -> generate_pb2.Generation:

FILE: backends/gaudi/server/text_generation_server/server.py
  class SignalHandler (line 34) | class SignalHandler:
    method __init__ (line 37) | def __init__(self):
    method exit_gracefully (line 41) | def exit_gracefully(self, signum, frame):
  class TextGenerationService (line 46) | class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServi...
    method __init__ (line 47) | def __init__(
    method Info (line 65) | async def Info(self, request, context):
    method Health (line 68) | async def Health(self, request, context):
    method ServiceDiscovery (line 73) | async def ServiceDiscovery(self, request, context):
    method ClearCache (line 76) | async def ClearCache(self, request, context):
    method FilterBatch (line 83) | async def FilterBatch(self, request, context):
    method Warmup (line 92) | async def Warmup(self, request, context):
    method Prefill (line 144) | async def Prefill(self, request, context):
    method Decode (line 173) | async def Decode(self, request, context):
  function serve (line 201) | def serve(

FILE: backends/gaudi/server/text_generation_server/tracing.py
  class UDSOpenTelemetryAioServerInterceptor (line 16) | class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterce...
    method __init__ (line 17) | def __init__(self):
    method _start_span (line 20) | def _start_span(self, handler_call_details, context, set_status_on_exc...
  function setup_tracing (line 57) | def setup_tracing(otlp_service_name: str, otlp_endpoint: str):

FILE: backends/gaudi/server/text_generation_server/utils/adapter.py
  class AdapterInfo (line 28) | class AdapterInfo:
  class AdapterParameters (line 35) | class AdapterParameters:
  class AdapterSource (line 44) | class AdapterSource:
  function parse_lora_adapters (line 50) | def parse_lora_adapters(lora_adapters: Optional[str]) -> List[AdapterInfo]:
  function load_and_merge_adapters (line 71) | def load_and_merge_adapters(
  class AdapterParametersContainer (line 99) | class AdapterParametersContainer:
    method __hash__ (line 103) | def __hash__(self) -> int:
  function _load_and_merge (line 108) | def _load_and_merge(
  function check_architectures (line 146) | def check_architectures(
  function load_module_map (line 185) | def load_module_map(
  function get_attn_weights (line 233) | def get_attn_weights(i, layer):
  function get_mlp_weights (line 256) | def get_mlp_weights(i, layer):
  function build_layer_weight_lookup (line 294) | def build_layer_weight_lookup(model):

FILE: backends/gaudi/server/text_generation_server/utils/chunks.py
  function concat_text_chunks (line 8) | def concat_text_chunks(chunks: Iterable[generate_pb2.InputChunk]) -> str:

FILE: backends/gaudi/server/text_generation_server/utils/convert.py
  function _remove_duplicate_names (line 12) | def _remove_duplicate_names(
  function convert_file (line 62) | def convert_file(pt_file: Path, sf_file: Path, discard_names: List[str]):
  function convert_files (line 96) | def convert_files(pt_files: List[Path], sf_files: List[Path], discard_na...

FILE: backends/gaudi/server/text_generation_server/utils/debug.py
  function to_gb_rounded (line 17) | def to_gb_rounded(mem: float) -> float:
  function count_hpu_graphs (line 30) | def count_hpu_graphs():
  function dbg_trace (line 34) | def dbg_trace(tag, txt):

FILE: backends/gaudi/server/text_generation_server/utils/dist.py
  class FakeBarrier (line 13) | class FakeBarrier:
    method wait (line 14) | def wait(self):
  class FakeGroup (line 18) | class FakeGroup(ProcessGroup):
    method __init__ (line 19) | def __init__(self, rank, size):
    method allreduce (line 24) | def allreduce(self, *args, **kwargs):
    method allgather (line 27) | def allgather(self, inputs, local_tensor, **kwargs):
    method barrier (line 35) | def barrier(self, *args, **kwargs):
    method size (line 38) | def size(self):
    method rank (line 41) | def rank(self):
    method _get_backend_name (line 44) | def _get_backend_name(self):
  function initialize_torch_distributed (line 48) | def initialize_torch_distributed():

FILE: backends/gaudi/server/text_generation_server/utils/hub.py
  function _cached_weight_files (line 21) | def _cached_weight_files(
  function _weight_hub_files_from_model_info (line 32) | def _weight_hub_files_from_model_info(
  function _weight_files_from_dir (line 46) | def _weight_files_from_dir(d: Path, extension: str) -> List[str]:
  function _get_cached_revision_directory (line 62) | def _get_cached_revision_directory(
  function weight_hub_files (line 97) | def weight_hub_files(
  function try_to_load_from_cache (line 119) | def try_to_load_from_cache(
  function weight_files (line 133) | def weight_files(
  function download_weights (line 188) | def download_weights(

FILE: backends/gaudi/server/text_generation_server/utils/import_utils.py
  function get_hpu_free_memory (line 4) | def get_hpu_free_memory(device, memory_fraction):
  function synchronize_hpu (line 9) | def synchronize_hpu(device):
  function noop (line 13) | def noop(*args, **kwargs):

FILE: backends/gaudi/server/text_generation_server/utils/kernels.py
  function load_kernel (line 9) | def load_kernel(*, module: str, repo_id: str):

FILE: backends/gaudi/server/text_generation_server/utils/log.py
  function log_once (line 6) | def log_once(log, msg: str, master=True):
  function log_master (line 13) | def log_master(log, msg: str):

FILE: backends/gaudi/server/text_generation_server/utils/logits_process.py
  class StaticWarper (line 26) | class StaticWarper:
    method __init__ (line 27) | def __init__(
    method __call__ (line 51) | def __call__(self, scores):
  function static_warper (line 76) | def static_warper(
  class HeterogeneousRepetitionPenaltyLogitsProcessor (line 87) | class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor):
    method __init__ (line 99) | def __init__(self, penalty: List[float], dtype: torch.dtype, device: t...
    method __call__ (line 105) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 116) | def filter(self, indices):
  class FrequencyPenaltyLogitsProcessor (line 124) | class FrequencyPenaltyLogitsProcessor(LogitsProcessor):
    method __init__ (line 133) | def __init__(self, penalty: float):
    method __call__ (line 136) | def __call__(
  class HeterogeneousFrequencyPenaltyLogitsProcessor (line 148) | class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor):
    method __init__ (line 158) | def __init__(self, penalty: List[float], dtype: torch.dtype, device: t...
    method __call__ (line 164) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 183) | def filter(self, indices):
  class HeterogeneousTemperatureLogitsWarper (line 191) | class HeterogeneousTemperatureLogitsWarper:
    method __init__ (line 202) | def __init__(
    method __call__ (line 210) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 214) | def filter(self, indices):
  class HeterogeneousTopPLogitsWarper (line 222) | class HeterogeneousTopPLogitsWarper(LogitsProcessor):
    method __init__ (line 238) | def __init__(
    method __call__ (line 253) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 273) | def filter(self, indices):
  class HeterogeneousTopKLogitsWarper (line 281) | class HeterogeneousTopKLogitsWarper(LogitsProcessor):
    method __init__ (line 296) | def __init__(
    method __call__ (line 324) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 345) | def filter(self, indices):
  class HeterogeneousTypicalLogitsWarper (line 362) | class HeterogeneousTypicalLogitsWarper(LogitsProcessor):
    method __init__ (line 378) | def __init__(
    method __call__ (line 400) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 436) | def filter(self, indices):
  class HeterogeneousProcessorWrapper (line 452) | class HeterogeneousProcessorWrapper(LogitsProcessor):
    method __init__ (line 460) | def __init__(
    method __call__ (line 466) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 471) | def filter(self, indices):
  class GrammarLogitProcessor (line 483) | class GrammarLogitProcessor(LogitsProcessor):
    method __init__ (line 487) | def __init__(self, tokenizer, device, grammar, grammar_type):
    method __call__ (line 494) | def __call__(
    method advance (line 507) | def advance(self, next_token_id, fsm_grammar_state):
    method _advance (line 513) | def _advance(next_token_id, fsm_grammar_state, fsm):
    method _cached_compile_fsm (line 521) | def _cached_compile_fsm(grammar_type, schema, tokenizer):
    method _cached_adapt_tokenizer (line 533) | def _cached_adapt_tokenizer(tokenizer):
  class HeterogeneousGrammarLogitProcessor (line 561) | class HeterogeneousGrammarLogitProcessor(LogitsProcessor):
    method __init__ (line 562) | def __init__(self, tokenizer, device, grammars, grammar_types):
    method __call__ (line 575) | def __call__(
    method advance_batch (line 590) | def advance_batch(self, next_token_ids, fsm_grammar_states):
    method advance_at_index (line 598) | def advance_at_index(self, next_token_id, fsm_grammar_state, index):
    method filter (line 605) | def filter(self, indices):

FILE: backends/gaudi/server/text_generation_server/utils/merges/strategies.py
  class AdapterParameters (line 17) | class AdapterParameters:
    method __init__ (line 18) | def __init__(
  function _apply_weights (line 28) | def _apply_weights(
  class MergeStrategy (line 44) | class MergeStrategy(ABC):
    method merge (line 45) | def merge(
  class LinearMerge (line 51) | class LinearMerge(MergeStrategy):
    method __init__ (line 52) | def __init__(self, **kwargs):
    method merge (line 55) | def merge(
  class TiesMerge (line 62) | class TiesMerge(MergeStrategy):
    method __init__ (line 63) | def __init__(self, density: float, majority_sign_method: str = "total"...
    method merge (line 67) | def merge(
  class DareLinearMerge (line 86) | class DareLinearMerge(MergeStrategy):
    method __init__ (line 87) | def __init__(self, density: float, **kwargs):
    method merge (line 90) | def merge(
  class DareTiesMerge (line 102) | class DareTiesMerge(MergeStrategy):
    method __init__ (line 103) | def __init__(self, density: float, majority_sign_method: str = "total"...
    method merge (line 107) | def merge(
  function merge_adapters (line 136) | def merge_adapters(
  function _validate_lora_configs (line 193) | def _validate_lora_configs(lora_configs: List["LoraConfig"]):
  function _merge_lora_configs (line 207) | def _merge_lora_configs(lora_configs: List["LoraConfig"]) -> "LoraConfig":

FILE: backends/gaudi/server/text_generation_server/utils/merges/utils.py
  function magnitude_based_pruning (line 23) | def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> tor...
  function random_pruning (line 39) | def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) ...
  function prune (line 56) | def prune(
  function calculate_majority_sign_mask (line 83) | def calculate_majority_sign_mask(
  function disjoint_merge (line 105) | def disjoint_merge(task_tensors, majority_sign_mask):

FILE: backends/gaudi/server/text_generation_server/utils/peft.py
  function download_and_unload_peft (line 10) | def download_and_unload_peft(model_id, revision, trust_remote_code):
  function download_peft (line 48) | def download_peft(

FILE: backends/gaudi/server/text_generation_server/utils/prefill_chunking.py
  function set_support_chunking (line 7) | def set_support_chunking(support_chunking: bool):
  function get_support_chunking (line 12) | def get_support_chunking() -> bool:
  function set_max_prefill_tokens (line 17) | def set_max_prefill_tokens(max_prefill_tokens: int):
  function get_max_prefill_tokens (line 22) | def get_max_prefill_tokens() -> int:

FILE: backends/gaudi/server/text_generation_server/utils/quantization.py
  class _QuantizerConfig (line 14) | class _QuantizerConfig:
  class _FP8QuantizerConfig (line 26) | class _FP8QuantizerConfig:
  function _get_config_json (line 30) | def _get_config_json(model_id: str, revision: Optional[str], filename: s...
  function _get_quantizer_config (line 45) | def _get_quantizer_config(model_id, revision):
  function get_loader (line 122) | def get_loader(

FILE: backends/gaudi/server/text_generation_server/utils/segments.py
  function find_segments (line 10) | def find_segments(
  class SegmentConcatBuilder (line 35) | class SegmentConcatBuilder:
    method __init__ (line 36) | def __init__(self):
    method concat (line 40) | def concat(self, adapter_segments: torch.Tensor, segment_indices: List...
    method build (line 65) | def build(self) -> Tuple[torch.Tensor, List[int]]:

FILE: backends/gaudi/server/text_generation_server/utils/sgmv.py
  function has_sgmv (line 30) | def has_sgmv() -> bool:
  function pad_rank (line 34) | def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
  function use_cutlass_shrink (line 64) | def use_cutlass_shrink(lora_rank: int) -> bool:
  function orient_for_rank (line 68) | def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
  function add_lora_sgmv_cutlass (line 75) | def add_lora_sgmv_cutlass(
  function _add_lora_sgmv_cutlass_legacy (line 115) | def _add_lora_sgmv_cutlass_legacy(
  function get_tmp_tensor (line 133) | def get_tmp_tensor(device: torch.device) -> torch.Tensor:
  function get_tmp_tensor_for_size (line 138) | def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Te...
  function get_tmp_tensor_for_size_no_kernels (line 143) | def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) ...
  function get_tmp_expand_size (line 147) | def get_tmp_expand_size(size: int) -> int:
  function get_tmp_tensors (line 151) | def get_tmp_tensors(
  function lora_a_sgmv_cutlass (line 167) | def lora_a_sgmv_cutlass(
  function lora_b_sgmv_cutlass (line 184) | def lora_b_sgmv_cutlass(
  function add_lora_a_bgmv (line 217) | def add_lora_a_bgmv(
  function add_lora_b_bgmv (line 227) | def add_lora_b_bgmv(
  function segmented_matmul (line 237) | def segmented_matmul(

FILE: backends/gaudi/server/text_generation_server/utils/speculate.py
  function get_speculate (line 4) | def get_speculate() -> int:
  function set_speculate (line 9) | def set_speculate(speculate: int):

FILE: backends/gaudi/server/text_generation_server/utils/tokens.py
  class NextTokenChooser (line 27) | class NextTokenChooser:
    method __init__ (line 28) | def __init__(
    method __call__ (line 84) | def __call__(self, input_ids, scores):
    method advance_grammar (line 103) | def advance_grammar(self, next_id: int):
    method from_pb (line 111) | def from_pb(
  class StopSequenceCriteria (line 134) | class StopSequenceCriteria:
    method __init__ (line 135) | def __init__(self, stop_sequence: str):
    method __call__ (line 139) | def __call__(self, output: str) -> bool:
  class StoppingCriteria (line 145) | class StoppingCriteria:
    method __init__ (line 146) | def __init__(
    method __call__ (line 174) | def __call__(self, last_token: int, last_output: str) -> Tuple[bool, O...
    method from_pb (line 198) | def from_pb(
  function create_n_gram_speculation (line 216) | def create_n_gram_speculation(
  class HeterogeneousNextTokenChooser (line 240) | class HeterogeneousNextTokenChooser:
    method __init__ (line 241) | def __init__(
    method __call__ (line 335) | def __call__(
    method advance_grammar (line 424) | def advance_grammar(self, next_ids: List[int]):
    method advance_grammar_single (line 432) | def advance_grammar_single(self, grammar_state_index: int, next_id: int):
    method advance_grammar_single_with_past_state (line 443) | def advance_grammar_single_with_past_state(
    method filter (line 457) | def filter(self, indices):
    method from_pb (line 500) | def from_pb(
  function pad_next_token_chooser_parameters (line 531) | def pad_next_token_chooser_parameters(
  class Sampling (line 553) | class Sampling:
    method __init__ (line 554) | def __init__(self, seed: int, device: str = "cpu"):
    method __call__ (line 564) | def __call__(self, logits):
  class Greedy (line 572) | class Greedy:
    method __call__ (line 573) | def __call__(self, logits):
  class HeterogeneousSampling (line 577) | class HeterogeneousSampling:
    method __init__ (line 582) | def __init__(self, do_sample: List[bool], seeds: List[int], device: to...
    method __call__ (line 595) | def __call__(self, logits):
    method filter (line 605) | def filter(self, indices):
  function batch_top_tokens (line 619) | def batch_top_tokens(
  function make_tokenizer_optional (line 700) | def make_tokenizer_optional(tokenizer):
  function is_tokenizer_transparent (line 766) | def is_tokenizer_transparent(tokenizer):

FILE: backends/gaudi/server/text_generation_server/utils/version.py
  function get_driver_version (line 6) | def get_driver_version():
  function is_driver_compatible (line 32) | def is_driver_compatible():

FILE: backends/gaudi/server/text_generation_server/utils/watermark.py
  class WatermarkLogitsProcessor (line 26) | class WatermarkLogitsProcessor(LogitsProcessor):
    method __init__ (line 27) | def __init__(
    method _seed_rng (line 40) | def _seed_rng(self, input_ids: Union[List[int], torch.LongTensor]):
    method _get_greenlist_ids (line 55) | def _get_greenlist_ids(
    method _calc_greenlist_mask (line 70) | def _calc_greenlist_mask(
    method _bias_greenlist_logits (line 79) | def _bias_greenlist_logits(
    method __call__ (line 85) | def __call__(

FILE: backends/gaudi/server/text_generation_server/utils/weights.py
  class WeightsLoader (line 11) | class WeightsLoader(ABC):
    method get_weights (line 23) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 30) | def get_weights_col_packed(
    method get_weights_col (line 50) | def get_weights_col(self, weights: "Weights", prefix: str):
    method get_multi_weights_col (line 58) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_multi_weights (line 66) | def get_multi_weights(self, weights: "Weights", prefixes: List[str], d...
    method get_weights_row (line 74) | def get_weights_row(self, weights: "Weights", prefix: str):
  class Weight (line 82) | class Weight(ABC):
    method get_linear (line 87) | def get_linear(self, bias: torch.Tensor):
  class UnquantizedWeight (line 93) | class UnquantizedWeight(Weight):
    method get_linear (line 96) | def get_linear(self, bias: torch.Tensor):
  class DefaultWeightsLoader (line 102) | class DefaultWeightsLoader(WeightsLoader):
    method __init__ (line 105) | def __init__(self, weight_class: Type[UnquantizedWeight]):
    method get_weights (line 117) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 120) | def get_weights_col_packed(
    method get_multi_weights_col (line 132) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_weights_row (line 136) | def get_weights_row(self, weights: "Weights", prefix: str):
    method get_multi_weights (line 141) | def get_multi_weights(self, weights: "Weights", prefixes: List[str], d...
  class Weights (line 146) | class Weights:
    method __init__ (line 147) | def __init__(
    method _get_handle (line 177) | def _get_handle(self, filename):
    method get_filename (line 184) | def get_filename(self, tensor_name: str) -> (str, str):
    method _get_slice (line 201) | def _get_slice(self, tensor_name: str):
    method has_tensor (line 207) | def has_tensor(self, tensor_name: str):
    method get_shape (line 214) | def get_shape(self, tensor_name: str):
    method get_tensor (line 217) | def get_tensor(
    method get_partial_sharded (line 242) | def get_partial_sharded(
    method get_sharded (line 275) | def get_sharded(self, tensor_name: str, dim: int, to_device=True, to_d...
    method get_packed_sharded (line 288) | def get_packed_sharded(
    method get_weights (line 357) | def get_weights(self, prefix: str):
    method get_weights_col_packed_qkv (line 360) | def get_weights_col_packed_qkv(
    method get_weights_col_packed_gate_up (line 370) | def get_weights_col_packed_gate_up(self, prefix: str):
    method get_weights_col_packed (line 373) | def get_weights_col_packed(self, prefix: str, block_sizes: Union[int, ...
    method get_weights_col (line 383) | def get_weights_col(self, prefix: str):
    method get_multi_weights_col (line 386) | def get_multi_weights_col(self, prefixes: List[str], dim: int):
    method get_tensor_shard (line 389) | def get_tensor_shard(self, var, dim):
    method get_weights_row (line 405) | def get_weights_row(self, prefix: str):
    method get_multi_weights (line 408) | def get_multi_weights(self, prefixes: List[str], dim: int):
    method use_loader (line 412) | def use_loader(self, weights_loader: WeightsLoader):
    method loader (line 426) | def loader(self):
  function _blocks_to_block_sizes (line 430) | def _blocks_to_block_sizes(total_size: int, blocks: Union[int, List[int]...

FILE: backends/grpc-metadata/src/lib.rs
  type MetadataInjector (line 9) | struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap);
  method set (line 13) | fn set(&mut self, key: &str, value: String) {
  function inject (line 23) | fn inject(metadata: &mut tonic::metadata::MetadataMap) {
  type InjectTelemetryContext (line 32) | pub trait InjectTelemetryContext {
    method inject_context (line 33) | fn inject_context(self) -> Self;
    method inject_context (line 37) | fn inject_context(mut self) -> Self {

FILE: backends/llamacpp/build.rs
  type PrefixStripper (line 6) | struct PrefixStripper;
  method generated_name_override (line 9) | fn generated_name_override(&self, item_info: ItemInfo<'_>) -> Option<Str...
  function main (line 14) | fn main() {

FILE: backends/llamacpp/src/backend.rs
  type LlamacppSplitMode (line 22) | pub enum LlamacppSplitMode {
  type Err (line 29) | type Err = String;
  method from_str (line 30) | fn from_str(s: &str) -> Result<Self, Self::Err> {
  type LlamacppNuma (line 43) | pub enum LlamacppNuma {
  type LlamacppGGMLType (line 53) | pub enum LlamacppGGMLType {
    method to_ggml_type (line 89) | fn to_ggml_type(self) -> llamacpp::ggml_type {
  type LlamacppConfig (line 126) | pub struct LlamacppConfig {
  type LlamacppRequest (line 147) | struct LlamacppRequest {
    method new (line 170) | fn new(
  type LlamacppBackend (line 164) | pub struct LlamacppBackend {
    method new (line 427) | pub fn new(
  type Llamacpp (line 193) | struct Llamacpp {
    method new (line 219) | fn new(conf: LlamacppConfig) -> Result<Self, BackendError> {
    method decode (line 284) | fn decode(&mut self) -> i32 {
    method clear_kv_cache (line 288) | fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) {
    method batch_push (line 294) | fn batch_push(
  function llamacpp_log_callback (line 201) | extern "C" fn llamacpp_log_callback(
  method drop (line 315) | fn drop(&mut self) {
  type LlamacppSampler (line 326) | struct LlamacppSampler {
    method new (line 331) | fn new(req: &LlamacppRequest) -> Option<Self> {
    method sample (line 381) | fn sample(&self, llamacpp: &mut Llamacpp, idx: usize) -> (llamacpp::ll...
  method drop (line 406) | fn drop(&mut self) {
  type LlamacppSeq (line 413) | struct LlamacppSeq {
  method schedule (line 644) | fn schedule(
  method health (line 659) | async fn health(&self, _: bool) -> bool {
  method name (line 663) | fn name(&self) -> &'static str {
  type BackendError (line 669) | pub enum BackendError {

FILE: backends/llamacpp/src/main.rs
  type Args (line 25) | struct Args {
  function main (line 167) | async fn main() -> Result<(), RouterError> {
  type RouterError (line 335) | enum RouterError {

FILE: backends/llamacpp/src/quantize.rs
  type QuantizeType (line 7) | pub enum QuantizeType {
  function model (line 11) | pub fn model(

FILE: backends/neuron/server/text_generation_server/cli.py
  function serve (line 12) | def serve(
  function download_weights (line 75) | def download_weights(

FILE: backends/neuron/server/text_generation_server/generator.py
  class Generator (line 35) | class Generator(ABC):
    method info (line 43) | def info(self) -> InfoResponse:
    method warmup (line 47) | def warmup(self, batch: Batch) -> int:
    method prefill (line 59) | def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
    method decode (line 74) | def decode(self, batches: List[Batch]) -> Tuple[List[Generation], Cach...
    method filter (line 78) | def filter(self, batch_id: int, request_ids: List[int]) -> CachedBatch:
    method clear (line 82) | def clear(self):
    method from_pretrained (line 87) | def from_pretrained(cls, model_id: str, revision: Optional[str]):
  class Slot (line 92) | class Slot:
    class State (line 95) | class State(Enum):
    method __init__ (line 100) | def __init__(self, id: int, tokenizer: PreTrainedTokenizerBase):
    method clear (line 105) | def clear(self):
    method id (line 123) | def id(self) -> int:
    method state (line 127) | def state(self) -> "Slot.State":
    method batch_id (line 131) | def batch_id(self) -> int:
    method request_id (line 135) | def request_id(self) -> int:
    method cached_text (line 139) | def cached_text(self) -> str:
    method generation_config (line 143) | def generation_config(self) -> GenerationConfig:
    method generated_tokens (line 147) | def generated_tokens(self) -> int:
    method assign (line 150) | def assign(
    method reset (line 198) | def reset(
    method pause (line 221) | def pause(self):
    method resume (line 228) | def resume(self):
    method _decode_next_tokens (line 232) | def _decode_next_tokens(
    method append (line 259) | def append(self, next_token: int) -> str:
    method select (line 284) | def select(
    method stopped (line 301) | def stopped(self) -> bool:
    method generated_text (line 307) | def generated_text(self) -> str:
    method next_token (line 311) | def next_token(self) -> int:
    method attention_mask (line 315) | def attention_mask(self) -> torch.LongTensor:
    method max_token (line 319) | def max_token(self) -> int:
    method max_new_tokens (line 323) | def max_new_tokens(self) -> int:
    method truncate (line 329) | def truncate(self) -> int:
  class NeuronGenerator (line 333) | class NeuronGenerator(Generator):
    method __init__ (line 336) | def __init__(
    method on_device_sampling (line 363) | def on_device_sampling(self) -> bool:
    method info (line 367) | def info(self) -> InfoResponse:
    method warmup (line 376) | def warmup(self, batch: Batch) -> int:
    method max_prefill_length (line 399) | def max_prefill_length(self) -> int:
    method prefill (line 404) | def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
    method decode (line 517) | def decode(
    method _generate_token (line 585) | def _generate_token(
    method _cached_batch (line 652) | def _cached_batch(self, batch_id: int, request_ids: List):
    method filter (line 659) | def filter(self, batch_id: int, keep_request_ids: List[int]) -> Cached...
    method clear (line 677) | def clear(self, batch_id: Optional[int] = None):
    method _clear (line 684) | def _clear(self, keep_slot_ids: List):
    method from_pretrained (line 691) | def from_pretrained(cls, model_id: str, revision: str = None):

FILE: backends/neuron/server/text_generation_server/interceptor.py
  class ExceptionInterceptor (line 10) | class ExceptionInterceptor(AsyncServerInterceptor):
    method intercept (line 11) | async def intercept(

FILE: backends/neuron/server/text_generation_server/model.py
  function get_export_kwargs_from_env (line 17) | def get_export_kwargs_from_env():
  function is_cached (line 36) | def is_cached(model_id):
  function log_cache_size (line 50) | def log_cache_size():
  function fetch_model (line 62) | def fetch_model(

FILE: backends/neuron/server/text_generation_server/server.py
  class TextGenerationService (line 14) | class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServi...
    method __init__ (line 15) | def __init__(self, generator: Generator, server_urls: List[str]):
    method Info (line 19) | async def Info(self, request, context):
    method Health (line 22) | async def Health(self, request, context):
    method ServiceDiscovery (line 25) | async def ServiceDiscovery(self, request, context):
    method ClearCache (line 28) | async def ClearCache(self, request, context):
    method FilterBatch (line 35) | async def FilterBatch(self, request, context):
    method Warmup (line 39) | async def Warmup(self, request, context):
    method Prefill (line 43) | async def Prefill(self, request, context):
    method Decode (line 47) | async def Decode(self, request, context):
  function serve (line 52) | def serve(

FILE: backends/neuron/server/text_generation_server/tgi_env.py
  function parse_cmdline_and_set_env (line 34) | def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namesp...
  function neuron_config_to_env (line 88) | def neuron_config_to_env(neuron_config):
  function sort_neuron_configs (line 114) | def sort_neuron_configs(dictionary):
  function lookup_compatible_cached_model (line 118) | def lookup_compatible_cached_model(
  function check_env_and_neuron_config_compatibility (line 158) | def check_env_and_neuron_config_compatibility(
  function get_env_dict (line 245) | def get_env_dict() -> Dict[str, str]:
  function get_neuron_config_for_model (line 252) | def get_neuron_config_for_model(

FILE: backends/neuron/tests/fixtures/model.py
  function export_model (line 58) | def export_model(model_id, export_kwargs, neuron_model_path):
  function neuron_model_config (line 80) | def neuron_model_config(request):
  function neuron_model_path (line 117) | def neuron_model_path(neuron_model_config):

FILE: backends/neuron/tests/prune_test_models.py
  function main (line 5) | def main():

FILE: backends/neuron/tests/server/helpers.py
  function create_request (line 10) | def create_request(
  function check_prefill (line 40) | def check_prefill(
  function check_decode_single (line 80) | def check_decode_single(
  function check_decode_multiple (line 106) | def check_decode_multiple(model_path):

FILE: backends/neuron/tests/server/test_cached_model.py
  function cached_model_id (line 9) | def cached_model_id(neuron_model_config) -> str:
  function test_model_is_cached (line 26) | def test_model_is_cached(cached_model_id):
  function test_fetch_cached_model (line 30) | def test_fetch_cached_model(cached_model_id: str):
  function test_generator_from_cached_model (line 38) | def test_generator_from_cached_model(cached_model_id: str):

FILE: backends/neuron/tests/server/test_continuous_batching.py
  function test_continuous_batching_two_requests (line 6) | def test_continuous_batching_two_requests(neuron_model_config):

FILE: backends/neuron/tests/server/test_decode.py
  function test_decode (line 6) | def test_decode(neuron_model_config):
  function _test_decode (line 25) | def _test_decode(config_name, generator, do_sample):

FILE: backends/neuron/tests/server/test_generator_slot.py
  function tokenizer (line 12) | def tokenizer(request):
  function test_decode_streaming (line 33) | def test_decode_streaming(tokenizer, input_text, generated_text):

FILE: backends/neuron/tests/server/test_info.py
  function test_info (line 4) | def test_info(neuron_model_path):

FILE: backends/neuron/tests/server/test_prefill.py
  function test_prefill (line 6) | def test_prefill(neuron_model_config):
  function _test_prefill (line 21) | def _test_prefill(config_name, generator, batch_size, do_sample):
  function test_prefill_truncate (line 60) | def test_prefill_truncate(neuron_model_config):

FILE: backends/neuron/tests/test_entry_point.py
  function test_get_neuron_config_for_model (line 15) | def test_get_neuron_config_for_model(neuron_model_config):
  function test_lookup_compatible_cached_model (line 38) | def test_lookup_compatible_cached_model(model_id: str):
  function test_neuron_config_to_env (line 43) | def test_neuron_config_to_env(neuron_model_config) -> None:

FILE: backends/neuron/tgi_entry_point.py
  function main (line 22) | def main():

FILE: backends/trtllm/build.rs
  constant ADDITIONAL_BACKEND_LINK_LIBRARIES (line 8) | const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
  constant CUDA_ARCH_LIST (line 9) | const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
  constant CUDA_REQUIRED_VERSION (line 10) | const CUDA_REQUIRED_VERSION: &str = "12.8";
  constant MPI_REQUIRED_VERSION (line 11) | const MPI_REQUIRED_VERSION: &str = "4.1";
  constant INSTALL_PREFIX (line 12) | const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
  constant TENSORRT_ROOT_DIR (line 13) | const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
  constant NCCL_ROOT_DIR (line 14) | const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR");
  constant IS_GHA_BUILD (line 16) | const IS_GHA_BUILD: LazyLock<bool> = LazyLock::new(|| {
  constant BACKEND_DEPS (line 26) | const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl";
  constant CUDA_TRANSITIVE_DEPS (line 27) | const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nv...
  constant TENSORRT_LLM_TRANSITIVE_DEPS (line 28) | const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
  function get_compiler_flag (line 45) | fn get_compiler_flag(
  function get_library_architecture (line 56) | fn get_library_architecture() -> &'static str {
  function build_backend (line 87) | fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> ...
  function build_ffi_layer (line 178) | fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
  function main (line 206) | fn main() {

FILE: backends/trtllm/csrc/backend.cpp
  type huggingface::tgi::backends::trtllm (line 8) | namespace huggingface::tgi::backends::trtllm {

FILE: backends/trtllm/csrc/backend.hpp
  type huggingface::tgi::backends::trtllm (line 17) | namespace huggingface::tgi::backends::trtllm {
    type generation_params_t (line 26) | struct generation_params_t {
    type sampling_params_t (line 33) | struct sampling_params_t {
    type generation_config_t (line 65) | struct generation_config_t {
      method generation_config_t (line 70) | constexpr explicit generation_config_t(const json &config) :
    class backend_workspace_t (line 87) | class backend_workspace_t {
      method backend_workspace_t (line 100) | backend_workspace_t(std::filesystem::path &engines_folder, std::file...
      method backend_workspace_t (line 106) | backend_workspace_t(std::filesystem::path &&engines_folder, std::fil...
      method engines_folder (line 116) | [[nodiscard]] constexpr std::filesystem::path engines_folder() const...
      method generation_config_t (line 123) | [[nodiscard]] constexpr const generation_config_t &generation_config...
    type backend_error_t (line 143) | enum backend_error_t {
    class backend_t (line 155) | class backend_t {
      method backend_t (line 163) | backend_t(std::filesystem::path &&engines_folder, std::filesystem::p...
  type fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t> (line 212) | struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_par...
    method format (line 213) | auto format(huggingface::tgi::backends::trtllm::generation_params_t co...
  type fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> (line 220) | struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_param...
    method format (line 221) | auto format(huggingface::tgi::backends::trtllm::sampling_params_t cons...

FILE: backends/trtllm/csrc/ffi.hpp
  type rust::behavior (line 16) | namespace rust::behavior {
    function trycatch (line 18) | static void trycatch(Try &&func, Fail &&fail) noexcept try {
  type huggingface::tgi::backends::trtllm (line 25) | namespace huggingface::tgi::backends::trtllm {
    class tensorrt_llm_backend_t (line 26) | class tensorrt_llm_backend_t
      method tensorrt_llm_backend_t (line 83) | tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::f...
      method num_tokens_ready (line 86) | size_t num_tokens_ready() const noexcept { return inner_.num_tokens_...
      method request_id_t (line 88) | request_id_t submit(
      method pull_tokens (line 118) | std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexce...
      method cancel (line 139) | void cancel(request_id_t request_id) noexcept {
    function finish_reason_t (line 35) | constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason r...
    class tensorrt_llm_backend_t (line 78) | class tensorrt_llm_backend_t {
      method tensorrt_llm_backend_t (line 83) | tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::f...
      method num_tokens_ready (line 86) | size_t num_tokens_ready() const noexcept { return inner_.num_tokens_...
      method request_id_t (line 88) | request_id_t submit(
      method pull_tokens (line 118) | std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexce...
      method cancel (line 139) | void cancel(request_id_t request_id) noexcept {
    function initialize_logging (line 145) | void initialize_logging() {
    function initialize_tensorrt_llm_backend (line 163) | void initialize_tensorrt_llm_backend() {
    function create_backend_from_engine_folder (line 180) | std::unique_ptr<tensorrt_llm_backend_t>
  type huggingface::tgi::backends::trtllm (line 32) | namespace huggingface::tgi::backends::trtllm {
    class tensorrt_llm_backend_t (line 26) | class tensorrt_llm_backend_t
      method tensorrt_llm_backend_t (line 83) | tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::f...
      method num_tokens_ready (line 86) | size_t num_tokens_ready() const noexcept { return inner_.num_tokens_...
      method request_id_t (line 88) | request_id_t submit(
      method pull_tokens (line 118) | std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexce...
      method cancel (line 139) | void cancel(request_id_t request_id) noexcept {
    function finish_reason_t (line 35) | constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason r...
    class tensorrt_llm_backend_t (line 78) | class tensorrt_llm_backend_t {
      method tensorrt_llm_backend_t (line 83) | tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::f...
      method num_tokens_ready (line 86) | size_t num_tokens_ready() const noexcept { return inner_.num_tokens_...
      method request_id_t (line 88) | request_id_t submit(
      method pull_tokens (line 118) | std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexce...
      method cancel (line 139) | void cancel(request_id_t request_id) noexcept {
    function initialize_logging (line 145) | void initialize_logging() {
    function initialize_tensorrt_llm_backend (line 163) | void initialize_tensorrt_llm_backend() {
    function create_backend_from_engine_folder (line 180) | std::unique_ptr<tensorrt_llm_backend_t>

FILE: backends/trtllm/csrc/hardware.hpp
  type huggingface::tgi::hardware::cuda (line 8) | namespace huggingface::tgi::hardware::cuda {
    function get_device_count (line 19) | inline std::optional<size_t> get_device_count() {
    type compute_capabilities_t (line 30) | struct compute_capabilities_t {
      method compute_capabilities_t (line 34) | compute_capabilities_t(): compute_capabilities_t(0) {}
      method compute_capabilities_t (line 35) | explicit compute_capabilities_t(size_t device_idx): major(-1), minor...
      method compute_capabilities_t (line 41) | compute_capabilities_t(int32_t major, int32_t minor): major(major), ...
      method is_at_least (line 48) | [[nodiscard]] constexpr auto is_at_least(std::tuple<uint32_t, uint32...
      method is_at_least_volta (line 54) | [[nodiscard]] constexpr bool is_at_least_volta() const { return is_a...
      method is_at_least_turing (line 60) | [[nodiscard]] constexpr bool is_at_least_turing() const { return is_...
      method is_at_least_ampere (line 66) | [[nodiscard]] constexpr bool is_at_least_ampere() const { return is_...
      method is_at_least_ada_lovelace (line 72) | [[nodiscard]] constexpr bool is_at_least_ada_lovelace() const { retu...
      method is_at_least_hopper (line 78) | [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_...

FILE: backends/trtllm/scripts/setup_sccache.py
  function setup_sccache_locally (line 14) | def setup_sccache_locally():
  function setup_sccache_for_s3 (line 25) | def setup_sccache_for_s3():

FILE: backends/trtllm/src/errors.rs
  type TensorRtLlmBackendError (line 7) | pub enum TensorRtLlmBackendError {

FILE: backends/trtllm/src/lib.rs
  type FinishReason (line 11) | pub enum FinishReason {
  type GenerationStep (line 33) | pub struct GenerationStep {
  function create_backend_from_engine_folder (line 64) | fn create_backend_from_engine_folder(
  function num_tokens_ready (line 69) | fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
  function submit (line 71) | fn submit(
  function pull_tokens (line 83) | fn pull_tokens(
  function cancel (line 87) | fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64);
  method from (line 95) | fn from(reason: FinishReason) -> Self {

FILE: backends/trtllm/src/looper.rs
  type InferResult (line 29) | type InferResult<T> = Result<T, InferError>;
  type GenerationContext (line 32) | struct GenerationContext {
  type DecodedToken (line 41) | struct DecodedToken {
    type Error (line 49) | type Error = InferError;
    method try_from (line 51) | fn try_from(step: &'step GenerationStep) -> Result<Self, Self::Error> {
  function executor_status_looper (line 65) | fn executor_status_looper(
  function post_process_decoded_token (line 170) | fn post_process_decoded_token(
  function ensure_paths_exist (line 218) | fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
  type TensorRtLlmBackendV2 (line 259) | pub struct TensorRtLlmBackendV2(UnboundedSender<GenerationContext>);
    method new (line 262) | pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
    method validate (line 286) | fn validate(request: &ValidGenerateRequest) -> InferResult<()> {
  method schedule (line 315) | fn schedule(
  method health (line 340) | async fn health(&self, _: bool) -> bool {
  method name (line 344) | fn name(&self) -> &'static str {

FILE: backends/trtllm/src/main.rs
  type Args (line 19) | struct Args {
  function get_tokenizer (line 74) | async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> ...
  function main (line 219) | async fn main() -> Result<(), TensorRtLlmBackendError> {

FILE: backends/trtllm/src/utils.rs
  function first_line (line 20) | pub(crate) fn first_line(s: &str, fail: &str) -> String {

FILE: backends/v2/build.rs
  function main (line 3) | fn main() -> Result<(), Box<dyn std::error::Error>> {

FILE: backends/v2/src/backend.rs
  type BackendV2 (line 16) | pub struct BackendV2 {
    method new (line 27) | pub(crate) fn new(
  method schedule (line 73) | fn schedule(
  method health (line 98) | async fn health(&self, current_health: bool) -> bool {
  method start_health (line 108) | fn start_health(&self) -> bool {
  method name (line 112) | fn name(&self) -> &'static str {
  function batching_task (line 122) | pub(crate) async fn batching_task(
  function prefill (line 240) | async fn prefill(
  function decode (line 280) | async fn decode(
  function filter_batch (line 327) | async fn filter_batch(
  function filter_send_generations (line 361) | fn filter_send_generations(generations: Vec<Generation>, entries: &mut I...
  function send_responses (line 386) | fn send_responses(
  function send_errors (line 478) | fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
  method from (line 495) | fn from(value: crate::client::GeneratedText) -> Self {

FILE: backends/v2/src/client/grpc_client.rs
  type Client (line 14) | pub struct Client {
    method connect (line 21) | pub async fn connect(uri: Uri) -> Result<Self> {
    method connect_uds (line 30) | pub async fn connect_uds(path: String) -> Result<Self> {
    method service_discovery (line 45) | pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
    method info (line 65) | pub async fn info(&mut self) -> Result<InfoResponse> {
    method health (line 73) | pub async fn health(&mut self) -> Result<HealthResponse> {
    method clear_cache (line 81) | pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<(...
    method filter_batch (line 89) | pub async fn filter_batch(
    method warmup (line 107) | pub async fn warmup(
    method prefill (line 188) | pub async fn prefill(
    method decode (line 206) | pub async fn decode(
  type PrefillTimings (line 225) | pub struct PrefillTimings {
    method new (line 232) | fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
  type DecodeTimings (line 241) | pub struct DecodeTimings {
    method new (line 249) | fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_...

FILE: backends/v2/src/client/mod.rs
  type Health (line 22) | pub trait Health {
    method device_health (line 24) | async fn device_health(&self) -> Result<()>;
    method model_health (line 28) | async fn model_health(&self) -> Result<()>;
  type ShardInfo (line 32) | pub struct ShardInfo {
  type ClientError (line 41) | pub enum ClientError {
    method from (line 51) | fn from(err: Status) -> Self {
    method from (line 59) | fn from(err: transport::Error) -> Self {
  type Result (line 68) | pub type Result<T> = std::result::Result<T, ClientError>;

FILE: backends/v2/src/client/sharded_client.rs
  type ShardedClient (line 18) | pub struct ShardedClient {
    method new (line 23) | fn new(clients: Vec<Client>) -> Self {
    method from_master_client (line 29) | async fn from_master_client(mut master_client: Client) -> Result<Self> {
    method connect (line 39) | pub async fn connect(uri: Uri) -> Result<Self> {
    method connect_uds (line 45) | pub async fn connect_uds(path: String) -> Result<Self> {
    method info (line 52) | pub async fn info(&mut self) -> Result<ShardInfo> {
    method health (line 63) | pub async fn health(&mut self) -> Result<HealthResponse> {
    method clear_cache (line 74) | pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<(...
    method filter_batch (line 85) | pub async fn filter_batch(
    method warmup (line 103) | pub async fn warmup(
    method prefill (line 135) | pub async fn prefill(
    method decode (line 168) | pub async fn decode(
  method from (line 198) | fn from(value: InfoResponse) -> Self {
  method device_health (line 211) | async fn device_health(&self) -> Result<()> {
  method model_health (line 216) | async fn model_health(&self) -> Result<()> {

FILE: backends/v2/src/lib.rs
  type BackendInfo (line 12) | pub struct BackendInfo {
  function connect_backend (line 33) | pub async fn connect_backend(
  type V2Error (line 130) | pub enum V2Error {

FILE: backends/v2/src/main.rs
  type Args (line 9) | struct Args {
  type Commands (line 82) | enum Commands {
  function main (line 87) | async fn main() -> Result<(), RouterError> {
  type RouterError (line 215) | enum RouterError {

FILE: backends/v2/src/queue.rs
  type Entry (line 18) | pub(crate) struct Entry {
  type Queue (line 35) | pub(crate) struct Queue {
    method new (line 41) | pub(crate) fn new(
    method append (line 63) | pub(crate) fn append(&self, entry: Entry) {
    method next_batch (line 73) | pub(crate) async fn next_batch(
  function queue_task (line 101) | async fn queue_task(
  type State (line 135) | struct State {
    method new (line 159) | fn new(
    method append (line 177) | fn append(&mut self, mut entry: Entry) {
    method next_batch (line 188) | fn next_batch(
  type NextBatch (line 349) | type NextBatch = (IntMap<u64, Entry>, Batch, Span);
  type QueueCommand (line 352) | enum QueueCommand {
  method from (line 365) | fn from(value: ValidParameters) -> Self {
  method from (line 392) | fn from(value: ValidStoppingParameters) -> Self {
  function default_entry (line 407) | fn default_entry() -> (
  function test_append (line 452) | fn test_append() {
  function test_next_batch_empty (line 468) | fn test_next_batch_empty() {
  function test_next_batch_min_size (line 476) | fn test_next_batch_min_size() {
  function test_next_batch_max_size (line 508) | fn test_next_batch_max_size() {
  function test_next_batch_token_budget (line 528) | fn test_next_batch_token_budget() {
  function test_queue_append (line 561) | async fn test_queue_append() {
  function test_queue_next_batch_empty (line 568) | async fn test_queue_next_batch_empty() {
  function test_queue_next_batch_min_size (line 576) | async fn test_queue_next_batch_min_size() {
  function test_queue_next_batch_max_size (line 609) | async fn test_queue_next_batch_max_size() {
  function test_queue_next_batch_token_budget (line 625) | async fn test_queue_next_batch_token_budget() {
  function test_queue_next_batch_token_speculate (line 650) | async fn test_queue_next_batch_token_speculate() {
  function test_queue_next_batch_dropped_receiver (line 669) | async fn test_queue_next_batch_dropped_receiver() {

FILE: backends/v3/benches/prefix_cache.rs
  function prefix_cache_benchmark (line 9) | fn prefix_cache_benchmark(c: &mut Criterion) {

FILE: backends/v3/build.rs
  function main (line 3) | fn main() -> Result<(), Box<dyn std::error::Error>> {

FILE: backends/v3/src/backend.rs
  type BackendV3 (line 18) | pub struct BackendV3 {
    method new (line 29) | pub(crate) fn new(
  method schedule (line 79) | fn schedule(
  method health (line 105) | async fn health(&self, current_health: bool) -> bool {
  method start_health (line 115) | fn start_health(&self) -> bool {
  method name (line 119) | fn name(&self) -> &'static str {
  function batching_task (line 129) | pub(crate) async fn batching_task(
  function prefill (line 297) | async fn prefill(
  function decode (line 342) | async fn decode(
  function filter_batch (line 389) | async fn filter_batch(
  function filter_send_generations (line 423) | fn filter_send_generations(generations: Vec<Generation>, entries: &mut I...
  function send_responses (line 448) | fn send_responses(
  function send_errors (line 540) | fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
  method from (line 557) | fn from(value: crate::client::GeneratedText) -> Self {

FILE: backends/v3/src/block_allocator.rs
  type BlockAllocation (line 7) | pub struct BlockAllocation {
  method drop (line 20) | fn drop(&mut self) {
  type BlockAllocator (line 28) | pub struct BlockAllocator {
    method new (line 34) | pub(crate) fn new(
    method allocate (line 57) | pub(crate) async fn allocate(
    method free (line 77) | pub(crate) fn free(&self, blocks: Vec<u32>, allocation_id: u64) {
  function block_allocator_task (line 87) | async fn block_allocator_task(
  type BlockAllocatorCommand (line 119) | enum BlockAllocatorCommand {
  type Allocator (line 131) | pub trait Allocator {
    method allocate (line 132) | fn allocate(
    method free (line 138) | fn free(&mut self, blocks: Vec<u32>, allocation_id: u64);
    method allocate (line 160) | fn allocate(
    method free (line 218) | fn free(&mut self, blocks: Vec<u32>, _allocation_id: u64) {
  type SimpleAllocator (line 140) | pub struct SimpleAllocator {
    method new (line 148) | fn new(blocks: u32, block_size: u32, window_size: Option<u32>) -> Self {

FILE: backends/v3/src/client/grpc_client.rs
  type Client (line 16) | pub struct Client {
    method connect (line 23) | pub async fn connect(uri: Uri) -> Result<Self> {
    method connect_uds (line 32) | pub async fn connect_uds(path: String) -> Result<Self> {
    method service_discovery (line 47) | pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
    method info (line 67) | pub async fn info(&mut self) -> Result<InfoResponse> {
    method health (line 75) | pub async fn health(&mut self) -> Result<HealthResponse> {
    method clear_cache (line 83) | pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<(...
    method filter_batch (line 91) | pub async fn filter_batch(
    method warmup (line 109) | pub async fn warmup(
    method prefill (line 230) | pub async fn prefill(
    method decode (line 258) | pub async fn decode(
  type PrefillTimings (line 277) | pub struct PrefillTimings {
    method new (line 285) | fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_...
  type DecodeTimings (line 295) | pub struct DecodeTimings {
    method new (line 303) | fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_...

FILE: backends/v3/src/client/mod.rs
  type Health (line 23) | pub trait Health {
    method device_health (line 25) | async fn device_health(&self) -> Result<()>;
    method model_health (line 29) | async fn model_health(&self) -> Result<()>;
  type ClientError (line 33) | pub enum ClientError {
    method from (line 43) | fn from(err: Status) -> Self {
    method from (line 51) | fn from(err: transport::Error) -> Self {
  method from (line 60) | fn from(chunk: Chunk) -> Self {
  type Result (line 67) | pub type Result<T> = std::result::Result<T, ClientError>;

FILE: backends/v3/src/client/sharded_client.rs
  type ShardedClient (line 18) | pub struct ShardedClient {
    method new (line 23) | fn new(clients: Vec<Client>) -> Self {
    method from_master_client (line 29) | async fn from_master_client(mut master_client: Client) -> Result<Self> {
    method connect (line 39) | pub async fn connect(uri: Uri) -> Result<Self> {
    method connect_uds (line 45) | pub async fn connect_uds(path: String) -> Result<Self> {
    method info (line 52) | pub async fn info(&mut self) -> Result<InfoResponse> {
    method health (line 63) | pub async fn health(&mut self) -> Result<HealthResponse> {
    method clear_cache (line 74) | pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<(...
    method filter_batch (line 85) | pub async fn filter_batch(
    method warmup (line 103) | pub async fn warmup(
    method prefill (line 142) | pub async fn prefill(
    method decode (line 176) | pub async fn decode(
  method device_health (line 207) | async fn device_health(&self) -> Result<()> {
  method model_health (line 212) | async fn model_health(&self) -> Result<()> {

FILE: backends/v3/src/lib.rs
  type BackendInfo (line 14) | pub struct BackendInfo {
  function connect_backend (line 48) | pub async fn connect_backend(
  type V3Error (line 172) | pub enum V3Error {

FILE: backends/v3/src/main.rs
  type Args (line 9) | struct Args {
  type Commands (line 82) | enum Commands {
  function main (line 87) | async fn main() -> Result<(), RouterError> {
  type RouterError (line 231) | enum RouterError {

FILE: backends/v3/src/queue.rs
  type Entry (line 22) | pub(crate) struct Entry {
  type Queue (line 41) | pub(crate) struct Queue {
    method new (line 47) | pub(crate) fn new(
    method append (line 76) | pub(crate) fn append(&self, entry: Entry) {
    method next_batch (line 86) | pub(crate) async fn next_batch(
  function queue_task (line 119) | async fn queue_task(
  type State (line 166) | struct State {
    method new (line 195) | fn new(
    method append (line 226) | fn append(&mut self, mut entry: Entry) {
    method next_batch (line 237) | async fn next_batch(
  type NextBatch (line 507) | type NextBatch = (IntMap<u64, Entry>, Batch, Span);
  type QueueCommand (line 510) | enum QueueCommand {
  method from (line 523) | fn from(value: ValidParameters) -> Self {
  method from (line 550) | fn from(value: ValidStoppingParameters) -> Self {
  function default_entry (line 566) | fn default_entry() -> (
  function test_append (line 612) | async fn test_append() {
  function test_next_batch_empty (line 628) | async fn test_next_batch_empty() {
  function test_next_batch_min_size (line 636) | async fn test_next_batch_min_size() {
  function test_next_batch_max_size (line 668) | async fn test_next_batch_max_size() {
  function test_next_batch_token_budget (line 688) | async fn test_next_batch_token_budget() {
  function test_queue_append (line 721) | async fn test_queue_append() {
  function test_queue_next_batch_empty (line 728) | async fn test_queue_next_batch_empty() {
  function test_queue_next_batch_min_size (line 736) | async fn test_queue_next_batch_min_size() {
  function test_queue_next_batch_max_size (line 769) | async fn test_queue_next_batch_max_size() {
  function test_queue_next_batch_token_budget (line 785) | async fn test_queue_next_batch_token_budget() {
  function test_queue_next_batch_token_speculate (line 810) | async fn test_queue_next_batch_token_speculate() {
  function test_queue_next_batch_dropped_receiver (line 829) | async fn test_queue_next_batch_dropped_receiver() {

FILE: backends/v3/src/radix.rs
  function hash (line 9) | fn hash(slice: &[u32]) -> u64 {
  type RadixAllocator (line 20) | pub struct RadixAllocator {
    method new (line 39) | pub fn new(block_size: u32, n_blocks: u32, window_size: Option<u32>) -...
    method alloc_or_reclaim (line 52) | fn alloc_or_reclaim(&mut self, n_blocks_needed: usize) -> Option<Vec<u...
  method allocate (line 82) | fn allocate(
  method free (line 157) | fn free(&mut self, blocks: Vec<u32>, allocation_id: u64) {
  type RadixAllocation (line 211) | struct RadixAllocation {
  type TrieError (line 230) | pub enum TrieError {
  type NodeId (line 235) | pub type NodeId = DefaultKey;
  type RadixTrie (line 238) | pub struct RadixTrie {
    method new (line 258) | pub fn new(block_size: usize) -> Self {
    method find (line 280) | pub fn find(&mut self, key: &[u32], blocks: &mut Vec<u32>) -> NodeId {
    method find_ (line 286) | fn find_(&mut self, node_id: NodeId, key: &[u32], blocks: &mut Vec<u32...
    method decref (line 313) | pub fn decref(&mut self, node_id: NodeId) -> Result<(), TrieError> {
    method incref (line 342) | pub fn incref(&mut self, node_id: NodeId) -> Result<(), TrieError> {
    method evict (line 363) | pub fn evict(&mut self, n_blocks: usize) -> Vec<u32> {
    method insert (line 416) | pub fn insert(&mut self, tokens: &[u32], blocks: &[u32]) -> Result<usi...
    method insert_ (line 423) | fn insert_(
    method split_node (line 473) | fn split_node(&mut self, node_id: NodeId, prefix_len: usize) -> NodeId {
    method add_node (line 509) | fn add_node(
    method add_node_to_parent (line 529) | fn add_node_to_parent(&mut self, parent_id: NodeId, hash: u64, child_i...
    method remove_node (line 540) | fn remove_node(&mut self, node_id: NodeId) -> TrieNode {
    method update_access_time (line 558) | fn update_access_time(&mut self, node_id: NodeId) {
    method print_debug (line 575) | pub fn print_debug(&self) {
    method print_debug_ (line 579) | fn print_debug_(&self, node_id: NodeId, indent: usize) {
    method root_id (line 597) | pub(crate) fn root_id(&self) -> DefaultKey {
  type TrieNode (line 604) | struct TrieNode {
    method new (line 614) | fn new(key: Vec<u32>, blocks: Vec<u32>, last_accessed: u64, parent: Op...
  function shared_prefix (line 626) | fn shared_prefix(left: &[u32], right: &[u32], block_size: usize) -> usize {
  function allocator_block_size (line 647) | fn allocator_block_size() {
  function allocator_block_size_non_aligned (line 662) | fn allocator_block_size_non_aligned() {
  function allocator_reuses_prefixes (line 677) | fn allocator_reuses_prefixes() {
  function allocator_collects_older_prefixes_first (line 691) | fn allocator_collects_older_prefixes_first() {
  function allocator_frees_fully_overlapping_prefills (line 711) | fn allocator_frees_fully_overlapping_prefills() {
  function allocator_frees_partially_overlapping_prefills (line 727) | fn allocator_frees_partially_overlapping_prefills() {
  function trie_insertions_have_correct_prefix_len (line 769) | fn trie_insertions_have_correct_prefix_len() {
  function trie_insertions_block_size (line 792) | fn trie_insertions_block_size() {
  function trie_get_returns_correct_blocks (line 816) | fn trie_get_returns_correct_blocks() {
  function trie_evict_removes_correct_blocks (line 850) | fn trie_evict_removes_correct_blocks() {
  function full_match_returns_correct_node (line 888) | fn full_match_returns_correct_node() {
  function partial_match_does_not_recurse (line 899) | fn partial_match_does_not_recurse() {
  type AllocationWithInfo (line 910) | struct AllocationWithInfo {
  function invariants_hold_on_many_operations_remove_all (line 919) | fn invariants_hold_on_many_operations_remove_all() {
  function invariants_hold_on_many_operations_remove_subset (line 924) | fn invariants_hold_on_many_operations_remove_subset() {
  function invariants_hold_on_many_insertions (line 928) | fn invariants_hold_on_many_insertions(remove_all: bool) {
  function check_allocation_invariants (line 1014) | fn check_allocation_invariants(allocations: &[AllocationWithInfo]) {

FILE: benchmark/src/app.rs
  type App (line 15) | pub(crate) struct App {
    method new (line 33) | pub(crate) fn new(
    method handle_key_event (line 69) | pub(crate) fn handle_key_event(&mut self, key_event: KeyEvent) {
    method tick (line 125) | pub(crate) fn tick(&mut self) {
    method render (line 155) | pub fn render(&mut self, f: &mut Frame) {
  type Data (line 367) | pub(crate) struct Data {
    method new (line 379) | fn new(n_run: usize, batch_size: Vec<u32>) -> Self {
    method push_prefill (line 406) | fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) {
    method push_decode (line 412) | fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
    method end_batch (line 420) | fn end_batch(&mut self, batch_idx: usize) {
  function progress_gauge (line 437) | fn progress_gauge(title: &str, label: String, progress: f64, color: Colo...
  function throughput_paragraph (line 446) | fn throughput_paragraph<'a>(throughput: &[f64], name: &'static str) -> P...
  function latency_paragraph (line 459) | fn latency_paragraph<'a>(latency: &mut [f64], name: &'static str) -> Par...
  function statis_spans (line 485) | fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
  function latency_histogram_data (line 516) | fn latency_histogram_data(latency: &[f64], bins: usize) -> Vec<(String, ...
  function latency_histogram (line 529) | fn latency_histogram<'a>(
  function latency_throughput_chart (line 544) | fn latency_throughput_chart<'a>(
  function color_vec (line 674) | fn color_vec() -> Vec<Color> {

FILE: benchmark/src/event.rs
  type Event (line 8) | pub(crate) enum Event {
  function terminal_event_task (line 17) | pub(crate) async fn terminal_event_task(
  function event_loop (line 33) | async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {

FILE: benchmark/src/generation.rs
  constant LOREM_IPSUM (line 10) | const LOREM_IPSUM: &str = "Lorem ipsum dolor sit amet, consectetur adipi...
  type Prefill (line 13) | pub(crate) struct Prefill {
  type Decode (line 19) | pub(crate) struct Decode {
  type Message (line 26) | pub(crate) enum Message {
  function generation_task (line 36) | pub(crate) async fn generation_task(
  function generate_runs (line 64) | async fn generate_runs(
  function prefill (line 132) | async fn prefill(
  function decode (line 197) | async fn decode(batch: CachedBatch, client: &mut ShardedClient) -> Resul...
  function create_sequence (line 227) | fn create_sequence(sequence_length: u32, tokenizer: Tokenizer) -> String {

FILE: benchmark/src/lib.rs
  function run (line 19) | pub async fn run(

FILE: benchmark/src/main.rs
  type Args (line 16) | struct Args {
  function main (line 108) | fn main() -> Result<(), Box<dyn std::error::Error>> {
  function init_logging (line 211) | fn init_logging() {

FILE: benchmark/src/table.rs
  function parameters_table (line 6) | pub(crate) fn parameters_table(
  function latency_table (line 46) | pub(crate) fn latency_table(data: &Data) -> Table {
  function throughput_table (line 84) | pub(crate) fn throughput_table(data: &Data) -> Table {
  function add_latencies (line 107) | fn add_latencies(
  function add_throuhgputs (line 132) | fn add_throuhgputs(
  function avg_min_max (line 154) | fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
  function px (line 167) | fn px(data: &[f64], p: u32) -> f64 {
  function format_value (line 172) | fn format_value(value: f64, unit: &'static str) -> String {

FILE: benchmark/src/utils.rs
  function histogram (line 16) | pub(crate) fn histogram(values: &[f64], bins: usize) -> Vec<(f64, usize)> {
  function percentiles (line 35) | pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<S...

FILE: clients/python/tests/conftest.py
  function flan_t5_xxl (line 8) | def flan_t5_xxl():
  function llama_7b (line 13) | def llama_7b():
  function fake_model (line 18) | def fake_model():
  function unsupported_model (line 23) | def unsupported_model():
  function base_url (line 28) | def base_url():
  function bloom_url (line 33) | def bloom_url(base_url, bloom_model):
  function flan_t5_xxl_url (line 38) | def flan_t5_xxl_url(base_url, flan_t5_xxl):
  function llama_7b_url (line 43) | def llama_7b_url(base_url, llama_7b):
  function fake_url (line 48) | def fake_url(base_url, fake_model):
  function unsupported_url (line 53) | def unsupported_url(base_url, unsupported_model):
  function hf_headers (line 58) | def hf_headers():

FILE: clients/python/tests/test_client.py
  function test_generate (line 8) | def test_generate(llama_7b_url, hf_headers):
  function test_generate_best_of (line 24) | def test_generate_best_of(llama_7b_url, hf_headers):
  function test_generate_not_found (line 36) | def test_generate_not_found(fake_url, hf_headers):
  function test_generate_validation_error (line 42) | def test_generate_validation_error(llama_7b_url, hf_headers):
  function test_generate_stream (line 48) | def test_generate_stream(llama_7b_url, hf_headers):
  function test_generate_stream_not_found (line 63) | def test_generate_stream_not_found(fake_url, hf_headers):
  function test_generate_stream_validation_error (line 69) | def test_generate_stream_validation_error(llama_7b_url, hf_headers):
  function test_generate_async (line 76) | async def test_generate_async(llama_7b_url, hf_headers):
  function test_generate_async_best_of (line 98) | async def test_generate_async_best_of(llama_7b_url, hf_headers):
  function test_generate_async_not_found (line 111) | async def test_generate_async_not_found(fake_url, hf_headers):
  function test_generate_async_validation_error (line 118) | async def test_generate_async_validation_error(llama_7b_url, hf_headers):
  function test_generate_stream_async (line 125) | async def test_generate_stream_async(llama_7b_url, hf_headers):
  function test_generate_stream_async_not_found (line 141) | async def test_generate_stream_async_not_found(fake_url, hf_headers):
  function test_generate_stream_async_validation_error (line 149) | async def test_generate_stream_async_validation_error(llama_7b_url, hf_h...

FILE: clients/python/tests/test_errors.py
  function test_generation_error (line 16) | def test_generation_error():
  function test_incomplete_generation_error (line 21) | def test_incomplete_generation_error():
  function test_overloaded_error (line 26) | def test_overloaded_error():
  function test_validation_error (line 31) | def test_validation_error():
  function test_bad_request_error (line 36) | def test_bad_request_error():
  function test_shard_not_ready_error (line 41) | def test_shard_not_ready_error():
  function test_shard_timeout_error (line 47) | def test_shard_timeout_error():
  function test_not_found_error (line 52) | def test_not_found_error():
  function test_rate_limit_exceeded_error (line 57) | def test_rate_limit_exceeded_error():
  function test_unknown_error (line 62) | def test_unknown_error():

FILE: clients/python/tests/test_types.py
  function test_parameters_validation (line 7) | def test_parameters_validation():
  function test_request_validation (line 72) | def test_request_validation():

FILE: clients/python/text_generation/client.py
  class Client (line 31) | class Client:
    method __init__ (line 52) | def __init__(
    method completion (line 76) | def completion(
    method _completion_stream_response (line 142) | def _completion_stream_response(self, request):
    method chat (line 164) | def chat(
    method _chat_stream_response (line 264) | def _chat_stream_response(self, request):
    method generate (line 286) | def generate(
    method generate_stream (line 392) | def generate_stream(
  class AsyncClient (line 513) | class AsyncClient:
    method __init__ (line 535) | def __init__(
    method completion (line 559) | async def completion(
    method _completion_single_response (line 615) | async def _completion_single_response(self, request):
    method _completion_stream_response (line 627) | async def _completion_stream_response(self, request):
    method chat (line 646) | async def chat(
    method _chat_single_response (line 736) | async def _chat_single_response(self, request):
    method _chat_stream_response (line 748) | async def _chat_stream_response(self, request):
    method generate (line 772) | async def generate(
    method generate_stream (line 877) | async def generate_stream(

FILE: clients/python/text_generation/errors.py
  class ValidationError (line 5) | class ValidationError(Exception):
    method __init__ (line 6) | def __init__(self, message: str):
  class GenerationError (line 10) | class GenerationError(Exception):
    method __init__ (line 11) | def __init__(self, message: str):
  class OverloadedError (line 15) | class OverloadedError(Exception):
    method __init__ (line 16) | def __init__(self, message: str):
  class IncompleteGenerationError (line 20) | class IncompleteGenerationError(Exception):
    method __init__ (line 21) | def __init__(self, message: str):
  class BadRequestError (line 26) | class BadRequestError(Exception):
    method __init__ (line 27) | def __init__(self, message: str):
  class ShardNotReadyError (line 31) | class ShardNotReadyError(Exception):
    method __init__ (line 32) | def __init__(self, message: str):
  class ShardTimeoutError (line 36) | class ShardTimeoutError(Exception):
    method __init__ (line 37) | def __init__(self, message: str):
  class NotFoundError (line 41) | class NotFoundError(Exception):
    method __init__ (line 42) | def __init__(self, message: str):
  class RateLimitExceededError (line 46) | class RateLimitExceededError(Exception):
    method __init__ (line 47) | def __init__(self, message: str):
  class NotSupportedError (line 51) | class NotSupportedError(Exception):
    method __init__ (line 52) | def __init__(self, model_id: str):
  class UnknownError (line 61) | class UnknownError(Exception):
    method __init__ (line 62) | def __init__(self, message: str):
  function parse_error (line 66) | def parse_error(status_code: int, payload: Dict[str, str]) -> Exception:

FILE: clients/python/text_generation/inference_api.py
  function deployed_models (line 16) | def deployed_models(headers: Optional[Dict] = None) -> List[DeployedModel]:
  function check_model_support (line 37) | def check_model_support(repo_id: str, headers: Optional[Dict] = None) ->...
  class InferenceAPIClient (line 59) | class InferenceAPIClient(Client):
    method __init__ (line 83) | def __init__(self, repo_id: str, token: Optional[str] = None, timeout:...
  class InferenceAPIAsyncClient (line 115) | class InferenceAPIAsyncClient(AsyncClient):
    method __init__ (line 140) | def __init__(self, repo_id: str, token: Optional[str] = None, timeout:...

FILE: clients/python/text_generation/types.py
  class GrammarType (line 9) | class GrammarType(str, Enum):
  class Grammar (line 15) | class Grammar(BaseModel):
  class ToolCall (line 22) | class ToolCall(BaseModel):
  class Chunk (line 31) | class Chunk(BaseModel):
  class Message (line 37) | class Message(BaseModel):
  class Tool (line 48) | class Tool(BaseModel):
  class Function (line 55) | class Function(BaseModel):
  class ChoiceDeltaToolCall (line 60) | class ChoiceDeltaToolCall(BaseModel):
  class ChoiceDelta (line 67) | class ChoiceDelta(BaseModel):
  class Choice (line 73) | class Choice(BaseModel):
  class CompletionRequest (line 80) | class CompletionRequest(BaseModel):
  class CompletionComplete (line 106) | class CompletionComplete(BaseModel):
  class Completion (line 117) | class Completion(BaseModel):
  class ChatRequest (line 127) | class ChatRequest(BaseModel):
  class ChatCompletionComplete (line 169) | class ChatCompletionComplete(BaseModel):
  class ChatComplete (line 182) | class ChatComplete(BaseModel):
  class ChatCompletionChunk (line 193) | class ChatCompletionChunk(BaseModel):
  class Parameters (line 203) | class Parameters(BaseModel):
    method valid_best_of (line 247) | def valid_best_of(cls, field_value, values):
    method valid_repetition_penalty (line 266) | def valid_repetition_penalty(cls, v):
    method valid_frequency_penalty (line 272) | def valid_frequency_penalty(cls, v):
    method valid_seed (line 278) | def valid_seed(cls, v):
    method valid_temp (line 284) | def valid_temp(cls, v):
    method valid_top_k (line 290) | def valid_top_k(cls, v):
    method valid_top_p (line 296) | def valid_top_p(cls, v):
    method valid_truncate (line 302) | def valid_truncate(cls, v):
    method valid_typical_p (line 308) | def valid_typical_p(cls, v):
    method valid_top_n_tokens (line 314) | def valid_top_n_tokens(cls, v):
    method valid_grammar (line 320) | def valid_grammar(cls, v):
  class Request (line 329) | class Request(BaseModel):
    method valid_input (line 338) | def valid_input(cls, v):
    method valid_best_of_stream (line 344) | def valid_best_of_stream(cls, field_value, values):
  class InputToken (line 359) | class InputToken(BaseModel):
  class Token (line 370) | class Token(BaseModel):
  class FinishReason (line 383) | class FinishReason(str, Enum):
  class BestOfSequence (line 393) | class BestOfSequence(BaseModel):
  class Details (line 411) | class Details(BaseModel):
  class Response (line 429) | class Response(BaseModel):
  class StreamDetails (line 437) | class StreamDetails(BaseModel):
  class StreamResponse (line 447) | class StreamResponse(BaseModel):
  class DeployedModel (line 461) | class DeployedModel(BaseModel):

FILE: integration-tests/conftest.py
  class SessionTimeoutFix (line 19) | class SessionTimeoutFix(requests.Session):
    method request (line 20) | def request(self, *args, **kwargs):
  function pytest_addoption (line 68) | def pytest_addoption(parser):
  function pytest_configure (line 86) | def pytest_configure(config):
  function pytest_collection_modifyitems (line 91) | def pytest_collection_modifyitems(config, items):
  function container_log (line 139) | def container_log(request: SubRequest):
  class ResponseComparator (line 151) | class ResponseComparator(JSONSnapshotExtension):
    method _serialize (line 155) | def _serialize(
    method serialize (line 181) | def serialize(
    method matches (line 201) | def matches(
  class GenerousResponseComparator (line 385) | class GenerousResponseComparator(ResponseComparator):
  class IgnoreLogProbResponseComparator (line 390) | class IgnoreLogProbResponseComparator(ResponseComparator):
  class LauncherHandle (line 394) | class LauncherHandle:
    method __init__ (line 395) | def __init__(self, port: int, error_log):
    method _inner_health (line 400) | def _inner_health(self):
    method health (line 403) | async def health(self, timeout: int = 60):
  class ContainerLauncherHandle (line 421) | class ContainerLauncherHandle(LauncherHandle):
    method __init__ (line 422) | def __init__(self, docker_client, container_name, port: int, error_log):
    method _inner_health (line 427) | def _inner_health(self) -> bool:
  class ProcessLauncherHandle (line 432) | class ProcessLauncherHandle(LauncherHandle):
    method __init__ (line 433) | def __init__(self, process, port: int, error_log):
    method _inner_health (line 437) | def _inner_health(self) -> bool:
  function response_snapshot (line 442) | def response_snapshot(snapshot):
  function generous_response_snapshot (line 447) | def generous_response_snapshot(snapshot):
  function ignore_logprob_response_snapshot (line 452) | def ignore_logprob_response_snapshot(snapshot):
  function error_log (line 457) | def error_log():
  function launcher (line 463) | async def launcher(error_log):
  function generate_load (line 734) | def generate_load():
  function generate_multi (line 762) | def generate_multi():
  function chicken (line 797) | def chicken():
  function cow_beach (line 806) | def cow_beach():

FILE: integration-tests/fixtures/gaudi/service.py
  function stream_container_logs (line 58) | def stream_container_logs(container, test_name):
  class TestClient (line 72) | class TestClient(AsyncInferenceClient):
    method __init__ (line 73) | def __init__(self, service_name: str, base_url: str):
  class LauncherHandle (line 78) | class LauncherHandle:
    method __init__ (line 79) | def __init__(self, service_name: str, port: int):
    method _inner_health (line 82) | def _inner_health(self):
    method health (line 85) | async def health(self, timeout: int = 60):
  class ContainerLauncherHandle (line 118) | class ContainerLauncherHandle(LauncherHandle):
    method __init__ (line 119) | def __init__(self, docker_client, container_name, port: int):
    method _inner_health (line 125) | def _inner_health(self) -> bool:
  class ProcessLauncherHandle (line 140) | class ProcessLauncherHandle(LauncherHandle):
    method __init__ (line 141) | def __init__(self, process, port: int):
    method _inner_health (line 146) | def _inner_health(self) -> bool:
  function data_volume (line 151) | def data_volume():
  function gaudi_launcher (line 162) | def gaudi_launcher():
  function gaudi_generate_load (line 292) | def gaudi_generate_load():

FILE: integration-tests/fixtures/neuron/export_models.py
  function get_neuron_backend_hash (line 79) | def get_neuron_backend_hash():
  function get_neuron_model_name (line 104) | def get_neuron_model_name(config_name: str):
  function get_tgi_docker_image (line 108) | def get_tgi_docker_image():
  function maybe_export_model (line 121) | def maybe_export_model(config_name, model_config):
  function maybe_export_models (line 218) | def maybe_export_models():
  function neuron_model_config (line 224) | def neuron_model_config(request):
  function neuron_model_path (line 269) | def neuron_model_path(neuron_model_config):

FILE: integration-tests/fixtures/neuron/service.py
  function get_tgi_docker_image (line 24) | def get_tgi_docker_image():
  class TestClient (line 45) | class TestClient(AsyncInferenceClient):
    method __init__ (line 46) | def __init__(self, service_name: str, base_url: str):
  class LauncherHandle (line 51) | class LauncherHandle:
    method __init__ (line 52) | def __init__(self, service_name: str, port: int):
    method _inner_health (line 55) | def _inner_health(self):
    method health (line 58) | async def health(self, timeout: int = 60):
  class ContainerLauncherHandle (line 75) | class ContainerLauncherHandle(LauncherHandle):
    method __init__ (line 76) | def __init__(self, service_name, docker_client, container_name, port: ...
    method _inner_health (line 82) | def _inner_health(self) -> bool:
  function event_loop (line 92) | def event_loop():
  function neuron_launcher (line 99) | def neuron_launcher(event_loop):
  function neuron_generate_load (line 239) | def neuron_generate_load():

FILE: integration-tests/gaudi/capture_expected_outputs.py
  function test_config (line 17) | def test_config(request) -> Dict[str, Any]:
  function test_name (line 25) | def test_name(test_config):
  function tgi_service (line 30) | def tgi_service(launcher, test_config, test_name) -> Generator:
  function test_capture_expected_outputs (line 37) | async def test_capture_expected_outputs(tgi_service, test_config, test_n...

FILE: integration-tests/gaudi/test_gaudi_generate.py
  function pytest_configure (line 7) | def pytest_configure(config):
  function pytest_generate_tests (line 193) | def pytest_generate_tests(metafunc):
  function test_config (line 208) | def test_config(request: SubRequest) -> Dict[str, Any]:
  function model_id (line 217) | def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]:
  function test_name (line 222) | def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]:
  function expected_outputs (line 227) | def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, str]:
  function input (line 235) | def input(test_config: Dict[str, Any]) -> str:
  function tgi_service (line 240) | def tgi_service(
  function tgi_client (line 253) | async def tgi_client(tgi_service) -> AsyncInferenceClient:
  function test_model_single_request (line 260) | async def test_model_single_request(
  function test_model_multiple_requests (line 276) | async def test_model_multiple_requests(

FILE: integration-tests/models/test_bloom_560m.py
  function bloom_560_handle (line 5) | def bloom_560_handle(launcher):
  function bloom_560 (line 11) | async def bloom_560(bloom_560_handle):
  function test_bloom_560m (line 18) | async def test_bloom_560m(bloom_560, response_snapshot):
  function test_bloom_560m_all_params (line 33) | async def test_bloom_560m_all_params(bloom_560, response_snapshot):
  function test_bloom_560m_load (line 56) | async def test_bloom_560m_load(bloom_560, generate_load, response_snapsh...

FILE: integration-tests/models/test_bloom_560m_sharded.py
  function bloom_560m_sharded_handle (line 5) | def bloom_560m_sharded_handle(launcher):
  function bloom_560m_sharded (line 11) | async def bloom_560m_sharded(bloom_560m_sharded_handle):
  function test_bloom_560m_sharded (line 18) | async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot):
  function test_bloom_560m_sharded_load (line 33) | async def test_bloom_560m_sharded_load(

FILE: integration-tests/models/test_chat_llama.py
  function flash_llama_chat_handle (line 5) | def flash_llama_chat_handle(launcher):
  function flash_llama_chat (line 13) | async def flash_llama_chat(flash_llama_chat_handle):
  function test_flash_llama_simple (line 19) | async def test_flash_llama_simple(flash_llama_chat, response_snapshot):

FILE: integration-tests/models/test_chat_stream_options.py
  function chat_handle (line 5) | def chat_handle(launcher):
  function chat_client (line 13) | async def chat_client(chat_handle):

FILE: integration-tests/models/test_completion_prompts.py
  function flash_llama_completion_handle (line 8) | def flash_llama_completion_handle(launcher):
  function flash_llama_completion (line 16) | async def flash_llama_completion(flash_llama_completion_handle):
  function test_flash_llama_completion_single_prompt (line 26) | def test_flash_llama_completion_single_prompt(
  function test_flash_llama_completion_stream_usage (line 50) | async def test_flash_llama_completion_stream_usage(
  function test_flash_llama_completion_many_prompts (line 118) | def test_flash_llama_completion_many_prompts(flash_llama_completion, res...
  function test_flash_llama_completion_many_prompts_stream (line 154) | async def test_flash_llama_completion_many_prompts_stream(
  function test_chat_openai_usage (line 190) | async def test_chat_openai_usage(flash_llama_completion, response_snapsh...
  function test_chat_openai_nousage (line 214) | async def test_chat_openai_nousage(flash_llama_completion, response_snap...
  function test_chat_hfhub_usage (line 235) | async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot):
  function test_chat_hfhub_nousage (line 259) | async def test_chat_hfhub_nousage(flash_llama_completion, response_snaps...

FILE: integration-tests/models/test_compressed_tensors_w8a8_int.py
  function compressed_tensors_w8a8_int_handle (line 5) | def compressed_tensors_w8a8_int_handle(launcher):
  function compressed_tensors_w8a8_int (line 15) | async def compressed_tensors_w8a8_int(compressed_tensors_w8a8_int_handle):
  function test_compressed_tensors_w8a8_int (line 23) | async def test_compressed_tensors_w8a8_int(
  function test_compressed_tensors_w8a8_int_all_params (line 43) | async def test_compressed_tensors_w8a8_int_all_params(
  function test_compressed_tensors_w8a8_int_load (line 73) | async def test_compressed_tensors_w8a8_int_load(

FILE: integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py
  function compressed_tensors_w8a8_int_dynamic_weight_handle (line 5) | def compressed_tensors_w8a8_int_dynamic_weight_handle(launcher):
  function compressed_tensors_w8a8_int_dynamic_weight (line 15) | async def compressed_tensors_w8a8_int_dynamic_weight(
  function test_compressed_tensors_w8a8_int_dynamic_weight (line 25) | async def test_compressed_tensors_w8a8_int_dynamic_weight(
  function test_compressed_tensors_w8a8_int_dynamic_weight_all_params (line 46) | async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params(
  function test_compressed_tensors_w8a8_int_dynamic_weight_load (line 76) | async def test_compressed_tensors_w8a8_int_dynamic_weight_load(

FILE: integration-tests/models/test_compressed_tensors_w8an_fp.py
  function compressed_tensors_w8an_handle (line 5) | def compressed_tensors_w8an_handle(launcher):
  function compressed_tensors_w8an (line 15) | async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
  function test_compressed_tensors_w8an (line 23) | async def test_compressed_tensors_w8an(compressed_tensors_w8an, response...
  function test_compressed_tensors_w8an_all_params (line 39) | async def test_compressed_tensors_w8an_all_params(
  function test_compressed_tensors_w8an_load (line 69) | async def test_compressed_tensors_w8an_load(

FILE: integration-tests/models/test_compressed_tensors_wna16_int.py
  function compressed_tensors_wna16_handle (line 5) | def compressed_tensors_wna16_handle(launcher):
  function compressed_tensors_wna16 (line 15) | async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
  function test_compressed_tensors_wna16 (line 23) | async def test_compressed_tensors_wna16(compressed_tensors_wna16, respon...
  function test_compressed_tensors_wna16_all_params (line 39) | async def test_compressed_tensors_wna16_all_params(
  function test_compressed_tensors_wna16_load (line 69) | async def test_compressed_tensors_wna16_load(

FILE: integration-tests/models/test_compressed_tensors_wna16_int_24.py
  function compressed_tensors_wna16_int_24_handle (line 5) | def compressed_tensors_wna16_int_24_handle(launcher):
  function compressed_tensors_wna16_int_24 (line 15) | async def compressed_tensors_wna16_int_24(compressed_tensors_wna16_int_2...
  function test_compressed_tensors_wna16_int_24 (line 23) | async def test_compressed_tensors_wna16_int_24(
  function test_compressed_tensors_wna16_int_24_all_params (line 43) | async def test_compressed_tensors_wna16_int_24_all_params(
  function test_compressed_tensors_wna16_int_24_load (line 73) | async def test_compressed_tensors_wna16_int_24_load(

FILE: integration-tests/models/test_continue_final_message.py
  function llama_continue_final_message_handle (line 6) | def llama_continue_final_message_handle(launcher):
  function llama_continue_final_message (line 12) | async def llama_continue_final_message(llama_continue_final_message_hand...
  function test_llama_completion_single_prompt (line 17) | def test_llama_completion_single_prompt(
  function test_llama_completion_single_prompt_continue (line 46) | def test_llama_completion_single_prompt_continue(

FILE: integration-tests/models/test_flash_awq.py
  function flash_llama_awq_handle (line 5) | def flash_llama_awq_handle(launcher):
  function flash_llama_awq (line 15) | async def flash_llama_awq(flash_llama_awq_handle):
  function test_flash_llama_awq (line 22) | async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
  function test_flash_llama_awq_all_params (line 37) | async def test_flash_llama_awq_all_params(flash_llama_awq, response_snap...
  function test_flash_llama_awq_load (line 59) | async def test_flash_llama_awq_load(flash_llama_awq, generate_load, resp...

FILE: integration-tests/models/test_flash_awq_sharded.py
  function flash_llama_awq_handle_sharded (line 5) | def flash_llama_awq_handle_sharded(launcher):
  function flash_llama_awq_sharded (line 15) | async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded):
  function test_flash_llama_awq_sharded (line 22) | async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response...
  function test_flash_llama_awq_load_sharded (line 37) | async def test_flash_llama_awq_load_sharded(

FILE: integration-tests/models/test_flash_deepseek_v2.py
  function flash_deepseek_v2_handle (line 5) | def flash_deepseek_v2_handle(launcher):
  function flash_deepseek_v2 (line 11) | async def flash_deepseek_v2(flash_deepseek_v2_handle):
  function test_flash_deepseek_v2 (line 19) | async def test_flash_deepseek_v2(flash_deepseek_v2, response_snapshot):
  function test_flash_deepseek_v2_all_params (line 30) | async def test_flash_deepseek_v2_all_params(flash_deepseek_v2, response_...
  function test_flash_deepseek_v2_load (line 53) | async def test_flash_deepseek_v2_load(

FILE: integration-tests/models/test_flash_falcon.py
  function flash_falcon_handle (line 5) | def flash_falcon_handle(launcher):
  function flash_falcon (line 11) | async def flash_falcon(flash_falcon_handle):
  function test_flash_falcon (line 19) | async def test_flash_falcon(flash_falcon, response_snapshot):
  function test_flash_falcon_all_params (line 33) | async def test_flash_falcon_all_params(flash_falcon, response_snapshot):
  function test_flash_falcon_load (line 57) | async def test_flash_falcon_load(flash_falcon, generate_load, response_s...

FILE: integration-tests/models/test_flash_gemma.py
  function flash_gemma_handle (line 5) | def flash_gemma_handle(launcher):
  function flash_gemma (line 11) | async def flash_gemma(flash_gemma_handle):
  function test_flash_gemma_simple (line 19) | async def test_flash_gemma_simple(flash_gemma, response_snapshot):
  function test_flash_gemma_all_params (line 31) | async def test_flash_gemma_all_params(flash_gemma, response_snapshot):
  function test_flash_gemma_load (line 55) | async def test_flash_gemma_load(flash_gemma, generate_load, response_sna...

FILE: integration-tests/models/test_flash_gemma2.py
  function flash_gemma2_handle (line 5) | def flash_gemma2_handle(launcher):
  function flash_gemma2 (line 11) | async def flash_gemma2(flash_gemma2_handle):
  function test_flash_gemma2 (line 19) | async def test_flash_gemma2(flash_gemma2, response_snapshot):
  function test_flash_gemma2_load (line 34) | async def test_flash_gemma2_load(flash_gemma2, generate_load, response_s...

FILE: integration-tests/models/test_flash_gemma3.py
  function flash_gemma3_handle (line 9) | def flash_gemma3_handle(launcher):
  function flash_gemma3 (line 15) | async def flash_gemma3(flash_gemma3_handle):
  function test_flash_gemma3 (line 20) | async def test_flash_gemma3(flash_gemma3, response_snapshot):
  function test_flash_gemma3_image_cow_dog (line 35) | async def test_flash_gemma3_image_cow_dog(flash_gemma3, response_snapshot):
  function test_flash_gemma3_image_cow (line 62) | async def test_flash_gemma3_image_cow(flash_gemma3, response_snapshot):
  function test_exceed_window (line 85) | async def test_exceed_window(flash_gemma3, response_snapshot):
  function image_to_data_url (line 101) | def image_to_data_url(img: Image.Image, fmt: str) -> str:
  function test_flash_gemma3_image_base64_rgba (line 110) | async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_sna...
  function test_flash_gemma3_image_base64_rgb_png (line 133) | async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_...
  function test_flash_gemma3_image_base64_rgb_jpg (line 153) | async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_...

FILE: integration-tests/models/test_flash_gemma_gptq.py
  function flash_gemma_gptq_handle (line 5) | def flash_gemma_gptq_handle(launcher):
  function flash_gemma_gptq (line 11) | async def flash_gemma_gptq(flash_gemma_gptq_handle):
  function test_flash_gemma_gptq (line 19) | async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_respons...
  function test_flash_gemma_gptq_all_params (line 31) | async def test_flash_gemma_gptq_all_params(
  function test_flash_gemma_gptq_load (line 57) | async def test_flash_gemma_gptq_load(

FILE: integration-tests/models/test_flash_gpt2.py
  function flash_gpt2_handle (line 5) | def flash_gpt2_handle(launcher):
  function flash_gpt2 (line 11) | async def flash_gpt2(flash_gpt2_handle):
  function test_flash_gpt2 (line 18) | async def test_flash_gpt2(flash_gpt2, response_snapshot):
  function test_flash_gpt2_load (line 31) | async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snaps...

FILE: integration-tests/models/test_flash_grammar_llama.py
  function flash_llama_grammar_handle (line 8) | def flash_llama_grammar_handle(launcher):
  function flash_llama_grammar (line 16) | async def flash_llama_grammar(flash_llama_grammar_handle):
  function test_flash_llama_grammar (line 22) | async def test_flash_llama_grammar(flash_llama_grammar, response_snapshot):
  function test_flash_llama_grammar_regex (line 33) | async def test_flash_llama_grammar_regex(flash_llama_grammar, response_s...
  function test_flash_llama_grammar_json (line 52) | async def test_flash_llama_grammar_json(flash_llama_grammar, response_sn...
  function test_flash_llama_grammar_load (line 101) | async def test_flash_llama_grammar_load(
  function test_flash_llama_grammar_single_load_instance (line 133) | async def test_flash_llama_grammar_single_load_instance(

FILE: integration-tests/models/test_flash_llama.py
  function flash_llama_handle (line 5) | def flash_llama_handle(launcher):
  function flash_llama (line 11) | async def flash_llama(flash_llama_handle):
  function test_flash_llama_simple (line 18) | async def test_flash_llama_simple(flash_llama, response_snapshot):
  function test_flash_llama_all_params (line 29) | async def test_flash_llama_all_params(flash_llama, response_snapshot):
  function test_flash_llama_load (line 52) | async def test_flash_llama_load(flash_llama, generate_load, response_sna...

FILE: integration-tests/models/test_flash_llama_exl2.py
  function flash_llama_exl2_handle (line 5) | def flash_llama_exl2_handle(launcher):
  function flash_llama_exl2 (line 19) | async def flash_llama_exl2(flash_llama_exl2_handle):
  function test_flash_llama_exl2 (line 26) | async def test_flash_llama_exl2(flash_llama_exl2, ignore_logprob_respons...
  function test_flash_llama_exl2_all_params (line 37) | async def test_flash_llama_exl2_all_params(
  function test_flash_llama_exl2_load (line 63) | async def test_flash_llama_exl2_load(

FILE: integration-tests/models/test_flash_llama_fp8.py
  function flash_llama_fp8_handle (line 5) | def flash_llama_fp8_handle(launcher):
  function flash_llama_fp8 (line 11) | async def flash_llama_fp8(flash_llama_fp8_handle):
  function test_flash_llama_fp8 (line 20) | async def test_flash_llama_fp8(flash_llama_fp8, response_snapshot):
  function test_flash_llama_fp8_all_params (line 34) | async def test_flash_llama_fp8_all_params(flash_llama_fp8, response_snap...
  function test_flash_llama_fp8_load (line 58) | async def test_flash_llama_fp8_load(flash_llama_fp8, generate_load, resp...

FILE: integration-tests/models/test_flash_llama_fp8_kv_cache.py
  function flash_llama_fp8_kv_cache_handle (line 5) | def flash_llama_fp8_kv_cache_handle(launcher):
  function flash_llama_fp8_kv_cache (line 15) | async def flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache_handle):
  function test_flash_llama_fp8_kv_cache (line 23) | async def test_flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache, respon...
  function test_flash_llama_fp8_kv_cache_all_params (line 39) | async def test_flash_llama_fp8_kv_cache_all_params(
  function test_flash_llama_fp8_kv_cache_load (line 64) | async def test_flash_llama_fp8_kv_cache_load(

FILE: integration-tests/models/test_flash_llama_gptq.py
  function flash_llama_gptq_handle (line 5) | def flash_llama_gptq_handle(launcher):
  function flash_llama_gptq (line 13) | async def flash_llama_gptq(flash_llama_gptq_handle):
  function test_flash_llama_gptq (line 21) | async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
  function test_flash_llama_gptq_all_params (line 33) | async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_sn...
  function test_flash_llama_gptq_load (line 56) | async def test_flash_llama_gptq_load(

FILE: integration-tests/models/test_flash_llama_marlin.py
  function flash_llama_marlin_handle (line 5) | def flash_llama_marlin_handle(launcher):
  function flash_llama_marlin (line 13) | async def flash_llama_marlin(flash_llama_marlin_handle):
  function test_flash_llama_marlin (line 21) | async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
  function test_flash_llama_marlin_all_params (line 33) | async def test_flash_llama_marlin_all_params(flash_llama_marlin, respons...
  function test_flash_llama_marlin_load (line 56) | async def test_flash_llama_marlin_load(

FILE: integration-tests/models/test_flash_llama_marlin_24.py
  function flash_llama_marlin24_handle (line 5) | def flash_llama_marlin24_handle(launcher):
  function flash_llama_marlin (line 13) | async def flash_llama_marlin(flash_llama_marlin24_handle):
  function test_flash_llama_marlin (line 22) | async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
  function test_flash_llama_marlin24_all_params (line 35) | async def test_flash_llama_marlin24_all_params(flash_llama_marlin, respo...
  function test_flash_llama_marlin24_load (line 59) | async def test_flash_llama_marlin24_load(

FILE: integration-tests/models/test_flash_llama_prefix.py
  function flash_llama_handle (line 5) | def flash_llama_handle(launcher):
  function flash_llama (line 11) | async def flash_llama(flash_llama_handle):
  function test_flash_llama_load (line 18) | async def test_flash_llama_load(

FILE: integration-tests/models/test_flash_llama_prefix_flashdecoding.py
  function flash_llama_handle_fd (line 5) | def flash_llama_handle_fd(launcher):
  function flash_llama_fd (line 13) | async def flash_llama_fd(flash_llama_handle_fd):
  function test_flash_llama_flashdecoding (line 20) | async def test_flash_llama_flashdecoding(

FILE: integration-tests/models/test_flash_medusa.py
  function flash_medusa_handle (line 5) | def flash_medusa_handle(launcher):
  function flash_medusa (line 13) | async def flash_medusa(flash_medusa_handle):
  function test_flash_medusa_simple (line 19) | async def test_flash_medusa_simple(flash_medusa, response_snapshot):
  function test_flash_medusa_all_params (line 29) | async def test_flash_medusa_all_params(flash_medusa, response_snapshot):
  function test_flash_medusa_load (line 51) | async def test_flash_medusa_load(flash_medusa, generate_load, response_s...

FILE: integration-tests/models/test_flash_mistral.py
  function flash_mistral_handle (line 5) | def flash_mistral_handle(launcher):
  function flash_mistral (line 11) | async def flash_mistral(flash_mistral_handle):
  function test_flash_mistral (line 17) | async def test_flash_mistral(flash_mistral, response_snapshot):
  function test_flash_mistral_all_params (line 28) | async def test_flash_mistral_all_params(flash_mistral, response_snapshot):
  function test_flash_mistral_load (line 50) | async def test_flash_mistral_load(flash_mistral, generate_load, response...

FILE: integration-tests/models/test_flash_mixtral.py
  function flash_mixtral_handle (line 5) | def flash_mixtral_handle(launcher):
  function flash_mixtral (line 11) | async def flash_mixtral(flash_mixtral_handle):
  function test_flash_mixtral (line 18) | async def test_flash_mixtral(flash_mixtral, response_snapshot):
  function test_flash_mixtral_all_params (line 33) | async def test_flash_mixtral_all_params(flash_mixtral, response_snapshot):
  function test_flash_mixtral_load (line 60) | async def test_flash_mixtral_load(flash_mixtral, generate_load, response...

FILE: integration-tests/models/test_flash_mixtral_awq.py
  function flash_mixtral_awq_handle (line 5) | def flash_mixtral_awq_handle(launcher):
  function flash_mixtral_awq (line 11) | async def flash_mixtral_awq(flash_mixtral_awq_handle):
  function test_flash_mixtral_awq (line 17) | async def test_flash_mixtral_awq(flash_mixtral_awq, response_snapshot):
  function test_flash_mixtral_awq_all_params (line 30) | async def test_flash_mixtral_awq_all_params(flash_mixtral_awq, response_...
  function test_flash_mixtral_awq_load (line 56) | async def test_flash_mixtral_awq_load(

FILE: integration-tests/models/test_flash_mixtral_gptq.py
  function flash_mixtral_gptq_handle (line 5) | def flash_mixtral_gptq_handle(launcher):
  function flash_mixtral_gptq (line 15) | async def flash_mixtral_gptq(flash_mixtral_gptq_handle):
  function test_flash_mixtral_gptq (line 21) | async def test_flash_mixtral_gptq(flash_mixtral_gptq, response_snapshot):
  function test_flash_mixtral_gptq_all_params (line 35) | async def test_flash_mixtral_gptq_all_params(flash_mixtral_gptq, respons...
  function test_flash_mixtral_gptq_load (line 61) | async def test_flash_mixtral_gptq_load(

FILE: integration-tests/models/test_flash_neox.py
  function flash_neox_handle (line 5) | def flash_neox_handle(launcher):
  function flash_neox (line 11) | async def flash_neox(flash_neox_handle):
  function test_flash_neox (line 19) | async def test_flash_neox(flash_neox, response_snapshot):
  function test_flash_neox_load (line 33) | async def test_flash_neox_load(flash_neox, generate_load, response_snaps...

FILE: integration-tests/models/test_flash_neox_sharded.py
  function flash_neox_sharded_handle (line 5) | def flash_neox_sharded_handle(launcher):
  function flash_neox_sharded (line 11) | async def flash_neox_sharded(flash_neox_sharded_handle):
  function test_flash_neox (line 18) | async def test_flash_neox(flash_neox_sharded, response_snapshot):
  function test_flash_neox_load (line 31) | async def test_flash_neox_load(flash_neox_sharded, generate_load, respon...

FILE: integration-tests/models/test_flash_pali_gemma.py
  function flash_pali_gemma_handle (line 5) | def flash_pali_gemma_handle(launcher):
  function flash_pali_gemma (line 17) | async def flash_pali_gemma(flash_pali_gemma_handle):
  function test_flash_pali_gemma (line 25) | async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot, cow...
  function test_flash_pali_gemma_two_images (line 36) | async def test_flash_pali_gemma_two_images(

FILE: integration-tests/models/test_flash_pali_gemma2.py
  function flash_pali_gemma_handle (line 5) | def flash_pali_gemma_handle(launcher):
  function flash_pali_gemma (line 13) | async def flash_pali_gemma(flash_pali_gemma_handle):
  function test_flash_pali_gemma_image (line 18) | async def test_flash_pali_gemma_image(flash_pali_gemma, response_snapshot):

FILE: integration-tests/models/test_flash_phi.py
  function flash_phi_handle (line 5) | def flash_phi_handle(launcher):
  function flash_phi (line 11) | async def flash_phi(flash_phi_handle):
  function test_flash_phi (line 18) | async def test_flash_phi(flash_phi, response_snapshot):
  function test_flash_phi_all_params (line 30) | async def test_flash_phi_all_params(flash_phi, response_snapshot):
  function test_flash_phi_load (line 54) | async def test_flash_phi_load(flash_phi, generate_load, response_snapshot):

FILE: integration-tests/models/test_flash_phi35_moe.py
  function flash_phi35_moe_handle (line 5) | def flash_phi35_moe_handle(launcher):
  function flash_phi35_moe (line 14) | async def flash_phi35_moe(flash_phi35_moe_handle):
  function test_flash_phi35_moe (line 20) | async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot):
  function test_flash_phi35_moe_all_params (line 34) | async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snap...
  function test_flash_phi35_moe_load (line 60) | async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, resp...

FILE: integration-tests/models/test_flash_qwen2.py
  function flash_qwen2_handle (line 5) | def flash_qwen2_handle(launcher):
  function flash_qwen2 (line 11) | async def flash_qwen2(flash_qwen2_handle):
  function test_flash_qwen2 (line 18) | async def test_flash_qwen2(flash_qwen2, response_snapshot):
  function test_flash_qwen2_all_params (line 30) | async def test_flash_qwen2_all_params(flash_qwen2, response_snapshot):
  function test_flash_qwen2_load (line 53) | async def test_flash_qwen2_load(flash_qwen2, generate_load, response_sna...

FILE: integration-tests/models/test_flash_qwen2_5_vl.py
  function flash_qwen2_5_vl_handle (line 5) | def flash_qwen2_5_vl_handle(launcher):
  function flash_qwen2_5 (line 11) | async def flash_qwen2_5(flash_qwen2_5_vl_handle):
  function test_flash_qwen2_5_vl_simple (line 17) | async def test_flash_qwen2_5_vl_simple(flash_qwen2_5, response_snapshot):
  function test_flash_qwen2_5_vl_simple_streaming (line 45) | async def test_flash_qwen2_5_vl_simple_streaming(flash_qwen2_5, response...
  function test_flash_qwen2_5_vl_bay (line 82) | async def test_flash_qwen2_5_vl_bay(flash_qwen2_5, response_snapshot):
  function test_flash_qwen2_5_vl_inpaint (line 104) | async def test_flash_qwen2_5_vl_inpaint(flash_qwen2_5, response_snapshot):

FILE: integration-tests/models/test_flash_qwen2_vl.py
  function flash_qwen2_vl_handle (line 5) | def flash_qwen2_vl_handle(launcher):
  function flash_qwen2 (line 11) | async def flash_qwen2(flash_qwen2_vl_handle):
  function test_flash_qwen2_vl_simple (line 17) | async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
  function test_flash_qwen2_vl_simple_streaming (line 45) | async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_sna...
  function test_flash_qwen2_vl_bay (line 82) | async def test_flash_qwen2_vl_bay(flash_qwen2, response_snapshot):
  function test_flash_qwen2_vl_inpaint (line 104) | async def test_flash_qwen2_vl_inpaint(flash_qwen2, response_snapshot):

FILE: integration-tests/models/test_flash_santacoder.py
  function flash_santacoder_handle (line 5) | def flash_santacoder_handle(launcher):
  function flash_santacoder (line 11) | async def flash_santacoder(flash_santacoder_handle):
  function test_flash_santacoder (line 18) | async def test_flash_santacoder(flash_santacoder, response_snapshot):
  function test_flash_santacoder_load (line 29) | async def test_flash_santacoder_load(

FILE: integration-tests/models/test_flash_starcoder.py
  function flash_starcoder_handle (line 5) | def flash_starcoder_handle(launcher):
  function flash_starcoder (line 11) | async def flash_starcoder(flash_starcoder_handle):
  function test_flash_starcoder (line 19) | async def test_flash_starcoder(flash_starcoder, response_snapshot):
  function test_flash_starcoder_default_params (line 31) | async def test_flash_starcoder_default_params(flash_starcoder, response_...
  function test_flash_starcoder_load (line 48) | async def test_flash_starcoder_load(flash_starcoder, generate_load, resp...

FILE: integration-tests/models/test_flash_starcoder2.py
  function flash_starcoder2_handle (line 5) | def flash_starcoder2_handle(launcher):
  function flash_starcoder2 (line 11) | async def flash_starcoder2(flash_starcoder2_handle):
  function test_flash_starcoder2 (line 19) | async def test_flash_starcoder2(flash_starcoder2, response_snapshot):
  function test_flash_starcoder2_default_params (line 31) | async def test_flash_starcoder2_default_params(flash_starcoder2, respons...
  function test_flash_starcoder2_load (line 48) | async def test_flash_starcoder2_load(

FILE: integration-tests/models/test_flash_starcoder2_lora.py
  function flash_starcoder2_handle (line 6) | def flash_starcoder2_handle(launcher):
  function flash_starcoder2 (line 14) | async def flash_starcoder2(flash_starcoder2_handle):
  function test_flash_starcoder2 (line 20) | async def test_flash_starcoder2(flash_starcoder2, response_snapshot):
  function test_flash_starcoder2_default_params (line 30) | async def test_flash_starcoder2_default_params(flash_starcoder2, respons...
  function test_flash_starcoder2_load (line 45) | async def test_flash_starcoder2_load(
  function test_flash_starcoder2_with_hugcode_adapter (line 59) | async def test_flash_starcoder2_with_hugcode_adapter(

FILE: integration-tests/models/test_flash_starcoder_gptq.py
  function flash_starcoder_gptq_handle (line 5) | def flash_starcoder_gptq_handle(launcher):
  function flash_starcoder_gptq (line 11) | async def flash_starcoder_gptq(flash_starcoder_gptq_handle):
  function test_flash_starcoder_gptq (line 18) | async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_respo...
  function test_flash_starcoder_gptq_load (line 49) | async def test_flash_starcoder_gptq_load(

FILE: integration-tests/models/test_grammar_llama.py
  function non_flash_llama_grammar_handle (line 8) | def non_flash_llama_grammar_handle(launcher):
  function non_flash_llama_grammar (line 19) | async def non_flash_llama_grammar(non_flash_llama_grammar_handle):
  function test_non_flash_llama_grammar_json (line 27) | async def test_non_flash_llama_grammar_json(non_flash_llama_grammar, res...

FILE: integration-tests/models/test_grammar_response_format_llama.py
  function llama_grammar_handle (line 8) | def llama_grammar_handle(launcher):
  function llama_grammar (line 20) | async def llama_grammar(llama_grammar_handle):
  function test_grammar_response_format_llama_json (line 27) | async def test_grammar_response_format_llama_json(llama_grammar, respons...
  function test_grammar_response_format_llama_error_if_tools_not_installed (line 103) | async def test_grammar_response_format_llama_error_if_tools_not_installed(

FILE: integration-tests/models/test_idefics.py
  function idefics_handle (line 5) | def idefics_handle(launcher):
  function idefics (line 13) | async def idefics(idefics_handle):
  function test_idefics (line 19) | async def test_idefics(idefics, response_snapshot, chicken):
  function test_idefics_two_images (line 36) | async def test_idefics_two_images(idefics, response_snapshot, chicken, c...
  function test_idefics_load (line 49) | async def test_idefics_load(idefics, generate_load, response_snapshot, c...

FILE: integration-tests/models/test_idefics2.py
  function flash_idefics2_next_handle (line 5) | def flash_idefics2_next_handle(launcher):
  function flash_idefics2_next (line 13) | async def flash_idefics2_next(flash_idefics2_next_handle):
  function test_flash_idefics2_next_simple (line 20) | async def test_flash_idefics2_next_simple(
  function test_flash_idefics2_two_images (line 36) | async def test_flash_idefics2_two_images(
  function test_flash_idefics2_next_all_params (line 53) | async def test_flash_idefics2_next_all_params(flash_idefics2_next, respo...
  function test_flash_idefics2_next_load (line 76) | async def test_flash_idefics2_next_load(

FILE: integration-tests/models/test_idefics3.py
  function flash_idefics3_next_handle (line 5) | def flash_idefics3_next_handle(launcher):
  function flash_idefics3_next (line 11) | async def flash_idefics3_next(flash_idefics3_next_handle):
  function test_flash_idefics3_next_simple_url (line 18) | async def test_flash_idefics3_next_simple_url(flash_idefics3_next, respo...

FILE: integration-tests/models/test_json_schema_constrain.py
  function model_handle (line 7) | def model_handle(launcher):
  function model_fixture (line 18) | async def model_fixture(model_handle):
  function test_json_schema_basic (line 77) | async def test_json_schema_basic(model_fixture, response_snapshot):
  function test_json_schema_complex (line 115) | async def test_json_schema_complex(model_fixture, response_snapshot):
  function test_json_schema_stream (line 159) | async def test_json_schema_stream(model_fixture, response_snapshot):

FILE: integration-tests/models/test_llava_next.py
  function flash_llava_next_handle (line 5) | def flash_llava_next_handle(launcher):
  function flash_llava_next (line 16) | async def flash_llava_next(flash_llava_next_handle):
  function test_flash_llava_next_simple (line 24) | async def test_flash_llava_next_simple(flash_llava_next, response_snapsh...
  function test_flash_llava_next_all_params (line 39) | async def test_flash_llava_next_all_params(flash_llava_next, response_sn...
  function test_flash_llava_next_load (line 63) | async def test_flash_llava_next_load(

FILE: integration-tests/models/test_lora_mistral.py
  function lora_mistral_handle (line 6) | def lora_mistral_handle(launcher):
  function lora_mistral (line 19) | async def lora_mistral(lora_mistral_handle):
  function test_lora_mistral (line 26) | async def test_lora_mistral(lora_mistral, response_snapshot):
  function test_lora_mistral_without_adapter (line 38) | async def test_lora_mistral_without_adapter(lora_mistral, response_snaps...
  function test_lora_mistral_with_dbpedia_adapter (line 62) | async def test_lora_mistral_with_dbpedia_adapter(lora_mistral, response_...
  function test_lora_mistral_with_customer_support_adapter (line 84) | async def test_lora_mistral_with_customer_support_adapter(
  function test_lora_mistral_without_customer_support_adapter (line 113) | async def test_lora_mistral_without_customer_support_adapter(

FILE: integration-tests/models/test_mamba.py
  function fused_kernel_mamba_handle (line 5) | def fused_kernel_mamba_handle(launcher):
  function fused_kernel_mamba (line 11) | async def fused_kernel_mamba(fused_kernel_mamba_handle):
  function test_mamba (line 18) | async def test_mamba(fused_kernel_mamba, response_snapshot):
  function test_mamba_all_params (line 30) | async def test_mamba_all_params(fused_kernel_mamba, response_snapshot):
  function test_mamba_load (line 57) | async def test_mamba_load(

FILE: integration-tests/models/test_mllama.py
  function mllama_handle (line 6) | def mllama_handle(launcher):
  function mllama (line 15) | async def mllama(mllama_handle):
  function test_mllama_simpl (line 21) | async def test_mllama_simpl(mllama, response_snapshot):
  function test_mllama_load (line 58) | async def test_mllama_load(mllama, generate_load, response_snapshot):

FILE: integration-tests/models/test_mpt.py
  function mpt_sharded_handle (line 5) | def mpt_sharded_handle(launcher):
  function mpt_sharded (line 11) | async def mpt_sharded(mpt_sharded_handle):
  function test_mpt (line 18) | async def test_mpt(mpt_sharded, response_snapshot):
  function test_mpt_load (line 35) | async def test_mpt_load(mpt_sharded, generate_load, response_snapshot):

FILE: integration-tests/models/test_mt0_base.py
  function mt0_base_handle (line 5) | def mt0_base_handle(launcher):
  function mt0_base (line 11) | async def mt0_base(mt0_base_handle):
  function test_mt0_base (line 18) | async def test_mt0_base(mt0_base, response_snapshot):
  function test_mt0_base_all_params (line 33) | async def test_mt0_base_all_params(mt0_base, response_snapshot):
  function test_mt0_base_load (line 56) | async def test_mt0_base_load(mt0_base, generate_load, response_snapshot):

FILE: integration-tests/models/test_neox.py
  function neox_handle (line 5) | def neox_handle(launcher):
  function neox (line 13) | async def neox(neox_handle):
  function test_neox (line 21) | async def test_neox(neox, response_snapshot):
  function test_neox_load (line 35) | async def test_neox_load(neox, generate_load, response_snapshot):

FILE: integration-tests/models/test_neox_sharded.py
  function neox_sharded_handle (line 5) | def neox_sharded_handle(launcher):
  function neox_sharded (line 13) | async def neox_sharded(neox_sharded_handle):
  function test_neox (line 21) | async def test_neox(neox_sharded, response_snapshot):
  function test_neox_load (line 35) | async def test_neox_load(neox_sharded, generate_load, response_snapshot):

FILE: integration-tests/models/test_opt.py
  function opt_sharded_handle (line 5) | def opt_sharded_handle(launcher):
  function opt_sharded (line 11) | async def opt_sharded(opt_sharded_handle):
  function test_opt (line 18) | async def test_opt(opt_sharded):

FILE: integration-tests/models/test_smolvlm.py
  function flash_smolvlm_next_handle (line 5) | def flash_smolvlm_next_handle(launcher):
  function flash_smolvlm_next (line 11) | async def flash_smolvlm_next(flash_smolvlm_next_handle):
  function test_flash_smolvlm_next_simple_url (line 18) | async def test_flash_smolvlm_next_simple_url(flash_smolvlm_next, respons...

FILE: integration-tests/models/test_t5_sharded.py
  function t5_sharded_handle (line 5) | def t5_sharded_handle(launcher):
  function t5_sharded (line 11) | async def t5_sharded(t5_sharded_handle):
  function test_t5_sharded (line 18) | async def test_t5_sharded(t5_sharded, response_snapshot):
  function test_t5_sharded_load (line 30) | async def test_t5_sharded_load(t5_sharded, generate_load, response_snaps...

FILE: integration-tests/models/test_tools_llama.py
  function flash_llama_grammar_tools_handle (line 11) | def flash_llama_grammar_tools_handle(launcher):
  function flash_llama_grammar_tools (line 21) | async def flash_llama_grammar_tools(flash_llama_grammar_tools_handle):
  function test_flash_llama_grammar_tools_nostream (line 83) | async def test_flash_llama_grammar_tools_nostream(
  function test_flash_llama_grammar_tools_openai (line 120) | async def test_flash_llama_grammar_tools_openai(
  function test_flash_llama_grammar_tools_auto_nostream (line 159) | async def test_flash_llama_grammar_tools_auto_nostream(
  function test_flash_llama_grammar_tools_choice_nostream (line 198) | async def test_flash_llama_grammar_tools_choice_nostream(
  function test_flash_llama_grammar_tools_choice_stream (line 237) | async def test_flash_llama_grammar_tools_choice_stream(
  function test_flash_llama_grammar_tools_insufficient_information_nostream (line 277) | async def test_flash_llama_grammar_tools_insufficient_information_nostream(
  function test_flash_llama_grammar_tools_insufficient_information_stream (line 311) | async def test_flash_llama_grammar_tools_insufficient_information_stream(
  function test_flash_llama_grammar_tools_sea_creatures_stream_auto (line 350) | async def test_flash_llama_grammar_tools_sea_creatures_stream_auto(
  function test_flash_llama_grammar_tools_sea_creatures_stream_required (line 388) | async def test_flash_llama_grammar_tools_sea_creatures_stream_required(
  function test_flash_llama_grammar_tools_sea_creatures_stream_none (line 429) | async def test_flash_llama_grammar_tools_sea_creatures_stream_none(
  function test_flash_llama_grammar_tools_sea_creatures_stream_function_object (line 467) | async def test_flash_llama_grammar_tools_sea_creatures_stream_function_o...
  function test_flash_llama_tool_reply_response (line 510) | async def test_flash_llama_tool_reply_response(

FILE: integration-tests/models/test_transformers_olmo.py
  function flash_llama_handle (line 5) | def flash_llama_handle(launcher):
  function flash_llama (line 11) | async def flash_llama(flash_llama_handle):
  function test_flash_llama_simple (line 18) | async def test_flash_llama_simple(flash_llama, response_snapshot):
  function test_flash_llama_load (line 30) | async def test_flash_llama_load(flash_llama, generate_load, response_sna...

FILE: integration-tests/neuron/test_generate.py
  function tgi_service (line 5) | async def tgi_service(neuron_launcher, neuron_model_config):
  function test_model_single_request (line 14) | async def test_model_single_request(tgi_service):
  function test_model_multiple_requests (line 67) | async def test_model_multiple_requests(tgi_service, neuron_generate_load):

FILE: integration-tests/neuron/test_implicit_env.py
  function tgi_service (line 7) | async def tgi_service(request, neuron_launcher, neuron_model_config):
  function test_model_single_request (line 39) | async def test_model_single_request(tgi_service):

FILE: launcher/build.rs
  function main (line 4) | fn main() -> Result<(), Box<dyn Error>> {

FILE: launcher/src/env_runtime.rs
  type Env (line 4) | pub(crate) struct Env {
    method new (line 15) | pub fn new() -> Self {
    method fmt (line 33) | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  function nvidia_smi (line 48) | fn nvidia_smi() -> Option<String> {
  function xpu_smi (line 55) | fn xpu_smi() -> Option<String> {
  function hl_smi (line 62) | fn hl_smi() -> Option<String> {

FILE: launcher/src/gpu.rs
  function get_cuda_capability (line 1) | pub fn get_cuda_capability() -> Option<(usize, usize)> {

FILE: launcher/src/main.rs
  function compute_optimal (line 28) | fn compute_optimal(config: Option<&Config>, compute: Option<&ComputeType...
  function human_size (line 48) | fn human_size(size: usize, suffix: &str) -> String {
  function vram_maximum (line 62) | fn vram_maximum(
  function get_config (line 91) | fn get_config(
  function resolve_attention (line 131) | fn resolve_attention(config: &Option<Config>, lora_adapters: &Option<Str...
  type RawConfig (line 203) | struct RawConfig {
  type QuantizationConfig (line 229) | struct QuantizationConfig {
  type VisionConfig (line 234) | struct VisionConfig {}
  type TextConfig (line 237) | struct TextConfig {
  type Config (line 242) | struct Config {
    method get_head_dim (line 262) | fn get_head_dim(&self) -> Option<usize> {
    method flop (line 281) | fn flop(&self) -> Option<u64> {
    method kv_vram_per_tok (line 310) | fn kv_vram_per_tok(&self) -> Option<usize> {
    method mlp_vram_per_tok (line 320) | fn mlp_vram_per_tok(&self) -> Option<usize> {
    method token_vram (line 330) | fn token_vram(&self) -> Option<usize> {
    method model_vram (line 337) | fn model_vram(&self) -> Option<usize> {
    method from (line 355) | fn from(other: RawConfig) -> Self {
  type Quantization (line 407) | enum Quantization {
    method fmt (line 450) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type Dtype (line 490) | enum Dtype {
    method fmt (line 497) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type KVCacheDtype (line 511) | enum KVCacheDtype {
    method fmt (line 520) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type RopeScaling (line 533) | enum RopeScaling {
    method fmt (line 539) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type UsageStatsLevel (line 553) | pub enum UsageStatsLevel {
    method fmt (line 563) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type Args (line 582) | struct Args {
  type ShardStatus (line 917) | enum ShardStatus {
  function shard_manager (line 923) | fn shard_manager(
  function shutdown_shards (line 1251) | fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::...
  function num_cuda_devices (line 1262) | fn num_cuda_devices() -> Option<usize> {
  type PythonLogLevelEnum (line 1296) | enum PythonLogLevelEnum {
  type PythonLogLevel (line 1307) | struct PythonLogLevel {
  type PythonLogRecord (line 1312) | struct PythonLogRecord {
  type PythonLogMessage (line 1317) | struct PythonLogMessage {
    method trace (line 1323) | fn trace(&self) {
    type Error (line 1337) | type Error = serde_json::Error;
    method try_from (line 1339) | fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
  function log_lines (line 1344) | fn log_lines<R: Sized + Read>(mut bufread: BufReader<R>) {
  function find_num_shards (line 1374) | fn find_num_shards(
  type LauncherError (line 1415) | enum LauncherError {
  function download_convert_model (line 1434) | fn download_convert_model(
  function spawn_shards (line 1579) | fn spawn_shards(
  type Gpu (line 1692) | enum Gpu {
    method from (line 1713) | fn from(value: &str) -> Self {
    method fmt (line 1736) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  type ComputeType (line 1707) | struct ComputeType {
    method f16_flop (line 1754) | fn f16_flop(&self) -> Option<u64> {
    method vram (line 1787) | fn vram(&self, memory_fraction: f32) -> Option<usize> {
  method from (line 1814) | fn from(value: ComputeType) -> Self {
  function compute_type (line 1819) | fn compute_type(count: usize) -> Option<ComputeType> {
  function spawn_webserver (line 1831) | fn spawn_webserver(
  function terminate (line 2015) | fn terminate(process_name: &str, mut process: Child, timeout: Duration) ...
  function main (line 2038) | fn main() -> Result<(), LauncherError> {

FILE: load_tests/benchmarks.py
  class InferenceEngineRunner (line 15) | class InferenceEngineRunner:
    method __init__ (line 16) | def __init__(self, model: str):
    method run (line 19) | def run(self, parameters: list[tuple], gpus: int = 0):
    method stop (line 22) | def stop(self):
  class TGIDockerRunner (line 26) | class TGIDockerRunner(InferenceEngineRunner):
    method __init__ (line 27) | def __init__(
    method run (line 40) | def run(self, parameters: list[tuple], gpus: int = 0):
    method stop (line 58) | def stop(self):
  class BenchmarkRunner (line 63) | class BenchmarkRunner:
    method __init__ (line 64) | def __init__(
    method run (line 75) | def run(self, parameters: list[tuple], network_mode):
    method stop (line 98) | def stop(self):
  function run_docker (line 103) | def run_docker(
  function get_gpu_names (line 150) | def get_gpu_names() -> str:
  function get_gpu_name (line 157) | def get_gpu_name() -> str:
  function get_num_gpus (line 164) | def get_num_gpus() -> int:
  function build_df (line 168) | def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
  function main (line 190) | def main(sha, results_file):

FILE: load_tests/common.js
  function get_options (line 18) | function get_options() {
  function generate_payload (line 60) | function generate_payload(gpt, max_new_tokens) {
  function run (line 67) | function run() {

FILE: load_tests/filter.py
  function main (line 4) | def main():

FILE: load_tests/long.js
  function get_options (line 18) | function get_options() {
  function generate_payload (line 60) | function generate_payload(gpt, max_new_tokens) {
  function run (line 67) | function run() {

FILE: load_tests/orca.py
  function main (line 6) | def main():

FILE: router/build.rs
  function main (line 4) | fn main() -> Result<(), Box<dyn Error>> {

FILE: router/src/chat.rs
  type ToolCall (line 10) | struct ToolCall {
  type Call (line 17) | struct Call {
  type ChatEvent (line 22) | pub(crate) enum ChatEvent {
  type ChatChoice (line 28) | pub(crate) enum ChatChoice {
  function parse_output (line 33) | pub(crate) fn parse_output(generated_text: &str) -> Result<ChatChoice, I...
  function create_event_from_stream_token (line 67) | fn create_event_from_stream_token(
  type StreamState (line 141) | enum StreamState {
  type ChatState (line 150) | pub struct ChatState {
    method new (line 161) | pub fn new(
    method push (line 186) | pub fn push(&mut self, mut stream_token: StreamResponse) -> ChatEvent {
  function get_tool_call_content (line 309) | fn get_tool_call_content(event: &CompletionType) -> (Option<&String>, &S...
  function test_chat_stream (line 338) | fn test_chat_stream() {
  function test_chat_stream_usage (line 388) | fn test_chat_stream_usage() {
  function test_chat_stream_tool_no_tool_simple (line 457) | fn test_chat_stream_tool_no_tool_simple() {
  function test_chat_stream_tool_no_tool_empty (line 530) | fn test_chat_stream_tool_no_tool_empty() {
  function test_chat_stream_tool_get_weather (line 596) | fn test_chat_stream_tool_get_weather() {

FILE: router/src/config.rs
  type LlavaNext (line 7) | pub struct LlavaNext {
    method get_number_of_features (line 89) | pub fn get_number_of_features(&self, height: usize, width: usize) -> u...
  function get_anyres_image_grid_shape (line 13) | fn get_anyres_image_grid_shape(
  function select_best_resolution (line 26) | fn select_best_resolution(
  function get_unpadded_features (line 61) | fn get_unpadded_features(
  type Llama4VisionConfig (line 109) | pub struct Llama4VisionConfig {
  type Llama4 (line 117) | pub struct Llama4 {
    method image_size (line 221) | pub fn image_size(&self) -> usize {
    method patch_size (line 225) | pub fn patch_size(&self) -> usize {
    method pixel_shuffle_ratio (line 229) | pub fn pixel_shuffle_ratio(&self) -> f64 {
    method get_aspect_ratios (line 232) | pub fn get_aspect_ratios(
  function gcd (line 122) | fn gcd(a: usize, b: usize) -> usize {
  function get_factors (line 130) | fn get_factors(dividend: usize) -> HashSet<usize> {
  function find_supported_resolutions (line 143) | fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> V...
  function get_best_fit (line 173) | fn get_best_fit(
  type ClipVisionModel (line 247) | pub struct ClipVisionModel {
  type Idefics3 (line 254) | pub struct Idefics3 {}
    method get_max_longest_edge (line 257) | pub fn get_max_longest_edge(&self) -> usize {
    method get_number_of_features (line 261) | pub fn get_number_of_features(&self) -> usize {
    method get_max_longest_edge_for_image_resize (line 265) | pub fn get_max_longest_edge_for_image_resize(&self) -> usize {
    method get_max_image_size (line 269) | pub fn get_max_image_size(&self) -> usize {
  type Idefics2 (line 276) | pub struct Idefics2 {}
    method get_number_of_features (line 279) | pub fn get_number_of_features(&self, _height: usize, _width: usize) ->...
  type PaliTextConfig (line 286) | pub struct PaliTextConfig {
  type Paligemma (line 292) | pub struct Paligemma {
    method get_number_of_features (line 297) | pub fn get_number_of_features(&self, _height: usize, _width: usize) ->...
  type Qwen2VlVisionConfig (line 304) | pub struct Qwen2VlVisionConfig {
  type Qwen2Vl (line 319) | pub struct Qwen2Vl {
    method get_number_of_features (line 324) | pub fn get_number_of_features(&self, height: usize, width: usize) -> u...
  type Qwen2_5VlVisionConfig (line 332) | pub struct Qwen2_5VlVisionConfig {
  type Qwen2_5Vl (line 351) | pub struct Qwen2_5Vl {
    method get_number_of_features (line 356) | pub fn get_number_of_features(&self, height: usize, width: usize) -> u...
  type Gemma3VisionConfig (line 364) | pub struct Gemma3VisionConfig {
  type Gemma3 (line 371) | pub struct Gemma3 {
  type Config (line 378) | pub enum Config {
  type TextConfig (line 426) | pub struct TextConfig {}
  type VisionConfig (line 430) | pub struct VisionConfig {
  function test_llava_next_features (line 440) | fn test_llava_next_features() {

FILE: router/src/infer/chat_template.rs
  function raise_exception (line 10) | pub(crate) fn raise_exception(err_text: String) -> Result<String, miniji...
  function strftime_now (line 15) | pub(crate) fn strftime_now(format_str: String) -> Result<String, minijin...
  type ChatTemplate (line 20) | pub(crate) struct ChatTemplate {
    method new (line 28) | pub(crate) fn new(
    method apply (line 79) | pub(crate) fn apply(
  function test_chat_template (line 152) | fn test_chat_template() {
  function test_chat_template_with_tool_response (line 218) | fn test_chat_template_with_tool_response() {
  function test_chat_template_loop_controls (line 394) | fn test_chat_template_loop_controls() {
  function test_chat_template_invalid_with_raise (line 464) | fn test_chat_template_invalid_with_raise() {
  function test_chat_template_valid_with_raise (line 541) | fn test_chat_template_valid_with_raise() {
  function test_chat_template_valid_with_strftime_now (line 604) | fn test_chat_template_valid_with_strftime_now() {
  function test_chat_template_valid_with_add_generation_prompt (line 676) | fn test_chat_template_valid_with_add_generation_prompt() {
  type ChatTemplateTestItem (line 731) | struct ChatTemplateTestItem {
  function test_many_chat_templates (line 739) | fn test_many_chat_templates() {
  function test_chat_template_with_default_tool_template (line 1171) | fn test_chat_template_with_default_tool_template() {
  function test_chat_template_with_custom_tool_template (line 1216) | fn test_chat_template_with_custom_tool_template() {
  function test_chat_template_with_special_system_prompt (line 1254) | fn test_chat_template_with_special_system_prompt() {

FILE: router/src/infer/mod.rs
  type Backend (line 29) | pub trait Backend {
    method schedule (line 30) | fn schedule(
    method health (line 35) | async fn health(&self, current_health: bool) -> bool;
    method start_health (line 40) | fn start_health(&self) -> bool {
    method name (line 44) | fn name(&self) -> &'static str;
  type Infer (line 49) | pub struct Infer {
    method new (line 64) | pub(crate) fn new(
    method generate_stream (line 100) | pub(crate) async fn generate_stream<'a>(
    method tokenize (line 204) | pub(crate) async fn tokenize(
    method apply_chat_template (line 227) | pub(crate) fn apply_chat_template(
    method generate (line 245) | pub(crate) async fn generate(
    method generate_best_of (line 321) | pub(crate) async fn generate_best_of(
    method health (line 357) | pub(crate) async fn health(&self) -> bool {
  type GeneratedText (line 368) | pub struct GeneratedText {
  type InferStreamResponse (line 376) | pub enum InferStreamResponse {
  type InferResponse (line 395) | pub(crate) struct InferResponse {
  type InferError (line 409) | pub enum InferError {
    method error_type (line 431) | pub(crate) fn error_type(&self) -> &str {
    method into_openai_event (line 445) | pub(crate) fn into_openai_event(self) -> Event {
  type APIError (line 458) | pub struct APIError {
  type OpenaiErrorEvent (line 464) | pub struct OpenaiErrorEvent {

FILE: router/src/infer/tool_grammar.rs
  type ToolGrammar (line 8) | pub(crate) struct ToolGrammar {}
    method find_tool_by_name (line 12) | fn find_tool_by_name(tools: &[Tool], name: &str) -> Result<Tool, Infer...
    method apply (line 20) | pub fn apply(

FILE: router/src/kserve.rs
  type OutputChunk (line 15) | pub struct OutputChunk {
  type InferenceOutput (line 23) | pub struct InferenceOutput {
  type InferenceRequest (line 29) | pub(crate) struct InferenceRequest {
  type Input (line 38) | pub(crate) struct Input {
  type Output (line 46) | pub(crate) struct Output {
  type LiveResponse (line 51) | pub struct LiveResponse {
  type ReadyResponse (line 56) | pub struct ReadyResponse {
  type MetadataServerResponse (line 61) | pub struct MetadataServerResponse {
  function kserve_health_live (line 77) | pub async fn kserve_health_live() -> Json<LiveResponse> {
  function kserve_health_ready (line 92) | pub async fn kserve_health_ready() -> Json<ReadyResponse> {
  function kerve_server_metadata (line 107) | pub async fn kerve_server_metadata() -> Json<MetadataServerResponse> {
  function kserve_model_metadata (line 130) | pub async fn kserve_model_metadata(
  function kserve_model_metadata_ready (line 151) | pub async fn kserve_model_metadata_ready(
  function kserve_model_infer (line 169) | pub async fn kserve_model_infer(

FILE: router/src/lib.rs
  type Tokenizer (line 29) | pub enum Tokenizer {
  type PyTokenizer (line 38) | pub struct PyTokenizer<'a>(pyo3::Bound<'a, pyo3::PyAny>);
  function from_py (line 41) | fn from_py(
  type TokenizerTrait (line 66) | trait TokenizerTrait {
    method encode_trait (line 67) | fn encode_trait(
    method encode_trait (line 75) | fn encode_trait(
    method encode_trait (line 85) | fn encode_trait(
  type HubModelInfo (line 114) | pub struct HubModelInfo {
  type ChatTemplate (line 122) | pub struct ChatTemplate {
  type ChatTemplateVersions (line 129) | pub enum ChatTemplateVersions {
  type HubTokenizerConfig (line 137) | pub struct HubTokenizerConfig {
    method from_file (line 148) | pub fn from_file<P: AsRef<Path>>(filename: P) -> Option<Self> {
  type ChatTemplateStandalone (line 156) | pub struct ChatTemplateStandalone {
  type TokenizerConfigToken (line 162) | pub enum TokenizerConfigToken {
    method as_str (line 168) | pub fn as_str(&self) -> &str {
  type HubPreprocessorConfig (line 178) | pub enum HubPreprocessorConfig {
    method from_file (line 186) | pub fn from_file<P: AsRef<std::path::Path>>(filename: P) -> Option<Sel...
  type Idefics2Preprocessor (line 193) | pub struct Idefics2Preprocessor {
  type Gemma3Processor (line 199) | pub struct Gemma3Processor {
  type Llama4Processor (line 205) | pub struct Llama4Processor {
  type HubProcessorConfig (line 211) | pub struct HubProcessorConfig {
    method from_file (line 218) | pub fn from_file<P: AsRef<Path>>(filename: P) -> Option<Self> {
  type JsonSchemaConfig (line 227) | struct JsonSchemaConfig {
  type GrammarType (line 239) | pub(crate) enum GrammarType {
  type Info (line 261) | pub struct Info {
  type GenerateParameters (line 303) | pub(crate) struct GenerateParameters {
  function default_parameters (line 436) | fn default_parameters() -> GenerateParameters {
  type Prompt (line 462) | pub struct Prompt(pub Vec<String>);
    type Error (line 472) | type Error = String;
    method try_from (line 474) | fn try_from(value: PromptDeserializer) -> Result<Self, Self::Error> {
  type PromptDeserializer (line 466) | enum PromptDeserializer {
  type CompletionRequest (line 492) | pub struct CompletionRequest {
  type Completion (line 547) | enum Completion {
  type CompletionFinal (line 555) | pub(crate) struct CompletionFinal {
  type CompletionComplete (line 567) | pub(crate) struct CompletionComplete {
  type Chunk (line 575) | pub(crate) struct Chunk {
  type ChatCompletion (line 585) | pub(crate) struct ChatCompletion {
    method new (line 699) | pub(crate) fn new(
  type ChatCompletionComplete (line 598) | pub(crate) struct ChatCompletionComplete {
  type ChatCompletionLogprobs (line 607) | pub(crate) struct ChatCompletionLogprobs {
    method from (line 612) | fn from(value: (Token, Vec<Token>)) -> Self {
    method from (line 632) | fn from(value: (Vec<Token>, Vec<Vec<Token>>)) -> Self {
  type ChatCompletionLogprob (line 666) | pub(crate) struct ChatCompletionLogprob {
  type ChatCompletionTopLogprob (line 674) | pub(crate) struct ChatCompletionTopLogprob {
  type Usage (line 681) | pub(crate) struct Usage {
  type CompletionType (line 690) | enum CompletionType {
  type ChatCompletionChunk (line 758) | pub(crate) struct ChatCompletionChunk {
    method new (line 809) | pub(crate) fn new(
  type ChatCompletionChoice (line 771) | pub(crate) struct ChatCompletionChoice {
  type ToolCallDelta (line 779) | pub struct ToolCallDelta {
  type ChatCompletionDelta (line 788) | enum ChatCompletionDelta {
  type DeltaToolCall (line 794) | pub(crate) struct DeltaToolCall {
  type Function (line 802) | pub(crate) struct Function {
  type ChatRequest (line 829) | pub(crate) struct ChatRequest {
    method try_into_generate (line 941) | fn try_into_generate(self, infer: &Infer) -> Result<(GenerateRequest, ...
    method next_int_id (line 1038) | fn next_int_id(&self) -> Result<String, Box<dyn std::error::Error>> {
    method next_tool_call_id (line 1054) | fn next_tool_call_id(&self) -> String {
  type StreamOptions (line 1064) | struct StreamOptions {
  function default_tool_prompt (line 1071) | pub fn default_tool_prompt() -> String {
  type TypedChoice (line 1077) | pub enum TypedChoice {
  type FunctionName (line 1083) | pub struct FunctionName {
  type ToolChoice (line 1091) | pub enum ToolChoice {
    method from (line 1127) | fn from(value: ToolTypeDeserializer) -> Self {
  type ToolTypeDeserializer (line 1113) | enum ToolTypeDeserializer {
  type JsonSchemaTool (line 1144) | pub struct JsonSchemaTool {
  type FunctionsMap (line 1151) | struct FunctionsMap {
  type FunctionRef (line 1157) | struct FunctionRef {
  type Properties (line 1163) | struct Properties {
  function serialize_function (line 1168) | fn serialize_function<S>(functions: &Vec<FunctionRef>, serializer: S) ->...
  type FunctionDefinition (line 1179) | pub struct FunctionDefinition {
  function serialize_as_string (line 1187) | fn serialize_as_string<S>(value: &serde_json::Value, serializer: S) -> R...
  type Tool (line 1196) | pub(crate) struct Tool {
  type ChatTemplateInputs (line 1205) | pub(crate) struct ChatTemplateInputs<'a> {
  type ToolCall (line 1214) | pub struct ToolCall {
  type Url (line 1221) | pub struct Url {
  type MessageChunk (line 1228) | pub enum MessageChunk {
  type Message (line 1234) | pub struct Message {
  type MessageBody (line 1247) | pub enum MessageBody {
  type MessageContent (line 1262) | pub enum MessageContent {
    method push (line 1269) | pub fn push(&mut self, chunk: MessageChunk) {
  type TextMessage (line 1285) | pub struct TextMessage {
    method from (line 1295) | fn from(value: Message) -> Self {
  type ToolCallMessage (line 1322) | pub struct ToolCallMessage {
  type OutputMessage (line 1330) | pub(crate) enum OutputMessage {
  type GenerateRequest (line 1337) | pub(crate) struct GenerateRequest {
    method from (line 1366) | fn from(req: CompatGenerateRequest) -> Self {
  function default_true (line 1350) | fn default_true() -> bool {
  type CompatGenerateRequest (line 1355) | pub(crate) struct CompatGenerateRequest {
  type PrefillToken (line 1376) | pub struct PrefillToken {
  type Token (line 1386) | pub struct Token {
  type SimpleToken (line 1398) | pub struct SimpleToken {
  type FinishReason (line 1412) | pub enum FinishReason {
    method fmt (line 1423) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    method format (line 1433) | pub fn format(&self, use_stop: bool) -> String {
  type BestOfSequence (line 1442) | pub(crate) struct BestOfSequence {
  type Details (line 1458) | pub(crate) struct Details {
  type GenerateResponse (line 1474) | pub(crate) struct GenerateResponse {
  type ChatTokenizeResponse (line 1482) | pub(crate) struct ChatTokenizeResponse {
  type TokenizeResponse (line 1489) | pub(crate) struct TokenizeResponse(Vec<SimpleToken>);
  type StreamDetails (line 1492) | pub(crate) struct StreamDetails {
  type StreamResponse (line 1504) | pub(crate) struct StreamResponse {
  type ErrorResponse (line 1516) | pub(crate) struct ErrorResponse {
  type ModelInfo (line 1522) | pub(crate) struct ModelInfo {
  type ModelsInfo (line 1534) | pub(crate) struct ModelsInfo {
  method default (line 1541) | fn default() -> Self {
  function get_tokenizer (line 1554) | pub(crate) fn get_tokenizer() -> Tokenizer {
  function test_hub_nested_tokens_tokenizer_config (line 1562) | fn test_hub_nested_tokens_tokenizer_config() {
  function test_chat_simple_string (line 1635) | fn test_chat_simple_string() {
  function test_message_content_append (line 1658) | fn test_message_content_append() {
  function test_chat_request (line 1687) | fn test_chat_request() {
  function text_message_convert (line 1717) | fn text_message_convert() {
  function test_chat_stream_options (line 1733) | fn test_chat_stream_options() {
  function openai_output (line 1769) | fn openai_output() {
  function tool_choice_formats (line 1803) | fn tool_choice_formats() {

FILE: router/src/logging.rs
  type TraceParent (line 14) | struct TraceParent {
  function parse_traceparent (line 22) | fn parse_traceparent(header_value: &str) -> Option<TraceParent> {
  function trace_context_middleware (line 45) | pub async fn trace_context_middleware(mut request: Request, next: Next) ...
  function init_logging (line 72) | pub fn init_logging(otlp_endpoint: Option<String>, otlp_service_name: St...

FILE: router/src/sagemaker.rs
  type SagemakerRequest (line 17) | pub(crate) enum SagemakerRequest {
  type SagemakerResponse (line 27) | pub(crate) enum SagemakerResponse {
  type SagemakerStreamResponse (line 37) | pub(crate) enum SagemakerStreamResponse {
  function sagemaker_compatibility (line 66) | pub(crate) async fn sagemaker_compatibility(

FILE: router/src/server.rs
  function encoding_to_tokens (line 71) | fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> V...
  function compat_generate (line 126) | pub(crate) async fn compat_generate(
  function get_model_info (line 161) | async fn get_model_info(info: Extension<Info>) -> Json<Info> {
  function openai_get_model_info (line 176) | async fn openai_get_model_info(info: Extension<Info>) -> Json<ModelsInfo> {
  function get_chat_tokenize (line 199) | async fn get_chat_tokenize(
  function health (line 230) | async fn health(infer: Extension<Infer>) -> Result<(), (StatusCode, Json...
  function generate (line 273) | async fn generate(
  function generate_internal (line 289) | pub(crate) async fn generate_internal(
  function generate_stream (line 476) | async fn generate_stream(
  function generate_stream_internal (line 508) | async fn generate_stream_internal(
  function completions (line 715) | pub(crate) async fn completions(
  function chat_completions (line 1168) | pub(crate) async fn chat_completions(
  function tokenize (line 1317) | async fn tokenize(
  function metrics (line 1334) | async fn metrics(prom_handle: Extension<PrometheusHandle>) -> String {
  type ComputeType (line 1339) | pub(crate) struct ComputeType(String);
  type ApiDoc (line 1429) | pub struct ApiDoc;
  function schema (line 1431) | pub fn schema() -> ApiDoc {
  function py_resolve_tokenizer (line 1435) | pub fn py_resolve_tokenizer(
  function legacy_tokenizer_handle (line 1461) | pub fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Opt...
  function run (line 1502) | pub async fn run(
  function start (line 1870) | async fn start(
  function get_hub_model_info (line 2338) | pub async fn get_hub_model_info(api: &ApiRepo) -> Option<HubModelInfo> {
  function get_tokenizer_config (line 2357) | pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubToken...
  function shutdown_signal (line 2376) | async fn shutdown_signal() {
  function from (line 2405) | fn from(err: InferError) -> Self {
  method from (line 2429) | fn from(err: InferError) -> Self {
  type WebServerError (line 2440) | pub enum WebServerError {

FILE: router/src/usage_stats.rs
  constant TELEMETRY_URL (line 15) | const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";
  type UsageStatsLevel (line 18) | pub enum UsageStatsLevel {
  type UserAgent (line 25) | pub struct UserAgent {
    method new (line 32) | pub fn new(reduced_args: Args) -> Self {
  type EventType (line 42) | pub enum EventType {
  type UsageStatsEvent (line 50) | pub struct UsageStatsEvent {
    method new (line 58) | pub fn new(user_agent: UserAgent, event_type: EventType, error_reason:...
    method send (line 65) | pub async fn send(&self) {
  type Args (line 81) | pub struct Args {
    method new (line 106) | pub fn new(
  type Env (line 155) | pub struct Env {
    method new (line 388) | pub fn new() -> Self {
    method is_hpu_device (line 398) | pub fn is_hpu_device(&self) -> bool {
  type NvidiaSmiInfo (line 165) | struct NvidiaSmiInfo {
    method new (line 188) | fn new() -> Option<Vec<NvidiaSmiInfo>> {
  type XpuSmiInfo (line 239) | struct XpuSmiInfo {
    method new (line 249) | fn new() -> Option<Vec<XpuSmiInfo>> {
  type HpuSmiInfo (line 294) | struct HpuSmiInfo {
    method new (line 307) | fn new() -> Option<Vec<HpuSmiInfo>> {
  type SystemInfo (line 348) | pub struct SystemInfo {
    method new (line 357) | fn new() -> Self {
  method default (line 382) | fn default() -> Self {
  function is_container (line 403) | pub fn is_container() -> io::Result<bool> {

FILE: router/src/validation.rs
  type Validation (line 29) | pub struct Validation {
    method new (line 43) | pub(crate) fn new(
    method tokenize (line 105) | pub async fn tokenize(
    method validate_input (line 133) | async fn validate_input(
    method validate (line 204) | pub(crate) async fn validate(
    method validate_best_of (line 451) | pub(crate) fn validate_best_of(&self, best_of: usize) -> Result<usize,...
  function round_robin_task (line 465) | async fn round_robin_task(
  function tokenizer_worker (line 480) | fn tokenizer_worker(
  function format_from_mimetype (line 540) | fn format_from_mimetype(mimetype: &str) -> Option<ImageFormat> {
  function format_to_mimetype (line 558) | fn format_to_mimetype(format: ImageFormat) -> String {
  function fetch_image (line 570) | fn fetch_image(
  function image_tokens (line 639) | fn image_tokens(
  function image_tokens_fixup (line 802) | fn image_tokens_fixup(config: &Config, text: String) -> String {
  function prepare_input (line 813) | fn prepare_input<T: TokenizerTrait>(
  type TokenizerRequest (line 865) | type TokenizerRequest = (
  type Image (line 872) | pub struct Image {
  type Chunk (line 878) | pub enum Chunk {
  type ChunksToString (line 885) | pub trait ChunksToString {
    method chunks_to_string (line 887) | fn chunks_to_string(&self) -> String;
    method chunks_to_string (line 891) | fn chunks_to_string(&self) -> String {
  type ValidGrammar (line 905) | pub enum ValidGrammar {
  type ValidParameters (line 911) | pub struct ValidParameters {
  type ValidStoppingParameters (line 935) | pub struct ValidStoppingParameters {
  type ValidGenerateRequest (line 948) | pub struct ValidGenerateRequest {
  type ValidationError (line 962) | pub enum ValidationError {
  function test_validation_max_new_tokens (line 1041) | async fn test_validation_max_new_tokens() {
  function test_validation_input_length (line 1077) | async fn test_validation_input_length() {
  function test_validation_best_of_sampling (line 1112) | async fn test_validation_best_of_sampling() {
  function test_validation_top_p (line 1153) | async fn test_validation_top_p() {
  function test_validation_top_n_tokens (line 1225) | async fn test_validation_top_n_tokens() {
  function test_prepare_input_chunks (line 1309) | async fn test_prepare_input_chunks() {
  function test_idefics2_correct_n_fake_tokens (line 1366) | async fn test_idefics2_correct_n_fake_tokens() {

FILE: router/src/vertex.rs
  type GenerateVertexInstance (line 15) | pub(crate) struct GenerateVertexInstance {
  type VertexInstance (line 25) | pub(crate) enum VertexInstance {
  type VertexRequest (line 32) | pub(crate) struct VertexRequest {
  type VertexResponse (line 38) | pub(crate) struct VertexResponse {
  function vertex_compatibility (line 71) | pub(crate) async fn vertex_compatibility(
  function vertex_deserialization (line 159) | fn vertex_deserialization() {

FILE: server/bounds-from-nix.py
  function is_optional (line 12) | def is_optional(info: Union[str, Dict[str, str]]) -> bool:

FILE: server/exllama_kernels/exllama_kernels/exllama_ext.cpp
  function check_cuda (line 21) | void check_cuda(cudaError_t ret)
  function get_groupsize (line 68) | int get_groupsize(torch::Tensor w, torch::Tensor w_zeros)
  function set_tuning_params (line 80) | void set_tuning_params
  function cleanup (line 95) | void cleanup()
  function prepare_buffers (line 104) | void prepare_buffers
  function make_q4 (line 126) | uintptr_t make_q4
  function q4_matmul (line 168) | void q4_matmul
  function column_remap (line 218) | void column_remap
  function PYBIND11_MODULE (line 246) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: server/exllama_kernels/exllama_kernels/tuning.h
  type ExLlamaTuning (line 6) | struct ExLlamaTuning

FILE: server/exllamav2_kernels/exllamav2_kernels/ext.cpp
  function make_q_matrix (line 26) | uintptr_t make_q_matrix
  function gemm_half_q_half (line 100) | void gemm_half_q_half
  function PYBIND11_MODULE (line 135) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: server/tests/conftest.py
  function default_pb_parameters (line 10) | def default_pb_parameters():
  function default_pb_stop_parameters (line 22) | def default_pb_stop_parameters():

FILE: server/tests/models/test_bloom.py
  function default_bloom (line 17) | def default_bloom():
  function bloom_560m_tokenizer (line 29) | def bloom_560m_tokenizer():
  function default_pb_request (line 34) | def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
  function default_pb_batch (line 47) | def default_pb_batch(default_pb_request):
  function default_bloom_batch (line 52) | def default_bloom_batch(default_pb_batch, bloom_560m_tokenizer):
  function default_multi_requests_bloom_batch (line 59) | def default_multi_requests_bloom_batch(default_pb_request, bloom_560m_to...
  function test_batch_from_pb (line 72) | def test_batch_from_pb(default_pb_batch, default_bloom_batch):
  function test_batch_concatenate_no_prefill (line 102) | def test_batch_concatenate_no_prefill(default_bloom_batch):
  function test_causal_lm_batch_type (line 107) | def test_causal_lm_batch_type(default_bloom):
  function test_causal_lm_generate_token (line 111) | def test_causal_lm_generate_token(default_bloom, default_bloom_batch):
  function test_causal_lm_generate_token_completion (line 160) | def test_causal_lm_generate_token_completion(default_bloom, default_bloo...
  function test_causal_lm_generate_token_completion_multi (line 180) | def test_causal_lm_generate_token_completion_multi(
  function test_batch_concatenate (line 230) | def test_batch_concatenate(

FILE: server/tests/models/test_causal_lm.py
  function default_causal_lm (line 12) | def default_causal_lm():
  function gpt2_tokenizer (line 17) | def gpt2_tokenizer():
  function default_pb_request (line 24) | def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
  function default_pb_batch (line 37) | def default_pb_batch(default_pb_request):
  function default_causal_lm_batch (line 42) | def default_causal_lm_batch(default_pb_batch, gpt2_tokenizer):
  function default_multi_requests_causal_lm_batch (line 49) | def default_multi_requests_causal_lm_batch(default_pb_request, gpt2_toke...
  function test_batch_from_pb (line 62) | def test_batch_from_pb(default_pb_batch, default_causal_lm_batch):
  function test_batch_concatenate_no_prefill (line 92) | def test_batch_concatenate_no_prefill(default_causal_lm_batch):
  function test_causal_lm_batch_type (line 97) | def test_causal_lm_batch_type(default_causal_lm):
  function test_causal_lm_generate_token (line 101) | def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_b...
  function test_causal_lm_generate_token_completion (line 152) | def test_causal_lm_generate_token_completion(
  function test_causal_lm_generate_token_completion_multi (line 172) | def test_causal_lm_generate_token_completion_multi(
  function test_batch_concatenate (line 224) | def test_batch_concatenate(

FILE: server/tests/models/test_model.py
  function get_test_model (line 9) | def get_test_model():
  function test_decode_streaming_english_spaces (line 31) | def test_decode_streaming_english_spaces():
  function test_decode_streaming_chinese_utf8 (line 52) | def test_decode_streaming_chinese_utf8():

FILE: server/tests/models/test_santacoder.py
  function default_santacoder (line 8) | def default_santacoder():
  function default_pb_request (line 13) | def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
  function default_pb_batch (line 26) | def default_pb_batch(default_pb_request):
  function default_fim_pb_request (line 31) | def default_fim_pb_request(default_pb_parameters, default_pb_stop_parame...
  function default_fim_pb_batch (line 50) | def default_fim_pb_batch(default_fim_pb_request):
  function test_santacoder_generate_token_completion (line 55) | def test_santacoder_generate_token_completion(default_santacoder, defaul...
  function test_fim_santacoder_generate_token_completion (line 81) | def test_fim_santacoder_generate_token_completion(

FILE: server/tests/models/test_seq2seq_lm.py
  function mt0_small_tokenizer (line 13) | def mt0_small_tokenizer():
  function default_seq2seq_lm (line 22) | def default_seq2seq_lm():
  function default_pb_request (line 27) | def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
  function default_pb_batch (line 40) | def default_pb_batch(default_pb_request):
  function default_seq2seq_lm_batch (line 45) | def default_seq2seq_lm_batch(default_pb_batch, mt0_small_tokenizer):
  function default_multi_requests_seq2seq_lm_batch (line 52) | def default_multi_requests_seq2seq_lm_batch(default_pb_request, mt0_smal...
  function test_batch_from_pb (line 65) | def test_batch_from_pb(default_pb_batch, default_seq2seq_lm_batch):
  function test_batch_concatenate_no_prefill (line 96) | def test_batch_concatenate_no_prefill(default_seq2seq_lm_batch):
  function test_seq2seq_lm_batch_type (line 101) | def test_seq2seq_lm_batch_type(default_seq2seq_lm):
  function test_seq2seq_lm_generate_token (line 105) | def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_l...
  function test_seq2seq_lm_generate_token_completion (line 172) | def test_seq2seq_lm_generate_token_completion(
  function test_seq2seq_lm_generate_token_completion_multi (line 189) | def test_seq2seq_lm_generate_token_completion_multi(
  function test_batch_concatenate (line 226) | def test_batch_concatenate(

FILE: server/tests/utils/test_adapter.py
  function test_parse_lora_adapters_empty (line 11) | def test_parse_lora_adapters_empty():
  function test_parse_lora_adapters_single (line 16) | def test_parse_lora_adapters_single():
  function test_parse_lora_adapters_with_path (line 21) | def test_parse_lora_adapters_with_path():
  function test_parse_lora_adapters_with_path_and_revision (line 28) | def test_parse_lora_adapters_with_path_and_revision():
  function test_parse_lora_adapters_multiple (line 35) | def test_parse_lora_adapters_multiple():
  function test_parse_lora_adapters_invalid_format (line 46) | def test_parse_lora_adapters_invalid_format():
  function test_get_attn_weights (line 54) | def test_get_attn_weights():
  function test_get_mlp_weights_with_gate_up_proj (line 86) | def test_get_mlp_weights_with_gate_up_proj():
  function test_get_mlp_weights_without_gate_up_proj (line 106) | def test_get_mlp_weights_without_gate_up_proj():
  function test_get_attn_weights_different_layers (line 119) | def test_get_attn_weights_different_layers(layer_index):
  function test_get_mlp_weights_different_layers (line 141) | def test_get_mlp_weights_different_layers(layer_index):
  function test_get_attn_weights_llama_compatibility (line 156) | def test_get_attn_weights_llama_compatibility():
  function test_get_mlp_weights_llama_compatibility (line 185) | def test_get_mlp_weights_llama_compatibility():
  function test_get_attn_weights_gemma_compatibility (line 202) | def test_get_attn_weights_gemma_compatibility():
  function test_get_mlp_weights_gemma_compatibility (line 231) | def test_get_mlp_weights_gemma_compatibility():

FILE: server/tests/utils/test_convert.py
  function test_convert_files (line 10) | def test_convert_files():

FILE: server/tests/utils/test_hub.py
  function offline (line 20) | def offline():
  function fresh_cache (line 28) | def fresh_cache():
  function prefetched (line 41) | def prefetched():
  function test_weight_hub_files_offline_error (line 53) | def test_weight_hub_files_offline_error(offline, fresh_cache):
  function test_weight_hub_files_offline_ok (line 59) | def test_weight_hub_files_offline_ok(prefetched, offline):
  function test_weight_hub_files (line 73) | def test_weight_hub_files():
  function test_weight_hub_files_llm (line 78) | def test_weight_hub_files_llm():
  function test_weight_hub_files_empty (line 83) | def test_weight_hub_files_empty():
  function test_download_weights (line 88) | def test_download_weights():
  function test_weight_files_revision_error (line 96) | def test_weight_files_revision_error():
  function test_weight_files_not_cached_error (line 101) | def test_weight_files_not_cached_error(fresh_cache):

FILE: server/tests/utils/test_layers.py
  class ProcessGroup (line 7) | class ProcessGroup:
    method __init__ (line 8) | def __init__(self, rank: int, world_size: int):
    method size (line 12) | def size(self) -> int:
    method rank (line 15) | def rank(self) -> int:
  class Weights (line 19) | class Weights:
    method __init__ (line 20) | def __init__(self, rank: int, world_size: int, vocab_size: int, hidden...
    method get_partial_sharded (line 26) | def get_partial_sharded(self, name: str, dim: int):
    method get_shape (line 38) | def get_shape(self, name: str):
  function test_weight_hub_files_offline_error (line 42) | def test_weight_hub_files_offline_error():

FILE: server/tests/utils/test_tokens.py
  function test_stop_sequence_criteria (line 10) | def test_stop_sequence_criteria():
  function test_stop_sequence_criteria_escape (line 19) | def test_stop_sequence_criteria_escape():
  function test_stopping_criteria (line 28) | def test_stopping_criteria():
  function test_stopping_criteria_eos (line 34) | def test_stopping_criteria_eos():
  function test_stopping_criteria_max (line 40) | def test_stopping_criteria_max():
  function test_batch_top_tokens (line 49) | def test_batch_top_tokens():

FILE: server/tests/utils/test_watermark.py
  function test_seed_rng (line 13) | def test_seed_rng():
  function test_get_greenlist_ids (line 20) | def test_get_greenlist_ids():
  function test_calc_greenlist_mask (line 28) | def test_calc_greenlist_mask():
  function test_bias_greenlist_logits (line 37) | def test_bias_greenlist_logits():
  function test_call (line 49) | def test_call():

FILE: server/tests/utils/test_weights.py
  function gptq_weights_loader (line 20) | def gptq_weights_loader():
  function gptq_weights_loader_awq (line 33) | def gptq_weights_loader_awq():
  function marlin_weights_loader (line 46) | def marlin_weights_loader():
  class MockSlice (line 293) | class MockSlice:
    method __init__ (line 294) | def __init__(self, tensor):
    method get_shape (line 297) | def get_shape(self):
    method __getitem__ (line 300) | def __getitem__(self, idx):
  function mock_get_slice (line 304) | def mock_get_slice(tensor_name, filename):
  function mock_handle (line 309) | def mock_handle(filename, device, dtype):
  class MockSafeOpen (line 315) | class MockSafeOpen:
    method __init__ (line 316) | def __init__(self, filename, framework, dummy_fs):
    method keys (line 321) | def keys(self):
    method __enter__ (line 324) | def __enter__(self):
    method __exit__ (line 327) | def __exit__(self, exc_type, exc_val, exc_tb):
  class MockWeights (line 331) | class MockWeights(Weights):
    method __init__ (line 332) | def __init__(
    method _get_handle (line 369) | def _get_handle(self, filename: Union[Path, str]):
    method get_shape (line 377) | def get_shape(self, tensor_name: str):
    method get_tensor (line 382) | def get_tensor(self, tensor_name: str):
  function test_weights (line 391) | def test_weights():
  function test_get_tensor (line 406) | def test_get_tensor():
  function test_get_weights_col_packed (line 439) | def test_get_weights_col_packed():
  function test_get_weights_col_packed_block_size (line 473) | def test_get_weights_col_packed_block_size():
  function test_get_weights_col_packed_block_size_arr (line 507) | def test_get_weights_col_packed_block_size_arr():
  function test_get_multi_weights_col (line 541) | def test_get_multi_weights_col():
  function test_get_weights_row (line 577) | def test_get_weights_row():
  function test_get_weights_col_awq (line 606) | def test_get_weights_col_awq(gptq_weights_loader_awq):
  function test_get_weights_col_gtpq (line 648) | def test_get_weights_col_gtpq(gptq_weights_loader):
  function test_get_weights_col_exl2 (line 687) | def test_get_weights_col_exl2():
  function test_get_weights_col_marlin (line 723) | def test_get_weights_col_marlin(marlin_weights_loader):
  function test_get_weights_col_packed_awq (line 753) | def test_get_weights_col_packed_awq(gptq_weights_loader_awq):
  function test_get_weights_col_packed_exl2 (line 795) | def test_get_weights_col_packed_exl2():
  function test_get_weights_col_packed_gptq (line 833) | def test_get_weights_col_packed_gptq(gptq_weights_loader):
  function test_get_weights_col_packed_marlin (line 873) | def test_get_weights_col_packed_marlin(marlin_weights_loader):
  function test_get_multi_weights_col_awq (line 906) | def test_get_multi_weights_col_awq(gptq_weights_loader_awq):
  function test_get_multi_weights_col_exl2 (line 946) | def test_get_multi_weights_col_exl2():
  function test_get_multi_weights_col_gptq (line 969) | def test_get_multi_weights_col_gptq(gptq_weights_loader):
  function test_get_multi_weights_col_marlin (line 1009) | def test_get_multi_weights_col_marlin(marlin_weights_loader):
  function test_get_weights_row_awq (line 1040) | def test_get_weights_row_awq(gptq_weights_loader_awq):
  function test_get_weights_row_exl2 (line 1079) | def test_get_weights_row_exl2():
  function test_get_weights_row_gptq (line 1116) | def test_get_weights_row_gptq(gptq_weights_loader):
  function test_get_weights_row_marlin (line 1155) | def test_get_weights_row_marlin(marlin_weights_loader):

FILE: server/text_generation_server/adapters/config.py
  class ModuleMap (line 15) | class ModuleMap:
  class AdapterConfig (line 21) | class AdapterConfig(ABC):
    method map_weights_for_model (line 25) | def map_weights_for_model(

FILE: server/text_generation_server/adapters/lora.py
  function get_start_stop_idxs_for_rank (line 31) | def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
  function shard_on_dim (line 38) | def shard_on_dim(
  function shard_lora_weights (line 57) | def shard_lora_weights(
  class LoraConfig (line 75) | class LoraConfig(AdapterConfig):
    method map_weights_for_model (line 82) | def map_weights_for_model(
    method load (line 104) | def load(cls, adapter_id: str, api_token: str) -> "LoraConfig":
  class LoraWeights (line 118) | class LoraWeights(AdapterWeights):
    method __init__ (line 121) | def __init__(
    method weights_a (line 155) | def weights_a(self) -> torch.Tensor:
    method weights_b (line 161) | def weights_b(self) -> torch.Tensor:
    method weights_a_t (line 167) | def weights_a_t(self) -> torch.Tensor:
    method weights_b_t (line 173) | def weights_b_t(self) -> torch.Tensor:
    method _transpose_weights (line 178) | def _transpose_weights(self):
    method get_batch_types (line 186) | def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]:
    method prepare_weights (line 206) | def prepare_weights(
  class RankSegments (line 287) | class RankSegments:
  class BatchLoraWeights (line 304) | class BatchLoraWeights(BatchAdapterWeights):
    method has_adapter (line 311) | def has_adapter(self, adapter_index: int) -> bool:
    method can_vectorize (line 314) | def can_vectorize(self, pg: ProcessGroup) -> bool:
    method load (line 321) | def load(
  class IPEXBatchLoraWeights (line 490) | class IPEXBatchLoraWeights(BatchLoraWeights):
    method load (line 492) | def load(
  function get_scaling_factor (line 598) | def get_scaling_factor(
  function _convert_lora (line 609) | def _convert_lora(v: AdapterWeights) -> AdapterWeights:

FILE: server/text_generation_server/adapters/weights.py
  class AdapterBatchMetadata (line 14) | class AdapterBatchMetadata:
  class AdapterWeights (line 30) | class AdapterWeights(ABC):
    method get_batch_types (line 32) | def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]:
    method speculative_tokens (line 36) | def speculative_tokens(self) -> int:
  class BatchAdapterWeights (line 40) | class BatchAdapterWeights(ABC):
    method has_adapter (line 42) | def has_adapter(self, adapter_index: int) -> bool:
    method load (line 46) | def load(
  class LayerAdapterWeights (line 56) | class LayerAdapterWeights:
    method __init__ (line 59) | def __init__(self):
    method add_adapter (line 62) | def add_adapter(self, adapter_idx: int, weights: AdapterWeights):
    method remove_adapter (line 65) | def remove_adapter(self, adapter_idx: int):
    method is_empty (line 70) | def is_empty(self) -> bool:
    method get_data (line 73) | def get_data(
  class AdapterBatchData (line 98) | class AdapterBatchData:
    method from_meta (line 107) | def from_meta(
    method ranks (line 122) | def ranks(self) -> Set[int]:
    method layer_names (line 134) | def layer_names(self) -> Set[str]:
    method adapter_keys (line 137) | def adapter_keys(self) -> Set[str]:
    method max_rank (line 144) | def max_rank(self) -> int:

FILE: server/text_generation_server/cache.py
  class Cache (line 10) | class Cache:
    method __init__ (line 11) | def __init__(self):
    method pop (line 14) | def pop(self, batch_id: int) -> Optional[B]:
    method set (line 17) | def set(self, entry: B):
    method delete (line 21) | def delete(self, batch_id: int):
    method clear (line 28) | def clear(self):
    method __len__ (line 33) | def __len__(self):

FILE: server/text_generation_server/cli.py
  class Quantization (line 18) | class Quantization(str, Enum):
  class Dtype (line 31) | class Dtype(str, Enum):
  class KVCacheDtype (line 36) | class KVCacheDtype(str, Enum):
  function serve (line 42) | def serve(
  function download_weights (line 135) | def download_weights(
  function quantize (line 339) | def quantize(

FILE: server/text_generation_server/interceptor.py
  class ExceptionInterceptor (line 11) | class ExceptionInterceptor(AsyncServerInterceptor):
    method __init__ (line 12) | def __init__(self, shutdown_callback):
    method intercept (line 15) | async def intercept(

FILE: server/text_generation_server/layers/attention/common.py
  class Seqlen (line 7) | class Seqlen:
    method __init__ (line 15) | def __init__(
    method clamp (line 50) | def clamp(self, max):

FILE: server/text_generation_server/layers/attention/cuda.py
  function paged_attention (line 30) | def paged_attention(
  function attention (line 228) | def attention(

FILE: server/text_generation_server/layers/attention/flash_attn_triton.py
  function cdiv_fn (line 31) | def cdiv_fn(x, y):
  function max_fn (line 36) | def max_fn(x, y):
  function dropout_offsets (line 41) | def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
  function dropout_rng (line 48) | def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
  function dropout_mask (line 57) | def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
  function load_fn (line 64) | def load_fn(block_ptr, first, second, pad):
  function _attn_fwd_inner (line 77) | def _attn_fwd_inner(
  function attn_fwd (line 311) | def attn_fwd(
  function check_args (line 668) | def check_args(
  class _attention (line 701) | class _attention(torch.autograd.Function):
    method forward (line 703) | def forward(

FILE: server/text_generation_server/layers/attention/flashinfer.py
  function unpad_2d_mask (line 24) | def unpad_2d_mask(
  function get_workspace (line 38) | def get_workspace(device):
  function create_prefill_with_paged_kv_state (line 46) | def create_prefill_with_paged_kv_state(
  function use_prefill_with_paged_kv_state (line 58) | def use_prefill_with_paged_kv_state(
  function create_prefill_state (line 130) | def create_prefill_state(
  function create_decode_state (line 141) | def create_decode_state(
  function create_decode_state_cuda_graphs (line 159) | def create_decode_state_cuda_graphs(
  function use_decode_state (line 188) | def use_decode_state(

FILE: server/text_generation_server/layers/attention/ipex.py
  function attention (line 17) | def attention(
  function paged_attention (line 89) | def paged_attention(

FILE: server/text_generation_server/layers/attention/kv_cache.py
  class KVScales (line 28) | class KVScales:
    method __post_init__ (line 43) | def __post_init__(self):
  class KVCache (line 51) | class KVCache:
    method __init__ (line 58) | def __init__(
    method can_scale (line 135) | def can_scale(self, kv_scales: KVScales) -> bool:
    method dtype (line 155) | def dtype(self):
    method key (line 160) | def key(self):
    method value (line 166) | def value(self):
    method store (line 171) | def store(
  function paged_reshape_and_cache (line 243) | def paged_reshape_and_cache(
  function get_kv_scales (line 310) | def get_kv_scales(weights: Weights, prefix: str) -> KVScales:

FILE: server/text_generation_server/layers/attention/rocm.py
  function _use_rocm_custom_paged_attention (line 32) | def _use_rocm_custom_paged_attention(
  function paged_attention (line 51) | def paged_attention(
  function attention (line 273) | def attention(

FILE: server/text_generation_server/layers/awq/conversion_utils.py
  function pack (line 9) | def pack(imatrix: torch.Tensor, direction: str = "column"):
  function unpack (line 35) | def unpack(qmatrix: torch.Tensor, direction: str = "column"):
  function apply_order (line 61) | def apply_order(
  function fast_awq_to_gptq (line 83) | def fast_awq_to_gptq(qweight, qzeros):

FILE: server/text_generation_server/layers/awq/quantize/cuda.py
  class WQLinear (line 19) | class WQLinear(nn.Module):
    method __init__ (line 20) | def __init__(
    method forward (line 43) | def forward(self, x):

FILE: server/text_generation_server/layers/awq/quantize/ipex.py
  class WQLinear (line 7) | class WQLinear(nn.Module):
    method __init__ (line 8) | def __init__(
    method forward (line 44) | def forward(self, x):

FILE: server/text_generation_server/layers/bnb.py
  class BNBWeight (line 10) | class BNBWeight(UnquantizedWeight):
    method get_linear (line 13) | def get_linear(self, bias: torch.Tensor):
  class Linear8bitLt (line 17) | class Linear8bitLt(torch.nn.Module):
    method __init__ (line 18) | def __init__(
    method init_8bit_state (line 49) | def init_8bit_state(self):
    method forward (line 55) | def forward(self, x: torch.Tensor):
  class BNBFP4Weight (line 76) | class BNBFP4Weight(UnquantizedWeight):
    method get_linear (line 79) | def get_linear(self, bias: torch.Tensor):
  class BNBNF4Weight (line 84) | class BNBNF4Weight(UnquantizedWeight):
    method get_linear (line 87) | def get_linear(self, bias: torch.Tensor):
  class Linear4bit (line 91) | class Linear4bit(torch.nn.Module):
    method __init__ (line 92) | def __init__(self, weight, bias, quant_type):
    method forward (line 104) | def forward(self, x: torch.Tensor):

FILE: server/text_generation_server/layers/compressed_tensors/loader.py
  class CompressedTensorsLoader (line 34) | class CompressedTensorsLoader(WeightsLoader):
    method __init__ (line 37) | def __init__(self, config: Dict[str, Any]):
    method get_weights (line 74) | def get_weights(self, weights: Weights, prefix: str):
    method get_weights_col_packed (line 78) | def get_weights_col_packed(
    method get_multi_weights_col (line 87) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_weights_row (line 91) | def get_weights_row(self, weights: Weights, prefix: str):
    method _get_target_loaders (line 95) | def _get_target_loaders(
    method _create_loader_for_group (line 126) | def _create_loader_for_group(
    method _lookup_loader (line 181) | def _lookup_loader(self, prefix: str) -> WeightsLoader:

FILE: server/text_generation_server/layers/compressed_tensors/w8a8_int.py
  class W8A8IntLoader (line 22) | class W8A8IntLoader(WeightsLoader):
    method __init__ (line 27) | def __init__(
    method __str__ (line 54) | def __str__(self) -> str:
    method get_weights (line 63) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 78) | def get_weights_col_packed(
    method get_multi_weights_col (line 106) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_weights_row (line 128) | def get_weights_row(self, weights: "Weights", prefix: str):
  function _get_tensor_or_else (line 147) | def _get_tensor_or_else(
  class Int8Weight (line 159) | class Int8Weight(Weight):
    method get_linear (line 164) | def get_linear(self, bias: torch.Tensor):
  class W8A8IntLinear (line 183) | class W8A8IntLinear(torch.nn.Module):
    method __init__ (line 184) | def __init__(
    method forward (line 210) | def forward(self, input: torch.Tensor) -> torch.Tensor:

FILE: server/text_generation_server/layers/compressed_tensors/w8an_fp.py
  class W8ANFpLoader (line 16) | class W8ANFpLoader(WeightsLoader):
    method __init__ (line 21) | def __init__(
    method __str__ (line 43) | def __str__(self) -> str:
    method get_weights (line 51) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 75) | def get_weights_col_packed(
    method get_multi_weights_col (line 118) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_weights_row (line 169) | def get_weights_row(self, weights: "Weights", prefix: str):

FILE: server/text_generation_server/layers/compressed_tensors/wna16_int.py
  class WNA16IntLoader (line 12) | class WNA16IntLoader(WeightsLoader):
    method __init__ (line 17) | def __init__(self, weights: QuantizationArgs):
    method __str__ (line 24) | def __str__(self) -> str:
    method get_weights (line 29) | def get_weights(self, weights: Weights, prefix: str):
    method get_weights_col_packed (line 61) | def get_weights_col_packed(
    method get_multi_weights_col (line 103) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_weights_row (line 148) | def get_weights_row(self, weights: Weights, prefix: str):

FILE: server/text_generation_server/layers/compressed_tensors/wna16_int_24.py
  class WNA16Int24Loader (line 11) | class WNA16Int24Loader(WeightsLoader):
    method __init__ (line 16) | def __init__(self, weight_args: QuantizationArgs):
    method __str__ (line 30) | def __str__(self) -> str:
    method get_weights (line 35) | def get_weights(self, weights: Weights, prefix: str):
    method get_weights_col_packed (line 49) | def get_weights_col_packed(
    method get_multi_weights_col (line 71) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_weights_row (line 88) | def get_weights_row(self, weights: Weights, prefix: str):

FILE: server/text_generation_server/layers/conv.py
  function load_conv2d (line 6) | def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_...
  function load_conv2d_no_bias (line 23) | def load_conv2d_no_bias(

FILE: server/text_generation_server/layers/eetq.py
  class EETQWeight (line 13) | class EETQWeight(UnquantizedWeight):
    method get_linear (line 16) | def get_linear(self, bias: torch.Tensor):
  class EETQLinear (line 27) | class EETQLinear(torch.nn.Module):
    method __init__ (line 28) | def __init__(
    method forward (line 44) | def forward(self, input: torch.Tensor) -> torch.Tensor:

FILE: server/text_generation_server/layers/exl2.py
  class Exl2Weight (line 9) | class Exl2Weight(Weight):
    method __post_init__ (line 20) | def __post_init__(self):
    method device (line 25) | def device(self) -> torch.device:
    method get_linear (line 28) | def get_linear(self, bias: torch.Tensor):
  class Exl2WeightsLoader (line 34) | class Exl2WeightsLoader(WeightsLoader):
    method get_weights (line 37) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 61) | def get_weights_col_packed(
    method get_weights_col (line 69) | def get_weights_col(self, weights: Weights, prefix: str):
    method get_multi_weights_col (line 73) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_weights_row (line 76) | def get_weights_row(self, weights: Weights, prefix: str):

FILE: server/text_generation_server/layers/fp8.py
  function get_fp8_linear (line 44) | def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
  function normalize_e4m3fn_to_native_float8 (line 79) | def normalize_e4m3fn_to_native_float8(
  function per_tensor_dequantize (line 103) | def per_tensor_dequantize(
  function requantize_with_max_scale (line 113) | def requantize_with_max_scale(
  function fp8_quantize (line 136) | def fp8_quantize(
  class HybridFP8UnquantLoader (line 187) | class HybridFP8UnquantLoader(WeightsLoader):
    method __init__ (line 190) | def __init__(
    method get_weights (line 200) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 239) | def get_weights_col_packed(
    method get_multi_weights_col (line 289) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_weights_row (line 357) | def get_weights_row(self, weights: "Weights", prefix: str):
  class Fp8Weight (line 400) | class Fp8Weight(Weight):
    method get_linear (line 409) | def get_linear(self, bias: torch.Tensor):
  class Fp8Linear (line 428) | class Fp8Linear(torch.nn.Module):
    method __init__ (line 431) | def __init__(
    method from_unquant (line 465) | def from_unquant(cls, weight, bias, dtype):
    method from_fp8 (line 477) | def from_fp8(
    method get_shared_device_identity (line 500) | def get_shared_device_identity(cls, device):
    method forward (line 507) | def forward(self, input: torch.Tensor) -> torch.Tensor:
  function _load_scalar_or_matrix_scale (line 582) | def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: t...

FILE: server/text_generation_server/layers/gptq/__init__.py
  class GPTQWeight (line 19) | class GPTQWeight(Weight):
    method __post_init__ (line 29) | def __post_init__(self):
    method device (line 34) | def device(self) -> torch.device:
    method get_linear (line 37) | def get_linear(self, bias: torch.Tensor):
  class GPTQWeightsLoader (line 93) | class GPTQWeightsLoader(WeightsLoader):
    method __init__ (line 98) | def __init__(
    method get_weights (line 117) | def get_weights(self, weights: Weights, prefix: str):
    method is_layer_skipped_quantization (line 197) | def is_layer_skipped_quantization(
    method get_weights_col_packed (line 202) | def get_weights_col_packed(
    method get_multi_weights_col (line 262) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_weights_row (line 331) | def get_weights_row(self, weights: Weights, prefix: str):
    method _get_gptq_params (line 433) | def _get_gptq_params(self, weights: Weights):

FILE: server/text_generation_server/layers/gptq/custom_autotune.py
  class Autotuner (line 14) | class Autotuner(triton.KernelInterface):
    method __init__ (line 15) | def __init__(
    method _bench (line 64) | def _bench(self, *args, config, **meta):
    method run (line 96) | def run(self, *args, **kwargs):
    method prune_configs (line 133) | def prune_configs(self, kwargs):
    method warmup (line 157) | def warmup(self, *args, **kwargs):
  function autotune (line 170) | def autotune(
  function matmul248_kernel_config_pruner (line 217) | def matmul248_kernel_config_pruner(configs, nargs):

FILE: server/text_generation_server/layers/gptq/exllama.py
  function ext_make_q4 (line 9) | def ext_make_q4(qweight, qzeros, scales, g_idx, device):
  function ext_q4_matmul (line 16) | def ext_q4_matmul(x, q4, q4_width):
  function set_device (line 36) | def set_device(device):
  function create_exllama_buffers (line 41) | def create_exllama_buffers(max_total_tokens: int):
  class Ex4bitLinear (line 66) | class Ex4bitLinear(torch.nn.Module):
    method __init__ (line 69) | def __init__(self, weight: GPTQWeight, bias):
    method forward (line 129) | def forward(self, x):

FILE: server/text_generation_server/layers/gptq/exllamav2.py
  class _ExtraTensors (line 28) | class _ExtraTensors:
  function ext_gemm_half_q_half (line 36) | def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
  function make_group_map (line 45) | def make_group_map(q_groups: torch.Tensor, num_qrows: int):
  function ext_make_q_matrix (line 67) | def ext_make_q_matrix(
  function set_device (line 154) | def set_device(device):
  function create_exllama_buffers (line 159) | def create_exllama_buffers(max_total_tokens: int):
  class QuantLinear (line 178) | class QuantLinear(nn.Module):
    method __init__ (line 183) | def __init__(
    method post_init (line 215) | def post_init(self, temp_dq):
    method forward (line 226) | def forward(self, x, force_cuda=False):
    method temp_dq_size (line 233) | def temp_dq_size(self):
    method temp_fwd_size (line 236) | def temp_fwd_size(self, max_input_len, max_batch_size):
    method scratch_space_fixed (line 239) | def scratch_space_fixed(self, max_input_len, max_batch_size):
  class ExLlamaV2DeviceTensors (line 243) | class ExLlamaV2DeviceTensors:
    method __init__ (line 250) | def __init__(self, device, scratch_bytes):
    method prepare (line 254) | def prepare(self):
    method get_scratch_slice (line 259) | def get_scratch_slice(self, size_bytes):

FILE: server/text_generation_server/layers/gptq/ipex.py
  class QuantLinear (line 9) | class QuantLinear(nn.Module):
    method __init__ (line 10) | def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsi...
    method new (line 44) | def new(cls, bits, groupsize, infeatures, outfeatures, bias):
    method pack (line 65) | def pack(self, linear, scales, zeros, g_idx=None):
    method forward (line 122) | def forward(self, x):

FILE: server/text_generation_server/layers/gptq/quantize.py
  class Quantizer (line 25) | class Quantizer(nn.Module):
    method __init__ (line 26) | def __init__(self, shape=1):
    method configure (line 32) | def configure(
    method _quantize (line 54) | def _quantize(self, x, scale, zero, maxq):
    method find_params (line 60) | def find_params(self, x, weight=False):
    method quantize (line 145) | def quantize(self, x):
    method enabled (line 151) | def enabled(self):
    method ready (line 154) | def ready(self):
  class GPTQ (line 158) | class GPTQ:
    method __init__ (line 159) | def __init__(self, layer, observe=False):
    method add_batch (line 174) | def add_batch(self, inp, out):
    method print_loss (line 209) | def print_loss(self, name, q_weight, weight_error, timecost):
    method fasterquant (line 243) | def fasterquant(
    method free (line 357) | def free(self):
  function get_wikitext2 (line 366) | def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_ptb (line 398) | def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_c4 (line 430) | def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_ptb_new (line 498) | def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_c4_new (line 530) | def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code):
  function get_loaders (line 584) | def get_loaders(
  function find_layers (line 599) | def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
  function sequential (line 615) | def sequential(
  function make_quant_linear (line 754) | def make_quant_linear(module, names, bits, groupsize, name=""):
  function pack (line 780) | def pack(model, quantizers, bits, groupsize):
  function setdeepattr (line 794) | def setdeepattr(module, full_name, tensor):
  function getdeepattr (line 802) | def getdeepattr(module, full_name):
  function load_weights_pre_hook (line 810) | def load_weights_pre_hook(module_name, weights, recursive=False):
  function load_weights_post_hook (line 842) | def load_weights_post_hook(module_name, weights, recursive=False):
  function quantize (line 867) | def quantize(

FILE: server/text_generation_server/layers/gptq/triton.py
  function matmul_248_kernel (line 105) | def matmul_248_kernel(
  function matmul248 (line 204) | def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
  class QuantLinearFunction (line 244) | class QuantLinearFunction(torch.autograd.Function):
    method forward (line 247) | def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  class QuantLinear (line 252) | class QuantLinear(nn.Module):
    method __init__ (line 253) | def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsi...
    method new (line 273) | def new(cls, bits, groupsize, infeatures, outfeatures, bias):
    method pack (line 294) | def pack(self, linear, scales, zeros, g_idx=None):
    method forward (line 351) | def forward(self, x):

FILE: server/text_generation_server/layers/gptq/utils.py
  function torch_snr_error (line 5) | def torch_snr_error(

FILE: server/text_generation_server/layers/layernorm.py
  function load_layer_norm (line 11) | def load_layer_norm(cls, prefix, weights, eps):
  function load_layer_norm_no_bias (line 23) | def load_layer_norm_no_bias(cls, prefix, weights, eps):
  class FastLayerNorm (line 39) | class FastLayerNorm(nn.LayerNorm):
    method forward (line 40) | def forward(self, hidden_states, residual=None):
    method forward (line 78) | def forward(self, hidden_states, residual=None):
    method forward (line 89) | def forward(self, hidden_states, residual=None):
  class FastLayerNorm (line 77) | class FastLayerNorm(nn.LayerNorm):
    method forward (line 40) | def forward(self, hidden_states, residual=None):
    method forward (line 78) | def forward(self, hidden_states, residual=None):
    method forward (line 89) | def forward(self, hidden_states, residual=None):
  class FastLayerNorm (line 88) | class FastLayerNorm(nn.LayerNorm):
    method forward (line 40) | def forward(self, hidden_states, residual=None):
    method forward (line 78) | def forward(self, hidden_states, residual=None):
    method forward (line 89) | def forward(self, hidden_states, residual=None):
  class FastRMSNorm (line 101) | class FastRMSNorm(nn.Module):
    method __init__ (line 102) | def __init__(self, weight: torch.Tensor, eps: float):
    method load (line 109) | def load(cls, prefix, weights, eps=1e-6):
    method forward (line 113) | def forward(self, hidden_states, residual=None):

FILE: server/text_generation_server/layers/linear.py
  class FastLinear (line 21) | class FastLinear(torch.nn.Module):
    method __init__ (line 22) | def __init__(
    method load (line 35) | def load(cls, config, prefix: str, weights, bias: bool):
    method forward (line 43) | def forward(self, input: torch.Tensor) -> torch.Tensor:
  class FastLinearROCm (line 47) | class FastLinearROCm(torch.nn.Module):
    method __init__ (line 48) | def __init__(
    method load (line 69) | def load(cls, config, prefix: str, weights, bias: bool):
    method forward (line 77) | def forward(self, inp: torch.Tensor) -> torch.Tensor:
  function get_linear (line 116) | def get_linear(weight, bias):

FILE: server/text_generation_server/layers/lora.py
  class LoraLinear (line 35) | class LoraLinear(nn.Module):
    method __init__ (line 36) | def __init__(
    method forward_layer_type (line 44) | def forward_layer_type(
    method forward_lora (line 212) | def forward_lora(
    method collect_lora_a (line 231) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
  class TensorParallelMultiAdapterLinear (line 235) | class TensorParallelMultiAdapterLinear(LoraLinear):
    method __init__ (line 236) | def __init__(
    method load (line 249) | def load(
    method forward (line 261) | def forward(
    method collect_lora_a (line 304) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
  class TensorParallelAdapterRowLinear (line 319) | class TensorParallelAdapterRowLinear(LoraLinear):
    method __init__ (line 320) | def __init__(self, base_layer, layer_id, layer_name, process_group):
    method load (line 325) | def load(cls, base_layer, layer_id, layer_name, process_group):
    method forward (line 328) | def forward(
    method collect_lora_a (line 347) | def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:

FILE: server/text_generation_server/layers/marlin/fp8.py
  class GPTQMarlinFP8Linear (line 25) | class GPTQMarlinFP8Linear(nn.Module):
    method __init__ (line 30) | def __init__(
    method from_unquant (line 60) | def from_unquant(cls, weight, bias, dtype):
    method from_fp8 (line 65) | def from_fp8(
    method forward (line 75) | def forward(self, A: torch.Tensor) -> torch.Tensor:
  function pack_fp8_as_int32 (line 97) | def pack_fp8_as_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
  function repack_fp8_for_marlin (line 128) | def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor):

FILE: server/text_generation_server/layers/marlin/gptq.py
  function can_use_gptq_marlin (line 39) | def can_use_gptq_marlin(
  class GPTQMarlinWeightsLoader (line 55) | class GPTQMarlinWeightsLoader(WeightsLoader):
    method __init__ (line 60) | def __init__(
    method get_weights (line 77) | def get_weights(self, weights: Weights, prefix: str):
    method get_weights_col_packed (line 110) | def get_weights_col_packed(
    method get_multi_weights_col (line 153) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_weights_row (line 195) | def get_weights_row(self, weights: Weights, prefix: str):
    method _get_gptq_params (line 237) | def _get_gptq_params(self, weights: Weights):
  class GPTQMarlinWeight (line 253) | class GPTQMarlinWeight(Weight):
    method __post_init__ (line 266) | def __post_init__(self):
    method get_linear (line 272) | def get_linear(self, bias: torch.Tensor):
  function repack_gptq_for_marlin (line 279) | def repack_gptq_for_marlin(
  class GPTQMarlinLinear (line 371) | class GPTQMarlinLinear(nn.Module):
    method __init__ (line 377) | def __init__(
    method forward (line 422) | def forward(self, A: torch.Tensor) -> torch.Tensor:
  function awq_to_marlin_zero_points (line 450) | def awq_to_marlin_zero_points(
  function _check_valid_shape (line 474) | def _check_valid_shape(in_features: int, out_features: int):

FILE: server/text_generation_server/layers/marlin/marlin.py
  class MarlinWeightsLoader (line 20) | class MarlinWeightsLoader(WeightsLoader):
    method __init__ (line 23) | def __init__(self, *, bits: int, is_marlin_24: bool):
    method get_weights (line 27) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 58) | def get_weights_col_packed(
    method get_multi_weights_col (line 89) | def get_multi_weights_col(self, weights: Weights, prefixes: List[str],...
    method get_weights_row (line 128) | def get_weights_row(self, weights: Weights, prefix: str):
  class MarlinWeight (line 170) | class MarlinWeight(Weight):
    method __post_init__ (line 182) | def __post_init__(self):
    method get_linear (line 186) | def get_linear(self, bias: torch.Tensor):
  class MarlinLinear (line 190) | class MarlinLinear(nn.Module):
    method __init__ (line 191) | def __init__(self, *, weight: MarlinWeight, bias: Optional[torch.Tenso...
    method forward (line 223) | def forward(self, A: torch.Tensor) -> torch.Tensor:
  class GPTQMarlin24Weight (line 252) | class GPTQMarlin24Weight:
    method __post_init__ (line 268) | def __post_init__(self):
    method get_linear (line 273) | def get_linear(self, bias: torch.Tensor):
  class GPTQMarlin24Linear (line 280) | class GPTQMarlin24Linear(nn.Module):
    method __init__ (line 281) | def __init__(self, *, weight: GPTQMarlin24Weight, bias: Optional[torch...
    method forward (line 346) | def forward(self, A: torch.Tensor) -> torch.Tensor:

FILE: server/text_generation_server/layers/marlin/util.py
  function _check_marlin_kernels (line 23) | def _check_marlin_kernels():
  function get_perms (line 37) | def get_perms() -> Tuple[List[int], List[int]]:
  function permute_scales (line 47) | def permute_scales(scales: torch.Tensor):
  function get_pack_factor (line 60) | def get_pack_factor(bits: int) -> int:
  function pack_cols (line 66) | def pack_cols(
  function unpack_cols (line 92) | def unpack_cols(
  function marlin_zero_points (line 124) | def marlin_zero_points(

FILE: server/text_generation_server/layers/medusa.py
  class ResBlock (line 12) | class ResBlock(torch.nn.Module):
    method __init__ (line 13) | def __init__(self, config, prefix, weights):
    method forward (line 20) | def forward(self, x):
  class MedusaModel (line 24) | class MedusaModel(torch.nn.Module):
    method __init__ (line 25) | def __init__(self, config, medusa_config, weights):
    method forward (line 34) | def forward(self, x):
  class MedusaHead (line 41) | class MedusaHead(torch.nn.Module):
    method __init__ (line 42) | def __init__(self, config, medusa_config, prefix, weights):
    method forward (line 55) | def forward(self, x):
  class MedusaHeadV1 (line 62) | class MedusaHeadV1(nn.Module):
    method __init__ (line 63) | def __init__(self, lm_head, medusa):
    method load (line 69) | def load(config, prefix: str, weights):
    method forward (line 97) | def forward(
  class MedusaHeadV2 (line 109) | class MedusaHeadV2(nn.Module):
    method __init__ (line 110) | def __init__(self, config, prefix, weights):
    method forward (line 150) | def forward(self, x):

FILE: server/text_generation_server/layers/mlp.py
  class MLPSpeculatorLayerNorm (line 11) | class MLPSpeculatorLayerNorm(nn.Module):
    method __init__ (line 27) | def __init__(
    method forward (line 39) | def forward(self, x):
  function simple_norm (line 51) | def simple_norm(x: torch.Tensor, eps=1e-06):
  class MLPSpeculatorModelTied (line 58) | class MLPSpeculatorModelTied(torch.nn.Module):
    method __init__ (line 59) | def __init__(self, config, prefix, weights):
    method forward (line 96) | def forward(
  class MLPSpeculatorModel (line 142) | class MLPSpeculatorModel(torch.nn.Module):
    method __init__ (line 143) | def __init__(self, config, prefix, weights):
    method forward (line 192) | def forward(
  class MLPSpeculatorHead (line 235) | class MLPSpeculatorHead(nn.Module):
    method __init__ (line 236) | def __init__(self, lm_head, mlp_speculator, scale_input: bool):
    method forward (line 242) | def forward(
    method load (line 257) | def load(config, prefix: str, weights):

FILE: server/text_generation_server/layers/moe/__init__.py
  class MoELayer (line 45) | class MoELayer(Protocol):
    method __init__ (line 46) | def __init__(
    method forward (line 64) | def forward(
  class DenseMoELayer (line 69) | class DenseMoELayer(nn.Module):
    method __init__ (line 77) | def __init__(
    method forward (line 158) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
  class SparseMoELayer (line 197) | class SparseMoELayer(nn.Module):
    method __init__ (line 204) | def __init__(
    method forward (line 265) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
    method is_supported (line 269) | def is_supported(weights: Weights) -> bool:

FILE: server/text_generation_server/layers/moe/fp8.py
  class FP8SparseMoELayer (line 20) | class FP8SparseMoELayer(nn.Module):
    method __init__ (line 21) | def __init__(
    method forward (line 72) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
  function _load_expert_weights (line 94) | def _load_expert_weights(
  function _load_expert_multi_weights_col (line 143) | def _load_expert_multi_weights_col(
  function _load_expert_weights_row (line 161) | def _load_expert_weights_row(

FILE: server/text_generation_server/layers/moe/fused_moe_ipex.py
  function grouped_topk (line 21) | def grouped_topk(
  function fused_topk (line 53) | def fused_topk(

FILE: server/text_generation_server/layers/moe/gptq_marlin.py
  function can_use_marlin_moe_gemm (line 29) | def can_use_marlin_moe_gemm(
  class GPTQMarlinMoEWeight (line 47) | class GPTQMarlinMoEWeight:
  class GPTQMarlinSparseMoELayer (line 56) | class GPTQMarlinSparseMoELayer(nn.Module):
    method __init__ (line 61) | def __init__(
    method forward (line 119) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
  function _load_expert_multi_weights_col (line 151) | def _load_expert_multi_weights_col(
  function _load_expert_weights_row (line 171) | def _load_expert_weights_row(
  function _pack_weight (line 191) | def _pack_weight(
  function fused_marlin_moe (line 243) | def fused_marlin_moe(

FILE: server/text_generation_server/layers/moe/unquantized.py
  class UnquantizedSparseMoELayer (line 18) | class UnquantizedSparseMoELayer(nn.Module):
    method __init__ (line 19) | def __init__(
    method forward (line 68) | def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> ...
  function _load_expert_multi_weights_col (line 107) | def _load_expert_multi_weights_col(
  function _load_expert_weights_row (line 137) | def _load_expert_weights_row(
  function fused_moe (line 166) | def fused_moe(

FILE: server/text_generation_server/layers/rotary.py
  function _create_inv_freq (line 17) | def _create_inv_freq(dim, base, device):
  function _get_rope_config (line 24) | def _get_rope_config(config):
  class PositionRotaryEmbedding (line 34) | class PositionRotaryEmbedding(nn.Module):
    method __init__ (line 35) | def __init__(self, inv_freq, scaling_factor):
    method forward (line 46) | def forward(
    method static (line 83) | def static(cls, config, dim, base, device):
    method load (line 206) | def load(cls, config, prefix, weights):
    method _update_cos_sin_cache (line 251) | def _update_cos_sin_cache(self, dtype, device, seqlen):
    method get_cos_sin (line 270) | def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: t...
  class SuRotaryEmbedding (line 289) | class SuRotaryEmbedding(PositionRotaryEmbedding):
    method __init__ (line 290) | def __init__(
    method _update_cos_sin_cache (line 309) | def _update_cos_sin_cache(self, dtype, device, seqlen):
  class Phi3LongRoPEScaledRotaryEmbedding (line 336) | class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding):
    method __init__ (line 337) | def __init__(
    method _update_cos_sin_cache (line 362) | def _update_cos_sin_cache(self, dtype, device, seqlen):
  class DynamicPositionRotaryEmbedding (line 393) | class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
    method __init__ (line 394) | def __init__(self, dim, max_position_embeddings, base, device, scaling...
    method _update_cos_sin_cache (line 401) | def _update_cos_sin_cache(self, dtype, device, seqlen):
  function find_correction_dim (line 427) | def find_correction_dim(num_rotations, dim, base=10000, max_position_emb...
  function find_correction_range (line 434) | def find_correction_range(
  function linear_ramp_mask (line 442) | def linear_ramp_mask(min, max, dim):
  function get_mscale (line 451) | def get_mscale(scale: float = 1.0, mscale: float = 1.0):
  class YarnPositionRotaryEmbedding (line 457) | class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
    method __init__ (line 458) | def __init__(
    method _update_cos_sin_cache (line 490) | def _update_cos_sin_cache(self, dtype, device, seqlen):
  function apply_llama3_scaling (line 532) | def apply_llama3_scaling(
  class RotaryPositionEmbeddingMultimodalSections (line 561) | class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding):
    method __init__ (line 562) | def __init__(self, inv_freq: torch.Tensor, scaling_factor: float, sect...
    method _update_cos_sin_cache (line 574) | def _update_cos_sin_cache(
    method get_cos_sin (line 591) | def get_cos_sin(

FILE: server/text_generation_server/layers/speculative.py
  class SpeculativeHead (line 9) | class SpeculativeHead(torch.nn.Module):
    method __init__ (line 10) | def __init__(self, lm_head, speculator):
    method load (line 16) | def load(config, prefix: str, weights):
    method forward (line 44) | def forward(

FILE: server/text_generation_server/layers/tensor_parallel.py
  class LayerConcat (line 11) | class LayerConcat(torch.nn.Module):
    method __init__ (line 17) | def __init__(self, layers: Iterable[torch.nn.Module], dim: int = -1):
    method forward (line 25) | def forward(self, x: torch.Tensor):
  class SuperLayer (line 30) | class SuperLayer(torch.nn.Module):
    method __init__ (line 31) | def __init__(self, linear):
    method forward (line 35) | def forward(self, x):
  class TensorParallelHead (line 39) | class TensorParallelHead(SuperLayer):
    method __init__ (line 40) | def __init__(self, linear, process_group, should_gather: bool):
    method load (line 46) | def load(config, prefix: str, weights):
    method forward (line 75) | def forward(self, input: torch.Tensor) -> torch.Tensor:
  class TensorParallelColumnLinear (line 118) | class TensorParallelColumnLinear(SuperLayer):
    method load_gate_up (line 120) | def load_gate_up(cls, config, prefix: str, weights, bias: bool):
    method load_qkv (line 131) | def load_qkv(
    method load (line 154) | def load(cls, config, prefix: str, weights, bias: bool):
    method load_multi (line 164) | def load_multi(cls, config, prefixes: List[str], weights, bias: bool, ...
  class TensorParallelRowLinear (line 183) | class TensorParallelRowLinear(SuperLayer):
    method __init__ (line 184) | def __init__(self, linear, process_group):
    method load (line 189) | def load(cls, config, prefix: str, weights, bias: bool):
    method forward (line 202) | def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.T...
  class TensorParallelEmbedding (line 212) | class TensorParallelEmbedding(torch.nn.Module):
    method __init__ (line 213) | def __init__(self, prefix: str, weights, reduce=True):
    method forward (line 235) | def forward(self, input: torch.Tensor) -> torch.Tensor:

FILE: server/text_generation_server/models/__init__.py
  class ModelType (line 217) | class ModelType(enum.Enum):
  function get_model (line 425) | def get_model(
  function get_model_with_lora_adapters (line 1802) | def get_model_with_lora_adapters(

FILE: server/text_generation_server/models/bloom.py
  class BloomCausalLMBatch (line 15) | class BloomCausalLMBatch(CausalLMBatch):
    method from_pb (line 17) | def from_pb(
  class BLOOMSharded (line 29) | class BLOOMSharded(CausalLM):
    method batch_type (line 31) | def batch_type(self) -> Type[CausalLMBatch]:
    method forward (line 34) | def forward(

FILE: server/text_generation_server/models/causal_lm.py
  class CausalLMBatch (line 38) | class CausalLMBatch(Batch):
    method to_pb (line 73) | def to_pb(self) -> generate_pb2.CachedBatch:
    method from_pb (line 83) | def from_pb(
    method filter (line 175) | def filter(self, request_ids: List[int]) -> Optional["CausalLMBatch"]:
    method concatenate (line 279) | def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch":
    method __len__ (line 491) | def __len__(self):
  class CausalLMBatchKeysLast (line 496) | class CausalLMBatchKeysLast(CausalLMBatch):
  class CausalLM (line 500) | class CausalLM(Model):
    method __init__ (line 501) | def __init__(
    method fallback (line 584) | def fallback(
    method batch_type (line 659) | def batch_type(self) -> Type[CausalLMBatch]:
    method forward (line 662) | def forward(
    method generate_token (line 686) | def generate_token(

FILE: server/text_generation_server/models/custom_modeling/bloom_modeling.py
  function _make_causal_mask (line 68) | def _make_causal_mask(
  function _expand_mask (line 88) | def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
  function build_alibi_tensor (line 99) | def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int) -> ...
  function dropout_add (line 156) | def dropout_add(
  function _split_heads (line 178) | def _split_heads(
  function _merge_heads (line 210) | def _merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torc...
  class BloomAttention (line 236) | class BloomAttention(nn.Module):
    method __init__ (line 237) | def __init__(self, prefix, config: BloomConfig, weights):
    method compute_attention (line 280) | def compute_attention(
    method forward (line 357) | def forward(
  class BloomMLP (line 435) | class BloomMLP(nn.Module):
    method __init__ (line 436) | def __init__(self, prefix, config: BloomConfig, weights):
    method forward (line 450) | def forward(
  class BloomBlock (line 474) | class BloomBlock(nn.Module):
    method __init__ (line 475) | def __init__(self, layer_id: int, config: BloomConfig, weights):
    method forward (line 500) | def forward(
  class BloomPreTrainedModel (line 556) | class BloomPreTrainedModel(PreTrainedModel):
    method _convert_to_standard_cache (line 562) | def _convert_to_standard_cache(
    method _convert_to_bloom_cache (line 582) | def _convert_to_bloom_cache(
  class BloomModel (line 601) | class BloomModel(BloomPreTrainedModel):
    method __init__ (line 602) | def __init__(self, config: BloomConfig, weights):
    method _prepare_attn_mask (line 635) | def _prepare_attn_mask(
    method set_input_embeddings (line 664) | def set_input_embeddings(self, new_embeddings: torch.Tensor):
    method forward (line 667) | def forward(
  class BloomForCausalLM (line 818) | class BloomForCausalLM(BloomPreTrainedModel):
    method __init__ (line 819) | def __init__(self, prefix: str, config, weights):
    method prepare_inputs_for_generation (line 829) | def prepare_inputs_for_generation(
    method forward (line 860) | def forward(

FILE: server/text_generation_server/models/custom_modeling/clip.py
  class CLIPVisionEmbeddings (line 23) | class CLIPVisionEmbeddings(nn.Module):
    method __init__ (line 24) | def __init__(self, prefix, config: CLIPVisionConfig, weights):
    method forward (line 56) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
  class CLIPTextEmbeddings (line 70) | class CLIPTextEmbeddings(nn.Module):
    method __init__ (line 71) | def __init__(self, config: CLIPTextConfig):
    method forward (line 87) | def forward(
  class CLIPAttention (line 109) | class CLIPAttention(nn.Module):
    method __init__ (line 112) | def __init__(self, prefix, config, weights):
    method _shape (line 142) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 149) | def forward(
  class CLIPMLP (line 234) | class CLIPMLP(nn.Module):
    method __init__ (line 235) | def __init__(self, prefix, config, weights):
    method forward (line 246) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class CLIPEncoderLayer (line 253) | class CLIPEncoderLayer(nn.Module):
    method __init__ (line 254) | def __init__(self, prefix, config: CLIPConfig, weights):
    method forward (line 268) | def forward(
  class CLIPPreTrainedModel (line 299) | class CLIPPreTrainedModel(nn.Module):
  class CLIPEncoder (line 386) | class CLIPEncoder(nn.Module):
    method __init__ (line 395) | def __init__(self, prefix, config: CLIPConfig, weights):
    method forward (line 407) | def forward(
  class CLIPTextTransformer (line 446) | class CLIPTextTransformer(nn.Module):
    method __init__ (line 447) | def __init__(self, prefix: str, config: CLIPTextConfig, weights=None):
    method forward (line 461) | def forward(
  class CLIPTextModel (line 533) | class CLIPTextModel(CLIPPreTrainedModel):
    method __init__ (line 538) | def __init__(self, prefix, config: CLIPTextConfig):
    method forward (line 544) | def forward(
  class CLIPVisionTransformer (line 575) | class CLIPVisionTransformer(nn.Module):
    method __init__ (line 576) | def __init__(self, prefix, config: CLIPVisionConfig, weights):
    method forward (line 591) | def forward(
  class CLIPVisionModel (line 619) | class CLIPVisionModel(CLIPPreTrainedModel):
    method __init__ (line 624) | def __init__(self, config: CLIPVisionConfig):
    method get_input_embeddings (line 630) | def get_input_embeddings(self) -> nn.Module:
    method forward (line 633) | def forward(
  class CLIPModel (line 665) | class CLIPModel(nn.Module):
    method __init__ (line 666) | def __init__(self, prefix, config: CLIPConfig, weights):
    method get_text_features (line 691) | def get_text_features(
    method get_image_features (line 724) | def get_image_features(
    method forward (line 760) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
  class CohereRotary (line 56) | class CohereRotary(PositionRotaryEmbedding):
    method forward (line 57) | def forward(
  class CohereLayerNorm (line 101) | class CohereLayerNorm(nn.Module):
    method __init__ (line 102) | def __init__(self, prefix, weights, eps):
    method forward (line 110) | def forward(self, hidden_states):
  function load_attention (line 156) | def load_attention(config, prefix, weights):
  function _load_gqa (line 169) | def _load_gqa(config, prefix: str, weights):
  class FlashCohereAttention (line 201) | class FlashCohereAttention(torch.nn.Module):
    method __init__ (line 202) | def __init__(
    method forward (line 262) | def forward(
  class CohereMLP (line 334) | class CohereMLP(nn.Module):
    method __init__ (line 335) | def __init__(self, prefix, config, weights):
    method forward (line 366) | def forward(self, hidden_states):
  class FlashCohereLayer (line 374) | class FlashCohereLayer(nn.Module):
    method __init__ (line 375) | def __init__(self, prefix: str, layer_id, config, weights):
    method forward (line 390) | def forward(
  class FlashCohereModel (line 427) | class FlashCohereModel(torch.nn.Module):
    method __init__ (line 428) | def __init__(self, prefix: str, config, weights):
    method forward (line 458) | def forward(
  class FlashCohereForCausalLM (line 498) | class FlashCohereForCausalLM(torch.nn.Module):
    method __init__ (line 499) | def __init__(self, prefix: str, config, weights):
    method forward (line 522) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
  class DbrxAttentionConfig (line 55) | class DbrxAttentionConfig(PretrainedConfig):
    method __init__ (line 56) | def __init__(
  class DbrxFFNConfig (line 77) | class DbrxFFNConfig(PretrainedConfig):
    method __init__ (line 78) | def __init__(
  class DbrxConfig (line 112) | class DbrxConfig(PretrainedConfig):
    method __init__ (line 119) | def __init__(
    method num_key_value_heads (line 172) | def num_key_value_heads(self):
  function promote_scalar (line 178) | def promote_scalar(x: torch.Tensor) -> torch.Tensor:
  function load_attention (line 182) | def load_attention(config, prefix, weights):
  function _load_experts (line 193) | def _load_experts(config, prefix, weights):
  function _load_experts_quantized (line 224) | def _load_experts_quantized(config, prefix, weights, cls):
  class DbrxAttention (line 264) | class DbrxAttention(torch.nn.Module):
    method __init__ (line 265) | def __init__(
    method forward (line 310) | def forward(
  class DbrxNormAttentionNorm (line 374) | class DbrxNormAttentionNorm(nn.Module):
    method __init__ (line 375) | def __init__(
    method forward (line 394) | def forward(
  function select_experts (line 429) | def select_experts(
  function round_up (line 447) | def round_up(x: torch.Tensor, value: int):
  class BlockSparseMoE (line 451) | class BlockSparseMoE(nn.Module):
    method __init__ (line 452) | def __init__(self, prefix, config: DbrxConfig, weights):
    method forward (line 501) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class DenseMoE (line 533) | class DenseMoE(nn.Module):
    method __init__ (line 534) | def __init__(self, prefix, config: DbrxConfig, weights):
    method forward (line 584) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class DbrxLayer (line 631) | class DbrxLayer(nn.Module):
    method __init__ (line 632) | def __init__(self, prefix: str, layer_id, config, weights):
    method forward (line 643) | def forward(
  class DbrxModel (line 675) | class DbrxModel(torch.nn.Module):
    method __init__ (line 676) | def __init__(self, prefix: str, config, weights):
    method forward (line 702) | def forward(
  class FlashDbrxForCausalLM (line 741) | class FlashDbrxForCausalLM(torch.nn.Module):
    method __init__ (line 742) | def __init__(self, prefix: str, config, weights):
    method forward (line 757) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
  class DeepseekV2Config (line 51) | class DeepseekV2Config(PretrainedConfig):
    method __init__ (line 52) | def __init__(
  class DeepseekV2Attention (line 157) | class DeepseekV2Attention(torch.nn.Module):
    method __init__ (line 158) | def __init__(
    method forward (line 258) | def forward(
  class DeepseekV2MLP (line 366) | class DeepseekV2MLP(nn.Module):
    method __init__ (line 367) | def __init__(self, prefix: str, config, weights, intermediate_size: int):
    method forward (line 397) | def forward(self, hidden_states: torch.Tensor, reduce: bool = True):
  class DeepseekV2MoE (line 421) | class DeepseekV2MoE(nn.Module):
    method __init__ (line 422) | def __init__(
    method forward (line 464) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class DeepseekV2Layer (line 484) | class DeepseekV2Layer(nn.Module):
    method __init__ (line 485) | def __init__(self, prefix, layer_id, config, weights):
    method forward (line 523) | def forward(
  class DeepseekV2Model (line 561) | class DeepseekV2Model(torch.nn.Module):
    method __init__ (line 562) | def __init__(self, prefix: str, config, weights: Weights):
    method forward (line 588) | def forward(
  class FlashDeepseekV2ForCausalLM (line 627) | class FlashDeepseekV2ForCausalLM(torch.nn.Module):
    method __init__ (line 628) | def __init__(self, prefix: str, config, weights: Weights):
    method forward (line 640) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py
  class DeepseekV3Config (line 51) | class DeepseekV3Config(PretrainedConfig):
    method __init__ (line 52) | def __init__(
  class DeepseekV3Attention (line 157) | class DeepseekV3Attention(torch.nn.Module):
    method __init__ (line 158) | def __init__(
    method forward (line 258) | def forward(
  class DeepseekV3MLP (line 366) | class DeepseekV3MLP(nn.Module):
    method __init__ (line 367) | def __init__(self, prefix: str, config, weights, intermediate_size: int):
    method forward (line 397) | def forward(self, hidden_states: torch.Tensor, reduce: bool = True):
  class DeepseekV3MoE (line 421) | class DeepseekV3MoE(nn.Module):
    method __init__ (line 422) | def __init__(
    method forward (line 473) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class DeepseekV3Layer (line 493) | class DeepseekV3Layer(nn.Module):
    method __init__ (line 494) | def __init__(self, prefix, layer_id, config, weights):
    method forward (line 532) | def forward(
  class DeepseekV3Model (line 570) | class DeepseekV3Model(torch.nn.Module):
    method __init__ (line 571) | def __init__(self, prefix: str, config, weights: Weights):
    method forward (line 597) | def forward(
  class FlashDeepseekV3ForCausalLM (line 636) | class FlashDeepseekV3ForCausalLM(torch.nn.Module):
    method __init__ (line 637) | def __init__(self, prefix: str, config, weights: Weights):
    method forward (line 649) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
  class Gemma2Config (line 50) | class Gemma2Config(PretrainedConfig):
    method __init__ (line 51) | def __init__(
  class Gemma2FastRMSNorm (line 106) | class Gemma2FastRMSNorm(FastRMSNorm):
    method load (line 108) | def load(cls, prefix: str, weights, eps=1e-6):
    method forward (line 118) | def forward(self, hidden_states, residual=None):
  function load_attention (line 129) | def load_attention(config, prefix: str, weights):
  function _load_gqa (line 142) | def _load_gqa(config, prefix: str, weights):
  class FlashGemma2Attention (line 164) | class FlashGemma2Attention(torch.nn.Module):
    method __init__ (line 165) | def __init__(
    method forward (line 230) | def forward(
  class Gemma2MLP (line 298) | class Gemma2MLP(nn.Module):
    method __init__ (line 299) | def __init__(self, prefix, config, weights, layer_id):
    method forward (line 348) | def forward(self, hidden_states, adapter_data):
  class FlashGemma2Layer (line 356) | class FlashGemma2Layer(nn.Module):
    method __init__ (line 357) | def __init__(
    method forward (line 392) | def forward(
  class FlashGemma2Model (line 434) | class FlashGemma2Model(torch.nn.Module):
    method __init__ (line 435) | def __init__(self, prefix: str, config, weights, causal: bool):
    method forward (line 462) | def forward(
  class FlashGemma2ForCausalLM (line 503) | class FlashGemma2ForCausalLM(torch.nn.Module):
    method __init__ (line 504) | def __init__(self, prefix: str, config, weights, *, causal: bool = True):
    method forward (line 533) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
  class Gemma3FastRMSNorm (line 62) | class Gemma3FastRMSNorm(FastRMSNorm):
    method load (line 64) | def load(cls, prefix: str, weights, eps=1e-6):
    method forward (line 74) | def forward(self, hidden_states, residual=None):
  function load_attention (line 85) | def load_attention(config, prefix: str, weights):
  function _load_gqa (line 98) | def _load_gqa(config, prefix: str, weights):
  class FlashGemma3Attention (line 120) | class FlashGemma3Attention(torch.nn.Module):
    method __init__ (line 121) | def __init__(
    method forward (line 203) | def forward(
  class Gemma3MLP (line 329) | class Gemma3MLP(nn.Module):
    method __init__ (line 330) | def __init__(self, prefix, config, weights, layer_id):
    method forward (line 379) | def forward(self, hidden_states, adapter_data):
  class FlashGemma3Layer (line 387) | class FlashGemma3Layer(nn.Module):
    method __init__ (line 388) | def __init__(
    method forward (line 423) | def forward(
  class FlashGemma3Model (line 467) | class FlashGemma3Model(torch.nn.Module):
    method __init__ (line 468) | def __init__(self, prefix: str, config, weights, causal: bool):
    method forward (line 496) | def forward(
  class FlashGemma3ForCausalLM (line 545) | class FlashGemma3ForCausalLM(torch.nn.Module):
    method __init__ (line 546) | def __init__(self, prefix: str, config, weights, *, causal: bool = True):
    method forward (line 576) | def forward(
  class Gemma3MultimodalInputProjection (line 610) | class Gemma3MultimodalInputProjection(torch.nn.Module):
    method __init__ (line 611) | def __init__(self, prefix, config, weights):
    method forward (line 633) | def forward(self, vision_outputs: torch.Tensor):
  class Gemma3ForConditionalGeneration (line 654) | class Gemma3ForConditionalGeneration(nn.Module):
    method __init__ (line 655) | def __init__(self, prefix, config, weights):
    method get_attention_mask (line 705) | def get_attention_mask(
    method get_vision_embeds (line 766) | def get_vision_embeds(
    method get_inputs_embeds (line 782) | def get_inputs_embeds(
    method forward (line 799) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
  class GemmaConfig (line 48) | class GemmaConfig(PretrainedConfig):
    method __init__ (line 49) | def __init__(
  class GemmaFastRMSNorm (line 104) | class GemmaFastRMSNorm(FastRMSNorm):
    method load (line 106) | def load(cls, prefix: str, weights, eps=1e-6):
    method forward (line 116) | def forward(self, hidden_states, residual=None):
  function load_attention (line 127) | def load_attention(config, prefix: str, weights):
  function _load_gqa (line 140) | def _load_gqa(config, prefix: str, weights):
  class FlashGemmaAttention (line 162) | class FlashGemmaAttention(torch.nn.Module):
    method __init__ (line 163) | def __init__(self, prefix: str, config, weights, causal: bool):
    method forward (line 202) | def forward(
  class GemmaMLP (line 264) | class GemmaMLP(nn.Module):
    method __init__ (line 265) | def __init__(self, prefix: str, config, weights):
    method forward (line 296) | def forward(self, hidden_states):
  class FlashGemmaLayer (line 302) | class FlashGemmaLayer(nn.Module):
    method __init__ (line 303) | def __init__(self, prefix: str, config, weights, causal: bool):
    method forward (line 319) | def forward(
  class FlashGemmaModel (line 357) | class FlashGemmaModel(torch.nn.Module):
    method __init__ (line 358) | def __init__(self, prefix: str, config, weights, causal: bool):
    method forward (line 383) | def forward(
  class FlashGemmaForCausalLM (line 422) | class FlashGemmaForCausalLM(torch.nn.Module):
    method __init__ (line 423) | def __init__(self, prefix: str, config, weights, *, causal: bool = True):
    method forward (line 450) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py
  function load_qkv (line 42) | def load_qkv(config, prefix: str, weights, head_size, num_heads):
  function _load_qkv_gptq (line 57) | def _load_qkv_gptq(config, prefix: str, weights):
  function _load_qkv (line 88) | def _load_qkv(config, prefix: str, weights, head_size, num_heads):
  function load_row (line 135) | def load_row(config, prefix: str, weights, bias: bool):
  function load_col (line 154) | def load_col(config, prefix: str, weights, bias: bool):
  class FlashGPT2Attention (line 169) | class FlashGPT2Attention(torch.nn.Module):
    method __init__ (line 170) | def __init__(
    method forward (line 210) | def forward(
  class GPT2MLP (line 263) | class GPT2MLP(nn.Module):
    method __init__ (line 264) | def __init__(self, prefix: str, config, weights):
    method forward (line 294) | def forward(self, hidden_states):
  class FlashGPT2Layer (line 300) | class FlashGPT2Layer(nn.Module):
    method __init__ (line 301) | def __init__(self, prefix: str, config, weights):
    method forward (line 317) | def forward(
  class FlashGPT2Model (line 352) | class FlashGPT2Model(torch.nn.Module):
    method __init__ (line 353) | def __init__(self, prefix: str, config, weights):
    method forward (line 383) | def forward(
  class FlashGPT2ForCausalLM (line 416) | class FlashGPT2ForCausalLM(torch.nn.Module):
    method __init__ (line 417) | def __init__(self, prefix: str, config, weights):
    method forward (line 436) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py
  function load_attention (line 49) | def load_attention(config, prefix: str, weights):
  function load_row (line 59) | def load_row(config, prefix: str, weights, bias: bool):
  class GPTJRotary (line 72) | class GPTJRotary(PositionRotaryEmbedding):
    method forward (line 73) | def forward(
  class FlashGPTJAttention (line 117) | class FlashGPTJAttention(torch.nn.Module):
    method __init__ (line 118) | def __init__(
    method forward (line 164) | def forward(
  class GPTJMLP (line 227) | class GPTJMLP(nn.Module):
    method __init__ (line 228) | def __init__(self, prefix: str, config, weights):
    method forward (line 253) | def forward(self, hidden_states):
  class FlashGPTJLayer (line 259) | class FlashGPTJLayer(nn.Module):
    method __init__ (line 260) | def __init__(self, prefix: str, config, weights):
    method forward (line 271) | def forward(
  class FlashGPTJModel (line 303) | class FlashGPTJModel(torch.nn.Module):
    method __init__ (line 304) | def __init__(self, prefix: str, config, weights):
    method forward (line 333) | def forward(
  class FlashGPTJForCausalLM (line 373) | class FlashGPTJForCausalLM(torch.nn.Module):
    method __init__ (line 374) | def __init__(self, prefix: str, config, weights):
    method forward (line 387) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
  function load_attention (line 72) | def load_attention(config, prefix: str, weights, layer_id):
  function no_fp8 (line 125) | def no_fp8(weights: Weights):
  class FlashLlamaAttention (line 137) | class FlashLlamaAttention(torch.nn.Module):
    method __init__ (line 138) | def __init__(
    method forward (line 206) | def forward(
  class Phi3MoE (line 270) | class Phi3MoE(nn.Module):
    method __init__ (line 271) | def __init__(
    method forward (line 294) | def forward(self, x, adapter_data) -> torch.Tensor:
  class LlamaMLP (line 306) | class LlamaMLP(nn.Module):
    method __init__ (line 307) | def __init__(self, prefix, config, weights, index):
    method forward (line 379) | def forward(self, hidden_states, adapter_data):
  class FlashLlamaLayer (line 407) | class FlashLlamaLayer(nn.Module):
    method __init__ (line 408) | def __init__(self, index, prefix, config, weights):
    method forward (line 459) | def forward(
  class FlashLlamaModel (line 503) | class FlashLlamaModel(torch.nn.Module):
    method __init__ (line 504) | def __init__(self, prefix, config, weights):
    method forward (line 572) | def forward(
  class FlashLlamaForCausalLM (line 617) | class FlashLlamaForCausalLM(torch.nn.Module):
    method __init__ (line 618) | def __init__(self, prefix: str, config, weights, name=None):
    method forward (line 663) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
  class MistralConfig (line 57) | class MistralConfig(PretrainedConfig):
    method __init__ (line 60) | def __init__(
  class MistralAttention (line 111) | class MistralAttention(torch.nn.Module):
    method __init__ (line 112) | def __init__(self, prefix: str, config, weights, layer_id):
    method forward (line 180) | def forward(
  class MistralMLP (line 252) | class MistralMLP(nn.Module):
    method __init__ (line 253) | def __init__(self, prefix: str, config, weights, layer_id):
    method forward (line 307) | def forward(self, hidden_states, adapter_data):
  class MistralLayer (line 333) | class MistralLayer(nn.Module):
    method __init__ (line 334) | def __init__(self, prefix: str, config, weights, layer_id):
    method forward (line 355) | def forward(
  class MistralModel (line 397) | class MistralModel(torch.nn.Module):
    method __init__ (line 398) | def __init__(self, prefix: str, config, weights):
    method forward (line 425) | def forward(
  class FlashMistralForCausalLM (line 467) | class FlashMistralForCausalLM(torch.nn.Module):
    method __init__ (line 468) | def __init__(self, prefix: str, config, weights, name=None):
    method forward (line 500) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
  class MixtralConfig (line 48) | class MixtralConfig(PretrainedConfig):
    method __init__ (line 51) | def __init__(
  function promote_scalar (line 106) | def promote_scalar(x: torch.Tensor) -> torch.Tensor:
  function load_attention (line 110) | def load_attention(config, prefix: str, weights):
  function _load_gqa (line 123) | def _load_gqa(config, prefix: str, weights):
  function _load_experts (line 146) | def _load_experts(config, prefix: str, mat, weights):
  class MixtralAttention (line 182) | class MixtralAttention(torch.nn.Module):
    method __init__ (line 183) | def __init__(
    method forward (line 230) | def forward(
  function select_experts (line 300) | def select_experts(gate_logits: torch.Tensor, top_k: int):
  function round_up (line 313) | def round_up(x: torch.Tensor, value: int):
  class MixtralMoE (line 317) | class MixtralMoE(nn.Module):
    method __init__ (line 318) | def __init__(
    method forward (line 342) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class MixtralLayer (line 354) | class MixtralLayer(nn.Module):
    method __init__ (line 355) | def __init__(self, prefix: str, layer_id, config, weights):
    method forward (line 379) | def forward(
  class MixtralModel (line 419) | class MixtralModel(torch.nn.Module):
    method __init__ (line 420) | def __init__(self, prefix: str, config, weights):
    method forward (line 451) | def forward(
  class FlashMixtralForCausalLM (line 493) | class FlashMixtralForCausalLM(torch.nn.Module):
    method __init__ (line 494) | def __init__(self, prefix: str, config, weights):
    method forward (line 510) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
  class GPTNeoXConfig (line 51) | class GPTNeoXConfig(TransformersGPTNeoXConfig):
  function load_row (line 57) | def load_row(config, prefix: str, weights, bias: bool):
  function load_qkv (line 73) | def load_qkv(config, prefix: str, weights, num_heads, head_size, hidden_...
  class FlashNeoxAttention (line 98) | class FlashNeoxAttention(torch.nn.Module):
    method __init__ (line 99) | def __init__(self, config, prefix, weights):
    method forward (line 142) | def forward(
  class FlashMLP (line 204) | class FlashMLP(nn.Module):
    method __init__ (line 205) | def __init__(self, config, prefix, weights):
    method forward (line 226) | def forward(self, hidden_states):
  class FlashNeoXLayer (line 233) | class FlashNeoXLayer(nn.Module):
    method __init__ (line 234) | def __init__(self, layer_id, config, weights):
    method forward (line 257) | def forward(
  class FlashGPTNeoXPreTrainedModel (line 318) | class FlashGPTNeoXPreTrainedModel(PreTrainedModel):
  class FlashGPTNeoXModel (line 325) | class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
    method __init__ (line 326) | def __init__(self, prefix: str, config, weights):
    method forward (line 351) | def forward(
  class FlashGPTNeoXForCausalLM (line 390) | class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel):
    method __init__ (line 391) | def __init__(self, prefix, config, weights):
    method forward (line 405) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
  class PaliGemmaForConditionalGeneration (line 29) | class PaliGemmaForConditionalGeneration(nn.Module):
    method __init__ (line 30) | def __init__(self, prefix, config, weights):
    method get_vision_embeds (line 67) | def get_vision_embeds(
    method get_inputs_embeds (line 83) | def get_inputs_embeds(
    method forward (line 96) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
  class PhiConfig (line 30) | class PhiConfig(PretrainedConfig):
    method __init__ (line 31) | def __init__(
  function load_attention (line 70) | def load_attention(config, prefix, weights):
  function _load_gqa (line 83) | def _load_gqa(config, prefix: str, weights):
  class FlashPhiAttention (line 107) | class FlashPhiAttention(torch.nn.Module):
    method __init__ (line 108) | def __init__(
    method forward (line 155) | def forward(
  class PhiMLP (line 226) | class PhiMLP(nn.Module):
    method __init__ (line 227) | def __init__(self, prefix, config, weights):
    method forward (line 255) | def forward(self, hidden_states):
  class FlashPhiLayer (line 261) | class FlashPhiLayer(nn.Module):
    method __init__ (line 262) | def __init__(self, prefix: str, layer_id, config, weights):
    method forward (line 276) | def forward(
  class FlashPhiModel (line 310) | class FlashPhiModel(torch.nn.Module):
    method __init__ (line 311) | def __init__(self, prefix: str, config, weights):
    method forward (line 343) | def forward(
  class FlashPhiForCausalLM (line 382) | class FlashPhiForCausalLM(torch.nn.Module):
    method __init__ (line 383) | def __init__(self, prefix: str, config, weights):
    method forward (line 398) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py
  class PhiMoEConfig (line 30) | class PhiMoEConfig(PretrainedConfig):
    method __init__ (line 121) | def __init__(
    method _rope_scaling_validation (line 191) | def _rope_scaling_validation(self):

FILE: server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
  function load_attention (line 28) | def load_attention(config, prefix, weights, layer_id):
  function _load_gqa (line 55) | def _load_gqa(config, prefix: str, weights):
  class Qwen2Attention (line 68) | class Qwen2Attention(torch.nn.Module):
    method __init__ (line 69) | def __init__(
    method forward (line 124) | def forward(
  class Qwen2MLP (line 196) | class Qwen2MLP(nn.Module):
    method __init__ (line 197) | def __init__(self, prefix, config, weights, index):
    method forward (line 246) | def forward(self, hidden_states, adapter_data):
  class Qwen2Layer (line 254) | class Qwen2Layer(nn.Module):
    method __init__ (line 255) | def __init__(self, prefix, layer_id, config, weights):
    method forward (line 273) | def forward(
  class Qwen2Model (line 313) | class Qwen2Model(torch.nn.Module):
    method __init__ (line 314) | def __init__(self, prefix: str, config, weights):
    method forward (line 343) | def forward(
  class Qwen2ForCausalLM (line 387) | class Qwen2ForCausalLM(torch.nn.Module):
    method __init__ (line 388) | def __init__(self, prefix: str, config, weights):
    method forward (line 416) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
  function load_row (line 25) | def load_row(config, prefix: str, weights, bias: bool):
  class RWConfig (line 41) | class RWConfig(PretrainedConfig):
    method __init__ (line 48) | def __init__(
  class FlashRWAttention (line 128) | class FlashRWAttention(torch.nn.Module):
    method __init__ (line 129) | def __init__(
    method forward (line 177) | def forward(
  class FlashRWLargeAttention (line 240) | class FlashRWLargeAttention(torch.nn.Module):
    method __init__ (line 241) | def __init__(
    method forward (line 298) | def forward(
  class FlashMLP (line 362) | class FlashMLP(nn.Module):
    method __init__ (line 363) | def __init__(self, config, prefix: str, weights):
    method forward (line 374) | def forward(self, hidden_states):
  class FlashRWLayer (line 381) | class FlashRWLayer(nn.Module):
    method __init__ (line 382) | def __init__(
    method forward (line 429) | def forward(
  class FlashRWLayerNorm (line 489) | class FlashRWLayerNorm(nn.Module):
    method __init__ (line 490) | def __init__(self, config, prefix: str, weights):
    method forward (line 520) | def forward(
  class FlashRWLargeLayer (line 534) | class FlashRWLargeLayer(nn.Module):
    method __init__ (line 535) | def __init__(self, layer_id, prefix: str, config, weights):
    method forward (line 552) | def forward(
  class FlashRWPreTrainedModel (line 592) | class FlashRWPreTrainedModel(PreTrainedModel):
  class FlashRWModel (line 596) | class FlashRWModel(FlashRWPreTrainedModel):
    method __init__ (line 597) | def __init__(self, prefix: str, config, weights):
    method forward (line 630) | def forward(
  class FlashRWForCausalLM (line 669) | class FlashRWForCausalLM(FlashRWPreTrainedModel):
    method __init__ (line 670) | def __init__(self, prefix: str, config, weights):
    method forward (line 682) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
  function load_multi_mqa (line 27) | def load_multi_mqa(
  function _load_multi_mqa_gptq (line 44) | def _load_multi_mqa_gptq(
  function _load_multi_mqa (line 131) | def _load_multi_mqa(
  function load_col (line 201) | def load_col(config, prefix: str, weights, bias: bool):
  function load_row (line 214) | def load_row(config, prefix: str, weights, bias: bool):
  class FlashMQAttention (line 230) | class FlashMQAttention(torch.nn.Module):
    method __init__ (line 231) | def __init__(self, prefix, config, weights):
    method forward (line 266) | def forward(
  class MLP (line 323) | class MLP(nn.Module):
    method __init__ (line 324) | def __init__(self, prefix, config, weights):
    method forward (line 345) | def forward(self, hidden_states):
  class Block (line 352) | class Block(nn.Module):
    method __init__ (line 353) | def __init__(self, prefix: str, layer_id, config, weights):
    method forward (line 373) | def forward(
  class FlashSantacoderModel (line 402) | class FlashSantacoderModel(nn.Module):
    method __init__ (line 403) | def __init__(self, prefix: str, config, weights):
    method forward (line 437) | def forward(
  class FlashSantacoderForCausalLM (line 471) | class FlashSantacoderForCausalLM(nn.Module):
    method __init__ (line 472) | def __init__(self, prefix, config, weights):
    method forward (line 486) | def forward(

FILE: server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
  class Starcoder2Config (line 54) | class Starcoder2Config(PretrainedConfig):
    method __init__ (line 57) | def __init__(
  function load_attention (line 114) | def load_attention(config, prefix, weights, layer_id):
  function _load_gqa (line 141) | def _load_gqa(config, prefix: str, weights):
  class Starcoder2Attention (line 173) | class Starcoder2Attention(torch.nn.Module):
    method __init__ (line 174) | def __init__(
    method forward (line 230) | def forward(
  class Starcoder2MLP (line 302) | class Starcoder2MLP(nn.Module):
    method __init__ (line 303) | def __init__(self, prefix, config, weights, index):
    method forward (line 345) | def forward(self, hidden_states, adapter_data):
  class Starcoder2GatedMLP (line 351) | class Starcoder2GatedMLP(nn.Module):
    method __init__ (line 352) | def __init__(self, index, prefix, config, weights):
    method forward (line 401) | def forward(self, hidden_states, adapter_data):
  class Starcoder2Layer (line 420) | class Starcoder2Layer(nn.Module):
    method __init__ (line 421) | def __init__(self, layer_id, config, weights):
    method forward (line 443) | def forward(
  class Starcoder2Model (line 485) | class Starcoder2Model(torch.nn.Module):
    method __init__ (line 486) | def __init__(self, prefix, config, weights):
    method forward (line 515) | def forward(
  class FlashStarcoder2ForCausalLM (line 559) | class FlashStarcoder2ForCausalLM(torch.nn.Module):
    method __init__ (line 560) | def __init__(self, prefix, config, weights):
    method forward (line 589) | def forward(

FILE: server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py
  class Gemma3TextConfig (line 32) | class Gemma3TextConfig(PretrainedConfig):
    method __init__ (line 154) | def __init__(
  class Gemma3Config (line 220) | class Gemma3Config(PretrainedConfig):
    method __init__ (line 274) | def __init__(

FILE: server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
  class Gemma3ImageProcessor (line 63) | class Gemma3ImageProcessor(BaseImageProcessor):
    method __init__ (line 99) | def __init__(
    method pan_and_scan (line 135) | def pan_and_scan(
    method _process_images_for_pas (line 226) | def _process_images_for_pas(
    method preprocess (line 252) | def preprocess(

FILE: server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py
  class Gemma3ImagesKwargs (line 38) | class Gemma3ImagesKwargs(ImagesKwargs):
  class Gemma3ProcessorKwargs (line 46) | class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
  class Gemma3Processor (line 60) | class Gemma3Processor(ProcessorMixin):
    method __init__ (line 67) | def __init__(
    method __call__ (line 105) | def __call__(
    method batch_decode (line 181) | def batch_decode(self, *args, **kwargs):
    method decode (line 189) | def decode(self, *args, **kwargs):
    method model_input_names (line 198) | def model_input_names(self):

FILE: server/text_generation_server/models/custom_modeling/gemma3/utils.py
  function is_valid_list_of_images (line 22) | def is_valid_list_of_images(images: List):
  function make_nested_list_of_images (line 26) | def make_nested_list_of_images(

FILE: server/text_generation_server/models/custom_modeling/idefics2.py
  function repeat_kv (line 39) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Idefics2VisionEmbeddings (line 53) | class Idefics2VisionEmbeddings(nn.Module):
    method __init__ (line 64) | def __init__(self, prefix, config, weights):
    method forward (line 91) | def forward(
  class Idefics2VisionAttention (line 134) | class Idefics2VisionAttention(nn.Module):
    method __init__ (line 135) | def __init__(self, prefix, config, weights):
    method forward (line 164) | def forward(
  class Idefics2VisionMLP (line 232) | class Idefics2VisionMLP(nn.Module):
    method __init__ (line 233) | def __init__(self, prefix, config, weights):
    method forward (line 244) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Idefics2EncoderLayer (line 251) | class Idefics2EncoderLayer(nn.Module):
    method __init__ (line 252) | def __init__(self, prefix, config, weights):
    method forward (line 269) | def forward(
  class Idefics2Encoder (line 291) | class Idefics2Encoder(nn.Module):
    method __init__ (line 292) | def __init__(self, prefix, config, weights):
    method forward (line 305) | def forward(
  class Idefics2VisionTransformer (line 319) | class Idefics2VisionTransformer(nn.Module):
    method __init__ (line 320) | def __init__(self, prefix, config, weights):
    method forward (line 335) | def forward(
  class Idefics2MLP (line 380) | class Idefics2MLP(nn.Module):
    method __init__ (line 381) | def __init__(self, prefix, config, weights):
    method forward (line 408) | def forward(self, hidden_states):
  class Idefics2RMSNorm (line 418) | class Idefics2RMSNorm(nn.Module):
    method __init__ (line 419) | def __init__(self, prefix, weights, eps):
    method forward (line 429) | def forward(self, hidden_states):
  class Idefics2PerceiverAttention (line 437) | class Idefics2PerceiverAttention(nn.Module):
    method __init__ (line 438) | def __init__(self, prefix, config, weights):
    method forward (line 472) | def forward(
  class Idefics2PerceiverLayer (line 544) | class Idefics2PerceiverLayer(nn.Module):
    method __init__ (line 545) | def __init__(self, prefix, config, weights):
    method forward (line 572) | def forward(
  class Idefics2PerceiverResampler (line 605) | class Idefics2PerceiverResampler(nn.Module):
    method __init__ (line 606) | def __init__(self, prefix, config, weights) -> None:
    method forward (line 632) | def forward(
  class Idefics2Connector (line 664) | class Idefics2Connector(nn.Module):
    method __init__ (line 665) | def __init__(self, prefix, config, weights):
    method forward (line 674) | def forward(self, image_hidden_states, attention_mask):
  class Idefics2ForConditionalGeneration (line 682) | class Idefics2ForConditionalGeneration(nn.Module):
    method __init__ (line 683) | def __init__(self, prefix, config, weights):
    method _merge_input_ids_with_image_features (line 723) | def _merge_input_ids_with_image_features(
    method get_vision_embeds (line 736) | def get_vision_embeds(
    method get_inputs_embeds (line 805) | def get_inputs_embeds(
    method forward (line 820) | def forward(

FILE: server/text_generation_server/models/custom_modeling/idefics3.py
  function repeat_kv (line 38) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Idefics3VisionEmbeddings (line 52) | class Idefics3VisionEmbeddings(nn.Module):
    method __init__ (line 63) | def __init__(self, prefix, config, weights):
    method forward (line 90) | def forward(
  class Idefics3VisionAttention (line 133) | class Idefics3VisionAttention(nn.Module):
    method __init__ (line 134) | def __init__(self, prefix, config, weights):
    method forward (line 163) | def forward(
  class Idefics3VisionMLP (line 231) | class Idefics3VisionMLP(nn.Module):
    method __init__ (line 232) | def __init__(self, prefix, config, weights):
    method forward (line 243) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Idefics3EncoderLayer (line 250) | class Idefics3EncoderLayer(nn.Module):
    method __init__ (line 251) | def __init__(self, prefix, config, weights):
    method forward (line 268) | def forward(
  class Idefics3Encoder (line 290) | class Idefics3Encoder(nn.Module):
    method __init__ (line 291) | def __init__(self, prefix, config, weights):
    method forward (line 304) | def forward(
  class Idefics3VisionTransformer (line 318) | class Idefics3VisionTransformer(nn.Module):
    method __init__ (line 319) | def __init__(self, prefix, config, weights):
    method forward (line 334) | def forward(
  class Idefics3SimpleMLP (line 379) | class Idefics3SimpleMLP(nn.Module):
    method __init__ (line 380) | def __init__(self, prefix, config, weights):
    method forward (line 391) | def forward(self, x):
  class Idefics3Connector (line 395) | class Idefics3Connector(nn.Module):
    method __init__ (line 396) | def __init__(self, prefix, config, weights):
    method pixel_shuffle (line 401) | def pixel_shuffle(self, x, scale_factor=2):
    method forward (line 417) | def forward(self, image_hidden_states):
  class Idefics3ForConditionalGeneration (line 423) | class Idefics3ForConditionalGeneration(nn.Module):
    method __init__ (line 424) | def __init__(self, prefix, config, weights):
    method _merge_input_ids_with_image_features (line 466) | def _merge_input_ids_with_image_features(
    method get_vision_embeds (line 479) | def get_vision_embeds(
    method get_inputs_embeds (line 547) | def get_inputs_embeds(
    method forward (line 562) | def forward(

FILE: server/text_generation_server/models/custom_modeling/idefics_config.py
  class IdeficsVisionConfig (line 31) | class IdeficsVisionConfig(PretrainedConfig):
    method __init__ (line 75) | def __init__(
  class IdeficsPerceiverConfig (line 107) | class IdeficsPerceiverConfig(PretrainedConfig):
    method __init__ (line 132) | def __init__(
  class IdeficsConfig (line 152) | class IdeficsConfig(PretrainedConfig):
    method __init__ (line 228) | def __init__(
    method to_dict (line 314) | def to_dict(self):

FILE: server/text_generation_server/models/custom_modeling/idefics_image_processing.py
  function convert_to_rgb (line 48) | def convert_to_rgb(image):
  class IdeficsImageProcessor (line 61) | class IdeficsImageProcessor(BaseImageProcessor):
    method __init__ (line 81) | def __init__(
    method preprocess (line 96) | def preprocess(
    method fetch_images (line 184) | def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
    method rescale (line 226) | def rescale(
    method normalize (line 260) | def normalize(

FILE: server/text_generation_server/models/custom_modeling/idefics_modeling.py
  class BaseModelOutputWithPastImage (line 61) | class BaseModelOutputWithPastImage(BaseModelOutputWithPast):
  class CausalLMOutputWithPastImage (line 66) | class CausalLMOutputWithPastImage(CausalLMOutputWithPast):
  function expand_inputs_for_generation (line 81) | def expand_inputs_for_generation(
  function update_model_kwargs_for_generation (line 129) | def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder...
  function prepare_inputs_for_generation (line 166) | def prepare_inputs_for_generation(input_ids, past=None, **kwargs):
  function freeze_model (line 201) | def freeze_model(model, module_exceptions=[]):
  class IdeficsDecoupledPartialTPEmbedding (line 220) | class IdeficsDecoupledPartialTPEmbedding(nn.Module):
    method __init__ (line 221) | def __init__(
    method forward (line 235) | def forward(self, input_ids):
  class IdeficsDecoupledTensorParallelLinear (line 254) | class IdeficsDecoupledTensorParallelLinear(nn.Module):
    method __init__ (line 263) | def __init__(
    method forward (line 277) | def forward(self, input: torch.Tensor) -> torch.Tensor:
    method extra_repr (line 284) | def extra_repr(self) -> str:
  function _make_causal_mask (line 296) | def _make_causal_mask(
  function _expand_mask (line 326) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
  class IdeficsRMSNorm (line 342) | class IdeficsRMSNorm(nn.Module):
    method __init__ (line 343) | def __init__(self, prefix, weights, eps=1e-6):
    method forward (line 353) | def forward(self, hidden_states, residual=None):
  class IdeficsMLP (line 445) | class IdeficsMLP(nn.Module):
    method __init__ (line 446) | def __init__(
    method forward (line 468) | def forward(self, hidden_states):
  class IdeficsAttention (line 478) | class IdeficsAttention(nn.Module):
    method __init__ (line 481) | def __init__(
    method _shape (line 553) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 560) | def forward(
  class IdeficsDecoderLayer (line 687) | class IdeficsDecoderLayer(nn.Module):
    method __init__ (line 688) | def __init__(self, layer_id: int, config: IdeficsConfig, weights):
    method forward (line 715) | def forward(
  class IdeficsGatedCrossAttentionLayer (line 774) | class IdeficsGatedCrossAttentionLayer(nn.Module):
    method __init__ (line 775) | def __init__(self, layer_id, config: IdeficsConfig, weights):
    method forward (line 813) | def forward(
  class IdeficsPreTrainedModel (line 908) | class IdeficsPreTrainedModel(PreTrainedModel):
  class IdeficsModel (line 999) | class IdeficsModel(IdeficsPreTrainedModel):
    method __init__ (line 1007) | def __init__(self, config: IdeficsConfig, weights):
    method _prepare_decoder_attention_mask (line 1091) | def _prepare_decoder_attention_mask(
    method forward (line 1119) | def forward(
  class IdeficsForVisionText2Text (line 1417) | class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
    method __init__ (line 1418) | def __init__(
    method forward (line 1434) | def forward(
    method prepare_inputs_for_generation (line 1525) | def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
    method _expand_inputs_for_generation (line 1533) | def _expand_inputs_for_generation(
    method _update_model_kwargs_for_generation (line 1540) | def _update_model_kwargs_for_generation(
    method _reorder_cache (line 1548) | def _reorder_cache(past, beam_idx):

FILE: server/text_generation_server/models/custom_modeling/idefics_perceiver.py
  class IdeficsPerceiverResampler (line 52) | class IdeficsPerceiverResampler(nn.Module):
    method __init__ (line 53) | def __init__(
    method forward (line 127) | def forward(self, context: torch.Tensor) -> torch.Tensor:
  class IdeficsPerceiverAttention (line 140) | class IdeficsPerceiverAttention(nn.Module):
    method __init__ (line 141) | def __init__(
    method forward (line 194) | def forward(self, context: torch.Tensor, latents: torch.Tensor) -> tor...
  class IdeficsMLP (line 242) | class IdeficsMLP(nn.Module):
    method __init__ (line 243) | def __init__(
    method forward (line 268) | def forward(

FILE: server/text_generation_server/models/custom_modeling/idefics_processing.py
  function incremental_to_binary_attention_mask (line 41) | def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1):
  function image_attention_mask_for_packed_input_ids (line 57) | def image_attention_mask_for_packed_input_ids(input_ids, tokenizer):
  function is_url (line 104) | def is_url(string):
  function is_image (line 113) | def is_image(string):
  class IdeficsProcessor (line 119) | class IdeficsProcessor(ProcessorMixin):
    method __init__ (line 138) | def __init__(
    method __call__ (line 168) | def __call__(
    method batch_decode (line 425) | def batch_decode(self, *args, **kwargs):
    method decode (line 432) | def decode(self, *args, **kwargs):
    method model_input_names (line 440) | def model_input_names(self):

FILE: server/text_generation_server/models/custom_modeling/idefics_vision.py
  class IdeficsVisionModelOutput (line 41) | class IdeficsVisionModelOutput(ModelOutput):
  class IdeficsVisionEmbeddings (line 70) | class IdeficsVisionEmbeddings(nn.Module):
    method __init__ (line 71) | def __init__(self, prefix, config, weights):
    method forward (line 100) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
  class IdeficsVisionAttention (line 115) | class IdeficsVisionAttention(nn.Module):
    method __init__ (line 118) | def __init__(self, prefix, config, weights):
    method _shape (line 153) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 160) | def forward(
  class IdeficsVisionMLP (line 252) | class IdeficsVisionMLP(nn.Module):
    method __init__ (line 253) | def __init__(self, prefix, config, weights):
    method forward (line 264) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class IdeficsVisionEncoderLayer (line 272) | class IdeficsVisionEncoderLayer(nn.Module):
    method __init__ (line 273) | def __init__(self, prefix, config, weights):
    method forward (line 289) | def forward(
  class IdeficsVisionEncoder (line 331) | class IdeficsVisionEncoder(nn.Module):
    method __init__ (line 340) | def __init__(self, prefix, config, weights):
    method forward (line 355) | def forward(
  class IdeficsVisionTransformer (line 458) | class IdeficsVisionTransformer(nn.Module):
    method __init__ (line 459) | def __init__(self, prefix, config, weights):
    method forward (line 479) | def forward(

FILE: server/text_generation_server/models/custom_modeling/llava_next.py
  function get_anyres_image_grid_shape (line 37) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
  function unpad_image (line 60) | def unpad_image(tensor, original_size):
  class LlavaNextMultiModalProjector (line 94) | class LlavaNextMultiModalProjector(nn.Module):
    method __init__ (line 95) | def __init__(self, prefix, config, weights):
    method forward (line 106) | def forward(self, image_features):
  class LlavaNextForConditionalGeneration (line 113) | class LlavaNextForConditionalGeneration(nn.Module):
    method __init__ (line 114) | def __init__(self, prefix, config, weights):
    method _merge_input_ids_with_image_features (line 149) | def _merge_input_ids_with_image_features(
    method get_vision_embeds (line 166) | def get_vision_embeds(
    method get_inputs_embeds (line 254) | def get_inputs_embeds(
    method forward (line 271) | def forward(

FILE: server/text_generation_server/models/custom_modeling/mamba_modeling.py
  class InferenceParams (line 25) | class InferenceParams:
  class MambaConfig (line 36) | class MambaConfig(PretrainedConfig):
    method __init__ (line 37) | def __init__(
  class MambaBlock (line 71) | class MambaBlock(nn.Module):
    method __init__ (line 72) | def __init__(self, prefix, config, weights, layer_id):
    method forward (line 94) | def forward(self, hidden_states: torch.Tensor, inference_params=None):
    method step (line 140) | def step(self, hidden_states, conv_state, ssm_state):
  class ResidualBlock (line 170) | class ResidualBlock(nn.Module):
    method __init__ (line 171) | def __init__(self, prefix, config, weights, layer_id):
    method forward (line 180) | def forward(
  class MambaModel (line 195) | class MambaModel(nn.Module):
    method __init__ (line 196) | def __init__(self, config, weights):
    method forward (line 218) | def forward(

FILE: server/text_generation_server/models/custom_modeling/mllama.py
  function _prepare_aspect_ratio_attention_mask (line 46) | def _prepare_aspect_ratio_attention_mask(
  function _prepare_4d_causal_attention_mask_with_cache_position (line 78) | def _prepare_4d_causal_attention_mask_with_cache_position(
  function _prepare_cross_attention_mask (line 142) | def _prepare_cross_attention_mask(
  class MllamaVisionMLP (line 175) | class MllamaVisionMLP(nn.Module):
    method __init__ (line 176) | def __init__(self, *, prefix, config, weights):
    method forward (line 187) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class MllamaVisionSdpaAttention (line 194) | class MllamaVisionSdpaAttention(nn.Module):
    method __init__ (line 195) | def __init__(self, *, prefix, config, weights):
    method forward (line 216) | def forward(
  class MllamaVisionEncoderLayer (line 253) | class MllamaVisionEncoderLayer(nn.Module):
    method __init__ (line 254) | def __init__(self, *, prefix, config, weights, is_gated: bool):
    method forward (line 285) | def forward(
  class MllamaVisionEncoder (line 306) | class MllamaVisionEncoder(nn.Module):
    method __init__ (line 307) | def __init__(self, *, prefix, config, weights, is_gated: bool, num_lay...
    method forward (line 320) | def forward(
  class MllamaPrecomputedAspectRatioEmbedding (line 338) | class MllamaPrecomputedAspectRatioEmbedding(nn.Module):
    method __init__ (line 339) | def __init__(self, *, prefix, config, weights):
    method forward (line 352) | def forward(
  class MllamaPrecomputedPositionEmbedding (line 365) | class MllamaPrecomputedPositionEmbedding(nn.Module):
    method __init__ (line 366) | def __init__(self, *, prefix, config, weights):
    method forward (line 387) | def forward(
  class MllamaVisionModel (line 407) | class MllamaVisionModel(nn.Module):
    method __init__ (line 408) | def __init__(self, *, prefix, config, weights):
    method apply_class_embedding (line 484) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T...
    method forward (line 490) | def forward(
  class MllamaTextCrossAttention (line 622) | class MllamaTextCrossAttention(nn.Module):
    method __init__ (line 625) | def __init__(self, *, prefix, config, weights, layer_idx):
    method forward (line 674) | def forward(
  class MllamaTextMLP (line 778) | class MllamaTextMLP(nn.Module):
    method __init__ (line 779) | def __init__(self, *, prefix, config, weights):
    method forward (line 801) | def forward(self, x):
  class FlashLlamaCrossLayer (line 811) | class FlashLlamaCrossLayer(torch.nn.Module):
    method __init__ (line 814) | def __init__(self, *, prefix, config, weights, index) -> None:
    method forward (line 842) | def forward(
  class MllamaTextRMSNorm (line 889) | class MllamaTextRMSNorm(nn.Module):
    method __init__ (line 890) | def __init__(self, weight, eps):
    method load (line 896) | def load(cls, *, prefix, weights, eps):
    method forward (line 902) | def forward(self, hidden_states):
    method extra_repr (line 909) | def extra_repr(self):
  class MllamaForConditionalGeneration (line 913) | class MllamaForConditionalGeneration(nn.Module):
    method __init__ (line 914) | def __init__(self, prefix, config, weights):
    method vision_forward (line 935) | def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_...
    method forward (line 953) | def forward(

FILE: server/text_generation_server/models/custom_modeling/mpt_modeling.py
  function load_col (line 30) | def load_col(config, prefix, weights, bias):
  function _reset_is_causal (line 81) | def _reset_is_causal(
  function scaled_multihead_dot_product_attention (line 94) | def scaled_multihead_dot_product_attention(
  function check_valid_inputs (line 167) | def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bflo...
  function flash_attn_fn (line 179) | def flash_attn_fn(
  function triton_flash_attn_fn (line 254) | def triton_flash_attn_fn(
  class MultiheadAttention (line 325) | class MultiheadAttention(nn.Module):
    method __init__ (line 332) | def __init__(
    method forward (line 386) | def forward(
  class MultiQueryAttention (line 423) | class MultiQueryAttention(nn.Module):
    method __init__ (line 430) | def __init__(self, config, prefix, weights, verbose=False):
    method forward (line 479) | def forward(
  function attn_bias_shape (line 517) | def attn_bias_shape(
  function build_attn_bias (line 534) | def build_attn_bias(
  function gen_slopes (line 557) | def gen_slopes(n_heads, alibi_bias_max=8, device=None):
  function build_alibi_bias (line 567) | def build_alibi_bias(
  class MPTMLP (line 591) | class MPTMLP(nn.Module):
    method __init__ (line 592) | def __init__(self, config, prefix, weights):
    method forward (line 608) | def forward(self, x):
  class MPTBlock (line 612) | class MPTBlock(nn.Module):
    method __init__ (line 613) | def __init__(self, config, prefix, weights):
    method forward (line 640) | def forward(
  function _cast_if_autocast_enabled (line 663) | def _cast_if_autocast_enabled(tensor):
  class LPLayerNorm (line 675) | class LPLayerNorm(torch.nn.LayerNorm):
    method __init__ (line 676) | def __init__(
    method forward (line 701) | def forward(self, x):
  function rms_norm (line 722) | def rms_norm(x, weight=None, eps=1e-05):
  class RMSNorm (line 729) | class RMSNorm(torch.nn.Module):
    method __init__ (line 730) | def __init__(
    method forward (line 742) | def forward(self, x):
  class LPRMSNorm (line 746) | class LPRMSNorm(RMSNorm):
    method __init__ (line 747) | def __init__(
    method forward (line 758) | def forward(self, x):
  class MPTPreTrainedModel (line 779) | class MPTPreTrainedModel(PreTrainedModel):
  class MPTModel (line 784) | class MPTModel(MPTPreTrainedModel):
    method __init__ (line 785) | def __init__(self, prefix: str, config, weights):
    method _attn_bias (line 859) | def _attn_bias(
    method _apply_prefix_mask (line 917) | def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: tor...
    method _apply_sequence_id (line 940) | def _apply_sequence_id(
    method forward (line 956) | def forward(
  class MPTForCausalLM (line 1088) | class MPTForCausalLM(MPTPreTrainedModel):
    method __init__ (line 1089) | def __init__(self, prefix: str, config, weights):
    method forward (line 1115) | def forward(
    method prepare_inputs_for_generation (line 1168) | def prepare_inputs_for_generation(
    method _reorder_cache (line 1202) | def _reorder_cache(past_key_values, beam_idx):

FILE: server/text_generation_server/models/custom_modeling/neox_modeling.py
  function make_causal_mask (line 53) | def make_causal_mask(
  function expand_mask (line 73) | def expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
  function prepare_attn_mask (line 84) | def prepare_attn_mask(
  class GPTNeoXPreTrainedModel (line 111) | class GPTNeoXPreTrainedModel(PreTrainedModel):
  class GPTNeoXAttention (line 118) | class GPTNeoXAttention(nn.Module):
    method __init__ (line 119) | def __init__(self, config, prefix, weights):
    method forward (line 161) | def forward(
    method _split_heads (line 239) | def _split_heads(cls, tensor, num_attention_heads, attn_head_size):
    method _merge_heads (line 252) | def _merge_heads(cls, tensor, num_attention_heads, attn_head_size):
    method _attn (line 265) | def _attn(self, query, key, value, attention_mask=None, head_mask=None):
  class RotaryEmbedding (line 310) | class RotaryEmbedding(torch.nn.Module):
    method __init__ (line 311) | def __init__(self, dim, max_position_embeddings, base=10000, device=No...
    method rotate_half (line 324) | def rotate_half(x):
    method _create_cos_sin (line 331) | def _create_cos_sin(inv_freq, max_position_embeddings, dtype, device):
    method forward (line 340) | def forward(self, q, k, position_ids, seq_len=None):
  function rotary_forward (line 356) | def rotary_forward(q, k, cos, sin, position_ids):
  class GPTNeoXMLP (line 371) | class GPTNeoXMLP(nn.Module):
    method __init__ (line 372) | def __init__(self, config, prefix, weights):
    method forward (line 387) | def forward(self, hidden_states):
  class GPTNeoXLayer (line 394) | class GPTNeoXLayer(nn.Module):
    method __init__ (line 395) | def __init__(self, layer_id, prefix: str, config, weights):
    method forward (line 415) | def forward(
  class GPTNeoXModel (line 462) | class GPTNeoXModel(GPTNeoXPreTrainedModel):
    method __init__ (line 463) | def __init__(self, prefix: str, config, weights):
    method forward (line 485) | def forward(
  class GPTNeoxForCausalLM (line 628) | class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel):
    method __init__ (line 631) | def __init__(self, prefix: str, config, weights):
    method forward (line 644) | def forward(
    method prepare_inputs_for_generation (line 744) | def prepare_inputs_for_generation(
    method _reorder_cache (line 786) | def _reorder_cache(self, past_key_values, beam_idx):

FILE: server/text_generation_server/models/custom_modeling/opt_modeling.py
  function _make_causal_mask (line 43) | def _make_causal_mask(
  function _expand_mask (line 77) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
  class OPTLearnedPositionalEmbedding (line 93) | class OPTLearnedPositionalEmbedding(nn.Module):
    method __init__ (line 98) | def __init__(self, prefix: str, weights):
    method forward (line 107) | def forward(
  class OPTAttention (line 124) | class OPTAttention(nn.Module):
    method __init__ (line 127) | def __init__(
    method _shape (line 175) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 182) | def forward(
  class OPTDecoderLayer (line 316) | class OPTDecoderLayer(nn.Module):
    method __init__ (line 317) | def __init__(self, layer_id: int, prefix: str, config: OPTConfig, weig...
    method forward (line 345) | def forward(
  class OPTPreTrainedModel (line 429) | class OPTPreTrainedModel(PreTrainedModel):
  class OPTDecoder (line 433) | class OPTDecoder(OPTPreTrainedModel):
    method __init__ (line 434) | def __init__(self, prefix: str, config: OPTConfig, weights):
    method _prepare_decoder_attention_mask (line 492) | def _prepare_decoder_attention_mask(
    method forward (line 519) | def forward(
  class OPTModel (line 703) | class OPTModel(OPTPreTrainedModel):
    method __init__ (line 704) | def __init__(self, prefix: str, config: OPTConfig, weights):
    method forward (line 709) | def forward(
  class OPTForCausalLM (line 760) | class OPTForCausalLM(OPTPreTrainedModel):
    method __init__ (line 761) | def __init__(self, prefix, config, weights):
    method forward (line 774) | def forward(
    method prepare_inputs_for_generation (line 829) | def prepare_inputs_for_generation(
    method _reorder_cache (line 856) | def _reorder_cache(past_key_values, beam_idx):

FILE: server/text_generation_server/models/custom_modeling/phi_modeling.py
  class PhiConfig (line 22) | class PhiConfig(PretrainedConfig):
    method __init__ (line 23) | def __init__(
  class RotaryEmbedding (line 67) | class RotaryEmbedding(nn.Module):
    method __init__ (line 68) | def __init__(self, dim, max_seq_len):
    method apply_rotary_emb_qkv (line 78) | def apply_rotary_emb_qkv(self, qkv, seqlen_offset):
  class PhiCausalLMHead (line 115) | class PhiCausalLMHead(nn.Module):
    method __init__ (line 116) | def __init__(self, config, weights):
    method forward (line 127) | def forward(self, hidden_states):
  class PhiMHA (line 134) | class PhiMHA(nn.Module):
    method __init__ (line 135) | def __init__(self, prefix, config, weights):
    method forward (line 155) | def forward(
  class PhiMLP (line 196) | class PhiMLP(nn.Module):
    method __init__ (line 197) | def __init__(self, prefix, config, weights):
    method forward (line 215) | def forward(self, hidden_states):
  class PhiBlock (line 223) | class PhiBlock(nn.Module):
    method __init__ (line 224) | def __init__(self, layer_id, config, weights):
    method forward (line 233) | def forward(
  class PhiModel (line 250) | class PhiModel(nn.Module):
    method __init__ (line 251) | def __init__(self, prefix: str, config, weights):
    method forward (line 265) | def forward(
  class PhiForCausalLM (line 291) | class PhiForCausalLM(torch.nn.Module):
    method __init__ (line 292) | def __init__(self, prefix: str, config, weights):
    method forward (line 303) | def forward(

FILE: server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
  class Qwen2_5_VLVideosProcessorKwargs (line 63) | class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
  class Qwen2_5_VLProcessorKwargs (line 67) | class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
  class Qwen2_5_VLProcessor (line 77) | class Qwen2_5_VLProcessor(ProcessorMixin):
    method __init__ (line 97) | def __init__(
    method __call__ (line 112) | def __call__(
    method batch_decode (line 232) | def batch_decode(self, *args, **kwargs):
    method decode (line 239) | def decode(self, *args, **kwargs):
    method post_process_image_text_to_text (line 246) | def post_process_image_text_to_text(self, generated_outputs):
    method model_input_names (line 265) | def model_input_names(self):
  class Qwen2_5_VLVisionConfig (line 275) | class Qwen2_5_VLVisionConfig(PretrainedConfig):
    method __init__ (line 279) | def __init__(
  class Qwen2_5_VLConfig (line 315) | class Qwen2_5_VLConfig(PretrainedConfig):
    method __init__ (line 316) | def __init__(
  function rotate_half (line 379) | def rotate_half(x):
  function apply_rotary_pos_emb_vision (line 386) | def apply_rotary_pos_emb_vision(
  class Qwen2_5VLAttention (line 400) | class Qwen2_5VLAttention(nn.Module):
    method __init__ (line 401) | def __init__(self, *, prefix, config, weights):
    method forward (line 425) | def forward(
  class Qwen2_5VLVisionMLP (line 529) | class Qwen2_5VLVisionMLP(nn.Module):
    method __init__ (line 530) | def __init__(self, *, prefix, config, weights):
    method forward (line 548) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Qwen2_5VLVisionBlock (line 556) | class Qwen2_5VLVisionBlock(nn.Module):
    method __init__ (line 557) | def __init__(self, prefix, config, weights):
    method forward (line 580) | def forward(
  class Qwen2_5VLPatchMerger (line 592) | class Qwen2_5VLPatchMerger(nn.Module):
    method __init__ (line 593) | def __init__(self, *, prefix, config, weights):
    method forward (line 608) | def forward(self, hidden_states) -> torch.Tensor:
  class Qwen2_5VisionModel (line 617) | class Qwen2_5VisionModel(nn.Module):
    method __init__ (line 618) | def __init__(self, *, prefix, config, weights):
    method apply_class_embedding (line 665) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T...
    method get_window_index (line 671) | def get_window_index(self, grid_thw):
    method forward (line 718) | def forward(
  class Qwen2_5VLForConditionalGeneration (line 817) | class Qwen2_5VLForConditionalGeneration(nn.Module):
    method __init__ (line 818) | def __init__(self, prefix, config, weights):
    method get_position_ids (line 867) | def get_position_ids(
    method get_vision_embeds (line 944) | def get_vision_embeds(
    method get_inputs_embeds (line 954) | def get_inputs_embeds(
    method forward (line 967) | def forward(

FILE: server/text_generation_server/models/custom_modeling/qwen2_vl.py
  function rotate_half (line 50) | def rotate_half(x):
  function apply_rotary_pos_emb_vision (line 57) | def apply_rotary_pos_emb_vision(
  class Qwen2VLAttention (line 71) | class Qwen2VLAttention(nn.Module):
    method __init__ (line 72) | def __init__(self, *, prefix, config, weights):
    method forward (line 95) | def forward(
  class Qwen2VLVisionMLP (line 199) | class Qwen2VLVisionMLP(nn.Module):
    method __init__ (line 200) | def __init__(self, *, prefix, config, weights):
    method forward (line 210) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class Qwen2VLVisionBlock (line 217) | class Qwen2VLVisionBlock(nn.Module):
    method __init__ (line 218) | def __init__(self, prefix, config, weights):
    method forward (line 241) | def forward(
  class Qwen2VLPatchMerger (line 252) | class Qwen2VLPatchMerger(nn.Module):
    method __init__ (line 253) | def __init__(self, *, prefix, config, weights):
    method forward (line 268) | def forward(self, hidden_states) -> torch.Tensor:
  class Qwen2VisionModel (line 277) | class Qwen2VisionModel(nn.Module):
    method __init__ (line 278) | def __init__(self, *, prefix, config, weights):
    method apply_class_embedding (line 320) | def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.T...
    method forward (line 326) | def forward(
  class Qwen2VLForConditionalGeneration (line 393) | class Qwen2VLForConditionalGeneration(nn.Module):
    method __init__ (line 394) | def __init__(self, prefix, config, weights):
    method get_position_ids (line 448) | def get_position_ids(
    method get_vision_embeds (line 522) | def get_vision_embeds(
    method get_inputs_embeds (line 532) | def get_inputs_embeds(
    method forward (line 545) | def forward(

FILE: server/text_generation_server/models/custom_modeling/siglip.py
  class SiglipVisionEmbeddings (line 21) | class SiglipVisionEmbeddings(nn.Module):
    method __init__ (line 22) | def __init__(self, prefix, config: SiglipVisionConfig, weights):
    method forward (line 52) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
  class SiglipAttention (line 62) | class SiglipAttention(nn.Module):
    method __init__ (line 65) | def __init__(self, prefix, config, weights):
    method _shape (line 95) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 102) | def forward(
  class SiglipMLP (line 163) | class SiglipMLP(nn.Module):
    method __init__ (line 164) | def __init__(self, prefix, config, weights):
    method forward (line 175) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  class SiglipEncoderLayer (line 182) | class SiglipEncoderLayer(nn.Module):
    method __init__ (line 183) | def __init__(self, prefix, config: SiglipConfig, weights):
    method forward (line 197) | def forward(
  class SiglipMultiheadAttentionPoolingHead (line 216) | class SiglipMultiheadAttentionPoolingHead(nn.Module):
    method __init__ (line 219) | def __init__(self, prefix, config: SiglipVisionConfig, weights):
    method forward (line 229) | def forward(self, hidden_state):
  function _trunc_normal_ (line 242) | def _trunc_normal_(tensor, mean, std, a, b):
  function trunc_normal_tf_ (line 278) | def trunc_normal_tf_(
  function variance_scaling_ (line 308) | def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="no...
  function lecun_normal_ (line 333) | def lecun_normal_(tensor):
  function default_flax_embed_init (line 337) | def default_flax_embed_init(tensor):
  class SiglipEncoder (line 341) | class SiglipEncoder(nn.Module):
    method __init__ (line 350) | def __init__(self, prefix, config: SiglipConfig, weights):
    method forward (line 362) | def forward(
  class SiglipVisionTransformer (line 377) | class SiglipVisionTransformer(nn.Module):
    method __init__ (line 378) | def __init__(self, prefix, config: SiglipVisionConfig, weights):
    method forward (line 389) | def forward(

FILE: server/text_generation_server/models/custom_modeling/t5_modeling.py
  class PartialTPEmbedding (line 58) | class PartialTPEmbedding(nn.Module):
    method __init__ (line 59) | def __init__(self, prefix: str, weights):
    method forward (line 64) | def forward(self, input: torch.Tensor) -> torch.Tensor:
  function layer_norm (line 69) | def layer_norm(hidden_states, weight, epsilon):
  class T5LayerNorm (line 85) | class T5LayerNorm(nn.Module):
    method __init__ (line 86) | def __init__(self, prefix, weights, eps=1e-6):
    method forward (line 95) | def forward(self, hidden_states):
  class T5DenseActDense (line 117) | class T5DenseActDense(nn.Module):
    method __init__ (line 118) | def __init__(self, config: T5Config, prefix, weights):
    method forward (line 146) | def forward(self, hidden_states):
  class T5DenseGatedActDense (line 159) | class T5DenseGatedActDense(nn.Module):
    method __init__ (line 160) | def __init__(self, config: T5Config, prefix, weights):
    method forward (line 190) | def forward(self, hidden_states):
  class T5LayerFF (line 204) | class T5LayerFF(nn.Module):
    method __init__ (line 205) | def __init__(self, config: T5Config, prefix, weights):
    method forward (line 223) | def forward(self, hidden_states):
  class T5Attention (line 230) | class T5Attention(nn.Module):
    method __init__ (line 231) | def __init__(
    method _relative_position_bucket (line 274) | def _relative_position_bucket(
    method compute_bias (line 328) | def compute_bias(self, query_length, key_length, device=None):
    method forward (line 355) | def forward(
  class T5LayerSelfAttention (line 504) | class T5LayerSelfAttention(nn.Module):
    method __init__ (line 505) | def __init__(self, config, prefix, weights, has_relative_attention_bia...
    method forward (line 520) | def forward(
  class T5LayerCrossAttention (line 547) | class T5LayerCrossAttention(nn.Module):
    method __init__ (line 548) | def __init__(self, config, prefix, weights):
    method forward (line 563) | def forward(
  class T5Block (line 594) | class T5Block(nn.Module):
    method __init__ (line 595) | def __init__(self, config, prefix, weights, has_relative_attention_bia...
    method forward (line 621) | def forward(
  class T5PreTrainedModel (line 746) | class T5PreTrainedModel(PreTrainedModel):
    method _shift_right (line 754) | def _shift_right(self, input_ids):
  class T5Stack (line 786) | class T5Stack(T5PreTrainedModel):
    method __init__ (line 787) | def __init__(self, config, prefix, weights, embed_tokens):
    method forward (line 811) | def forward(
  class T5ForConditionalGeneration (line 1015) | class T5ForConditionalGeneration(T5PreTrainedModel):
    method __init__ (line 1016) | def __init__(self, config: T5Config, weights):
    method forward (line 1056) | def forward(
    method prepare_inputs_for_generation (line 1167) | def prepare_inputs_for_generation(
    method prepare_decoder_input_ids_from_labels (line 1196) | def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
    method _reorder_cache (line 1199) | def _reorder_cache(self, past_key_values, beam_idx):

FILE: server/text_generation_server/models/custom_modeling/vlm.py
  function load_text_model (line 1) | def load_text_model(prefix, config, weights, name=None):
  function load_vision_model (line 43) | def load_vision_model(prefix, config, weights):

FILE: server/text_generation_server/models/flash_causal_lm.py
  function small_power_of_2 (line 87) | def small_power_of_2(n: int):
  function init_cpu_threads_env (line 91) | def init_cpu_threads_env(rank_id: int, world_size: int):
  class FlashCausalLMBatch (line 126) | class FlashCausalLMBatch(Batch):
    method to_pb (line 210) | def to_pb(self) -> generate_pb2.CachedBatch:
    method batch_tokenized_inputs (line 224) | def batch_tokenized_inputs(
    method from_tokenized (line 244) | def from_tokenized(
    method from_pb (line 473) | def from_pb(
    method filter (line 485) | def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch":
    method concatenate (line 691) | def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCau...
    method prepare_for_prefill (line 948) | def prepare_for_prefill(self):
    method __len__ (line 1177) | def __len__(self):
  class FlashCausalLM (line 1193) | class FlashCausalLM(Model):
    method __init__ (line 1194) | def __init__(
    method batch_type (line 1344) | def batch_type(self) -> Type[FlashCausalLMBatch]:
    method init_kv_cache (line 1347) | def init_kv_cache(
    method cuda_graph_warmup (line 1369) | def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
    method warmup (line 1507) | def warmup(
    method tunableop_warmup (line 1686) | def tunableop_warmup(self, seqlen: int, max_bt: int):
    method forward (line 1726) | def forward(
    method generate_token (line 1891) | def generate_token(
    method _forward_context (line 2432) | def _forward_context(

FILE: server/text_generation_server/models/galactica.py
  function _insert_split_marker (line 29) | def _insert_split_marker(m: re.Match):
  function escape_custom_split_sequence (line 46) | def escape_custom_split_sequence(text):
  class GalacticaCausalLMBatch (line 63) | class GalacticaCausalLMBatch(CausalLMBatch):
    method from_pb (line 65) | def from_pb(

FILE: server/text_generation_server/models/globals.py
  function set_adapter_to_index (line 70) | def set_adapter_to_index(adapter_to_index: Dict[str, int]):
  function get_adapter_to_index (line 75) | def get_adapter_to_index():

FILE: server/text_generation_server/models/idefics_causal_lm.py
  class IdeficsCausalLMBatch (line 44) | class IdeficsCausalLMBatch(Batch):
    method to_pb (line 80) | def to_pb(self) -> generate_pb2.CachedBatch:
    method from_pb (line 90) | def from_pb(
    method from_pb_processor (line 100) | def from_pb_processor(
    method filter (line 231) | def filter(self, request_ids: List[int]) -> Optional["IdeficsCausalLMB...
    method concatenate (line 347) | def concatenate(
    method __len__ (line 584) | def __len__(self):
  class IdeficsCausalLM (line 588) | class IdeficsCausalLM(Model):
    method __init__ (line 589) | def __init__(
    method batch_type (line 672) | def batch_type(self) -> Type[IdeficsCausalLMBatch]:
    method forward (line 675) | def forward(
    method generate_token (line 708) | def generate_token(

FILE: server/text_generation_server/models/mamba.py
  function new_inference_params (line 36) | def new_inference_params(
  class MambaBatch (line 78) | class MambaBatch(Batch):
    method to_pb (line 113) | def to_pb(self) -> generate_pb2.CachedBatch:
    method from_pb (line 123) | def from_pb(
    method filter (line 199) | def filter(self, request_ids: List[int]) -> Optional["MambaBatch"]:
    method concatenate (line 281) | def concatenate(cls, batches: List["MambaBatch"]) -> "MambaBatch":
    method __len__ (line 402) | def __len__(self):
  class Mamba (line 406) | class Mamba(Model):
    method __init__ (line 407) | def __init__(
    method batch_type (line 475) | def batch_type(self) -> Type[MambaBatch]:
    method warmup (line 478) | def warmup(
    method cuda_graph_warmup (line 501) | def cuda_graph_warmup(self, batch_size: int):
    method tunableop_warmup (line 544) | def tunableop_warmup(self, batch_size: int, seqlen: int):
    method forward (line 568) | def forward(
    method generate_token (line 616) | def generate_token(self, batch) -> Tuple[List[Any], Optional[Any], Tup...

FILE: server/text_generation_server/models/metadata_kernels.py
  function has_triton (line 18) | def has_triton():
  function block_tables_to_padded (line 30) | def block_tables_to_padded(
  function block_tables_to_ragged (line 51) | def block_tables_to_ragged(
  function copy_next_input_ids_inplace (line 99) | def copy_next_input_ids_inplace(
  function prepare_position_slot_ids (line 126) | def prepare_position_slot_ids(
  function slots_filtering (line 145) | def slots_filtering(
  function triton_slots_filtering (line 164) | def triton_slots_filtering(
  function triton_block_tables_to_padded (line 193) | def triton_block_tables_to_padded(
  function triton_block_tables_to_ragged (line 224) | def triton_block_tables_to_ragged(
  function triton_copy_next_input_ids_inplace (line 255) | def triton_copy_next_input_ids_inplace(
  function triton_prepare_position_slot_ids (line 311) | def triton_prepare_position_slot_ids(

FILE: server/text_generation_server/models/mllama_causal_lm.py
  class MllamaCausalLMBatch (line 26) | class MllamaCausalLMBatch(VlmCausalLMBatch):
    method prepare_for_prefill (line 32) | def prepare_for_prefill(self):
    method concatenate (line 37) | def concatenate(cls, batches):
    method filter (line 60) | def filter(self, request_ids: List[int]):
    method batch_tokenized_inputs (line 92) | def batch_tokenized_inputs(
    method from_pb_processor (line 158) | def from_pb_processor(
  class MllamaCausalLM (line 202) | class MllamaCausalLM(VlmCausalLM):
    method set_inputs_embeds (line 203) | def set_inputs_embeds(self, batch):
    method cuda_graph_warmup (line 207) | def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
    method forward (line 210) | def forward(

FILE: server/text_generation_server/models/model.py
  class Model (line 29) | class Model(ABC):
    method __init__ (line 30) | def __init__(
    method info (line 105) | def info(self) -> InfoResponse:
    method batch_type (line 123) | def batch_type(self) -> Type[B]:
    method generate_token (line 127) | def generate_token(
    method warmup (line 132) | def warmup(
    method decode_token (line 144) | def decode_token(
    method check_initialized (line 173) | def check_initialized(self):

FILE: server/text_generation_server/models/seq2seq_lm.py
  class Seq2SeqLMBatch (line 36) | class Seq2SeqLMBatch(Batch):
    method to_pb (line 76) | def to_pb(self) -> generate_pb2.CachedBatch:
    method from_pb (line 87) | def from_pb(
    method filter (line 181) | def filter(self, request_ids: List[int]) -> Optional["Seq2SeqLMBatch"]:
    method concatenate (line 296) | def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBat...
    method __len__ (line 538) | def __len__(self):
  class Seq2SeqLM (line 542) | class Seq2SeqLM(Model):
    method __init__ (line 543) | def __init__(
    method fallback (line 621) | def fallback(
    method batch_type (line 685) | def batch_type(self) -> Type[Seq2SeqLMBatch]:
    method forward (line 688) | def forward(
    method generate_token (line 726) | def generate_token(

FILE: server/text_generation_server/models/transformers_flash_causal_lm.py
  function tgi_flash_attention_forward (line 20) | def tgi_flash_attention_forward(
  class TransformersFlashCausalLM (line 96) | class TransformersFlashCausalLM(FlashCausalLM):
    method __init__ (line 97) | def __init__(
    method fallback (line 237) | def fallback(
    method _model_forward (line 255) | def _model_forward(

FILE: server/text_generation_server/models/transformers_flash_vlm.py
  function tgi_flash_attention_forward (line 37) | def tgi_flash_attention_forward(
  class TransformersFlashVlmCausalLM (line 150) | class TransformersFlashVlmCausalLM(VlmCausalLM):
    method __init__ (line 151) | def __init__(
    method get_position_ids (line 319) | def get_position_ids(self, input_ids, image_grid_thw, position_ids):
    method pre_process_inputs (line 322) | def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
    method post_process_outputs (line 328) | def post_process_outputs(self, logits, lm_head_indices):
    method fallback (line 332) | def fallback(
    method _model_forward (line 358) | def _model_forward(
  class TransformersQwen2VlmCausalLM (line 418) | class TransformersQwen2VlmCausalLM(TransformersFlashVlmCausalLM):
    method get_position_ids (line 419) | def get_position_ids(self, input_ids: torch.Tensor, image_grid_thw: to...
    method post_process_outputs (line 492) | def post_process_outputs(self, logits, lm_head_indices):
    method pre_process_inputs (line 495) | def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
  class TransformersGemma3VlmCausalLM (line 501) | class TransformersGemma3VlmCausalLM(TransformersFlashVlmCausalLM):
    method get_attention_mask (line 502) | def get_attention_mask(self, input_ids, cu_seqlen_prefill):
    method pre_process_inputs (line 557) | def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
  class TransformersLlama4VlmCausalLM (line 573) | class TransformersLlama4VlmCausalLM(TransformersFlashVlmCausalLM):
    method pre_process_inputs (line 574) | def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
    method get_vision_embeds (line 580) | def get_vision_embeds(
    method get_inputs_embeds (line 598) | def get_inputs_embeds(self, input_ids, vision_embeds=None):

FILE: server/text_generation_server/models/types.py
  class Batch (line 13) | class Batch(ABC):
    method to_pb (line 15) | def to_pb(self) -> generate_pb2.CachedBatch:
    method from_pb (line 20) | def from_pb(
    method filter (line 30) | def filter(self, request_ids: List[int]) -> "Batch":
    method concatenate (line 35) | def concatenate(cls, batches: List["Batch"]) -> "Batch":
    method __len__ (line 39) | def __len__(self):
  class GeneratedText (line 44) | class GeneratedText:
    method to_pb (line 50) | def to_pb(self) -> generate_pb2.GeneratedText:
  class Tokens (line 60) | class Tokens:
    method to_pb (line 66) | def to_pb(self) -> generate_pb2.Tokens:
    method __len__ (line 74) | def __len__(self):
    method __add__ (line 77) | def __add__(self, other: "Tokens") -> "Tokens":
  class Generation (line 87) | class Generation:
    method to_pb (line 95) | def to_pb(self) -> generate_pb2.Generation:

FILE: server/text_generation_server/models/vlm_causal_lm.py
  function prompt_split_image_llama4 (line 33) | def prompt_split_image_llama4(aspect_ratio, num_patches_per_chunk):
  function _prompt_split_image (line 61) | def _prompt_split_image(
  function get_anyres_image_grid_shape (line 90) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
  function image_text_replacement (line 113) | def image_text_replacement(processor, image_input, config) -> str:
  function image_text_replacement_fixup (line 186) | def image_text_replacement_fixup(config, text: str) -> str:
  function preprocess_text (line 194) | def preprocess_text(config, text: str) -> str:
  function preprocess_image (line 200) | def preprocess_image(config, img):
  function get_unpadded_features (line 215) | def get_unpadded_features(
  function get_number_of_features (line 242) | def get_number_of_features(height: int, width: int, config) -> int:
  function scatter_image_embeds (line 269) | def scatter_image_embeds(
  function gather_image_embeds (line 283) | def gather_image_embeds(
  class ImagePositions (line 293) | class ImagePositions:
  class VlmCausalLMBatch (line 301) | class VlmCausalLMBatch(FlashCausalLMBatch):
    method concatenate (line 315) | def concatenate(cls, batches):
    method filter (line 348) | def filter(self, request_ids: List[int]):
    method batch_tokenized_inputs (line 379) | def batch_tokenized_inputs(
    method get_image_positions (line 457) | def get_image_positions(
    method from_pb_processor (line 528) | def from_pb_processor(
    method prepare_for_prefill (line 551) | def prepare_for_prefill(self):
    method update_encoder_cache (line 619) | def update_encoder_cache(self, encoder_outputs, request_id, img_pos):
    method gather_vision_embeds (line 624) | def gather_vision_embeds(self):
    method free_encoder_cache (line 687) | def free_encoder_cache(self):
  class VlmCausalLM (line 696) | class VlmCausalLM(FlashCausalLM):
    method __init__ (line 697) | def __init__(
    method batch_type (line 729) | def batch_type(self) -> Type[VlmCausalLMBatch]:
    method cuda_graph_warmup (line 732) | def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
    method get_vision_embeds (line 875) | def get_vision_embeds(
    method get_inputs_embeds (line 890) | def get_inputs_embeds(
    method encode_images (line 900) | def encode_images(self, batch):
    method set_inputs_embeds (line 939) | def set_inputs_embeds(self, batch):
    method forward (line 953) | def forward(

FILE: server/text_generation_server/server.py
  class SignalHandler (line 41) | class SignalHandler:
    method __init__ (line 44) | def __init__(self):
    method set_keep_processing (line 48) | def set_keep_processing(self, value: bool):
    method exit_gracefully (line 51) | def exit_gracefully(self, signum, frame):
  class TextGenerationService (line 56) | class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServi...
    method __init__ (line 57) | def __init__(
    method Info (line 73) | async def Info(self, request, context):
    method Health (line 76) | async def Health(self, request, context):
    method ServiceDiscovery (line 81) | async def ServiceDiscovery(self, request, context):
    method ClearCache (line 84) | async def ClearCache(self, request, context):
    method FilterBatch (line 91) | async def FilterBatch(self, request, context):
    method Warmup (line 100) | async def Warmup(self, request, context):
    method Prefill (line 151) | async def Prefill(self, request, context):
    method Decode (line 193) | async def Decode(self, request, context):
  function serve (line 229) | def serve(

FILE: server/text_generation_server/tracing.py
  class UDSOpenTelemetryAioServerInterceptor (line 16) | class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterce...
    method __init__ (line 17) | def __init__(self):
    method _start_span (line 20) | def _start_span(self, handler_call_details, context, set_status_on_exc...
  function setup_tracing (line 57) | def setup_tracing(otlp_service_name: str, otlp_endpoint: str):

FILE: server/text_generation_server/utils/adapter.py
  class AdapterInfo (line 28) | class AdapterInfo:
  class AdapterParameters (line 35) | class AdapterParameters:
  class AdapterSource (line 44) | class AdapterSource:
  function parse_lora_adapters (line 50) | def parse_lora_adapters(lora_adapters: Optional[str]) -> List[AdapterInfo]:
  function load_and_merge_adapters (line 71) | def load_and_merge_adapters(
  class AdapterParametersContainer (line 99) | class AdapterParametersContainer:
    method __hash__ (line 103) | def __hash__(self) -> int:
  function _load_and_merge (line 108) | def _load_and_merge(
  function check_architectures (line 149) | def check_architectures(
  function load_module_map (line 188) | def load_module_map(
  function get_attn_weights (line 236) | def get_attn_weights(i, layer):
  function get_mlp_weights (line 259) | def get_mlp_weights(i, layer):
  function build_layer_weight_lookup (line 303) | def build_layer_weight_lookup(model):

FILE: server/text_generation_server/utils/chunks.py
  function concat_text_chunks (line 8) | def concat_text_chunks(chunks: Iterable[generate_pb2.InputChunk]) -> str:

FILE: server/text_generation_server/utils/convert.py
  function _remove_duplicate_names (line 12) | def _remove_duplicate_names(
  function convert_file (line 62) | def convert_file(pt_file: Path, sf_file: Path, discard_names: List[str]):
  function convert_files (line 96) | def convert_files(pt_files: List[Path], sf_files: List[Path], discard_na...

FILE: server/text_generation_server/utils/dist.py
  class FakeBarrier (line 16) | class FakeBarrier:
    method wait (line 17) | def wait(self):
  class FakeGroup (line 21) | class FakeGroup(ProcessGroup):
    method __init__ (line 22) | def __init__(self, rank, size):
    method allreduce (line 27) | def allreduce(self, *args, **kwargs):
    method allgather (line 30) | def allgather(self, inputs, local_tensor, **kwargs):
    method barrier (line 38) | def barrier(self, *args, **kwargs):
    method size (line 41) | def size(self):
    method rank (line 44) | def rank(self):
  function initialize_torch_distributed (line 48) | def initialize_torch_distributed():

FILE: server/text_generation_server/utils/hub.py
  function _cached_weight_files (line 21) | def _cached_weight_files(
  function _weight_hub_files_from_model_info (line 32) | def _weight_hub_files_from_model_info(
  function _weight_files_from_dir (line 46) | def _weight_files_from_dir(d: Path, extension: str) -> List[str]:
  function _get_cached_revision_directory (line 62) | def _get_cached_revision_directory(
  function weight_hub_files (line 97) | def weight_hub_files(
  function try_to_load_from_cache (line 119) | def try_to_load_from_cache(
  function weight_files (line 133) | def weight_files(
  function download_weights (line 188) | def download_weights(

FILE: server/text_generation_server/utils/import_utils.py
  function is_ipex_available (line 9) | def is_ipex_available():
  function get_cuda_free_memory (line 13) | def get_cuda_free_memory(device, memory_fraction):
  function get_xpu_free_memory (line 20) | def get_xpu_free_memory(device, memory_fraction):
  function get_cpu_free_memory (line 29) | def get_cpu_free_memory(device, memory_fraction):
  function noop (line 38) | def noop(*args, **kwargs):

FILE: server/text_generation_server/utils/kernels.py
  function load_kernel (line 9) | def load_kernel(*, module: str, repo_id: str):

FILE: server/text_generation_server/utils/log.py
  function log_once (line 6) | def log_once(log, msg: str, master=True):
  function log_master (line 13) | def log_master(log, msg: str):

FILE: server/text_generation_server/utils/logits_process.py
  class StaticWarper (line 25) | class StaticWarper:
    method __init__ (line 26) | def __init__(
    method __call__ (line 50) | def __call__(self, scores):
  function static_warper (line 79) | def static_warper(
  class HeterogeneousRepetitionPenaltyLogitsProcessor (line 90) | class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor):
    method __init__ (line 102) | def __init__(self, penalty: List[float], dtype: torch.dtype, device: t...
    method __call__ (line 108) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 119) | def filter(self, indices):
  class FrequencyPenaltyLogitsProcessor (line 127) | class FrequencyPenaltyLogitsProcessor(LogitsProcessor):
    method __init__ (line 136) | def __init__(self, penalty: float):
    method __call__ (line 139) | def __call__(
  class HeterogeneousFrequencyPenaltyLogitsProcessor (line 151) | class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor):
    method __init__ (line 161) | def __init__(self, penalty: List[float], dtype: torch.dtype, device: t...
    method __call__ (line 167) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 182) | def filter(self, indices):
  class HeterogeneousTemperatureLogitsWarper (line 190) | class HeterogeneousTemperatureLogitsWarper:
    method __init__ (line 201) | def __init__(
    method __call__ (line 209) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 213) | def filter(self, indices):
  class HeterogeneousTopPLogitsWarper (line 221) | class HeterogeneousTopPLogitsWarper(LogitsProcessor):
    method __init__ (line 237) | def __init__(
    method __call__ (line 252) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 272) | def filter(self, indices):
  class HeterogeneousTopKLogitsWarper (line 280) | class HeterogeneousTopKLogitsWarper(LogitsProcessor):
    method __init__ (line 295) | def __init__(
    method __call__ (line 323) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 344) | def filter(self, indices):
  class HeterogeneousTypicalLogitsWarper (line 361) | class HeterogeneousTypicalLogitsWarper(LogitsProcessor):
    method __init__ (line 377) | def __init__(
    method __call__ (line 399) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 435) | def filter(self, indices):
  class HeterogeneousProcessorWrapper (line 451) | class HeterogeneousProcessorWrapper(LogitsProcessor):
    method __init__ (line 459) | def __init__(
    method __call__ (line 465) | def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> t...
    method filter (line 470) | def filter(self, indices):
  class GrammarLogitProcessor (line 482) | class GrammarLogitProcessor(LogitsProcessor):
    method __init__ (line 486) | def __init__(
    method __call__ (line 499) | def __call__(
    method advance (line 513) | def advance(self, next_token_id, fsm_grammar_state):
    method _advance (line 519) | def _advance(next_token_id, fsm_grammar_state, fsm):
    method _cached_compile_fsm (line 527) | def _cached_compile_fsm(
    method _cached_adapt_tokenizer (line 547) | def _cached_adapt_tokenizer(tokenizer):
  class HeterogeneousGrammarLogitProcessor (line 575) | class HeterogeneousGrammarLogitProcessor(LogitsProcessor):
    method __init__ (line 576) | def __init__(self, tokenizer, device, grammars, grammar_types):
    method __call__ (line 589) | def __call__(
    method advance_batch (line 605) | def advance_batch(self, next_token_ids, fsm_grammar_states):
    method advance_at_index (line 613) | def advance_at_index(self, next_token_id, fsm_grammar_state, index):
    method filter (line 620) | def filter(self, indices):

FILE: server/text_generation_server/utils/merges/strategies.py
  class AdapterParameters (line 17) | class AdapterParameters:
    method __init__ (line 18) | def __init__(
  function _apply_weights (line 28) | def _apply_weights(
  class MergeStrategy (line 44) | class MergeStrategy(ABC):
    method merge (line 45) | def merge(
  class LinearMerge (line 51) | class LinearMerge(MergeStrategy):
    method __init__ (line 52) | def __init__(self, **kwargs):
    method merge (line 55) | def merge(
  class TiesMerge (line 62) | class TiesMerge(MergeStrategy):
    method __init__ (line 63) | def __init__(self, density: float, majority_sign_method: str = "total"...
    method merge (line 67) | def merge(
  class DareLinearMerge (line 86) | class DareLinearMerge(MergeStrategy):
    method __init__ (line 87) | def __init__(self, density: float, **kwargs):
    method merge (line 90) | def merge(
  class DareTiesMerge (line 102) | class DareTiesMerge(MergeStrategy):
    method __init__ (line 103) | def __init__(self, density: float, majority_sign_method: str = "total"...
    method merge (line 107) | def merge(
  function merge_adapters (line 136) | def merge_adapters(
  function _validate_lora_configs (line 193) | def _validate_lora_configs(lora_configs: List["LoraConfig"]):
  function _merge_lora_configs (line 207) | def _merge_lora_configs(lora_configs: List["LoraConfig"]) -> "LoraConfig":

FILE: server/text_generation_server/utils/merges/utils.py
  function magnitude_based_pruning (line 23) | def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> tor...
  function random_pruning (line 39) | def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) ...
  function prune (line 56) | def prune(
  function calculate_majority_sign_mask (line 83) | def calculate_majority_sign_mask(
  function disjoint_merge (line 105) | def disjoint_merge(task_tensors, majority_sign_mask):

FILE: server/text_generation_server/utils/peft.py
  function download_and_unload_peft (line 10) | def download_and_unload_peft(model_id, revision, trust_remote_code):
  function download_peft (line 48) | def download_peft(

FILE: server/text_generation_server/utils/prefill_chunking.py
  function set_support_chunking (line 7) | def set_support_chunking(support_chunking: bool):
  function get_support_chunking (line 12) | def get_support_chunking() -> bool:
  function set_max_prefill_tokens (line 17) | def set_max_prefill_tokens(max_prefill_tokens: int):
  function get_max_prefill_tokens (line 22) | def get_max_prefill_tokens() -> int:

FILE: server/text_generation_server/utils/quantization.py
  class _QuantizerConfig (line 16) | class _QuantizerConfig:
  class _FP8QuantizerConfig (line 28) | class _FP8QuantizerConfig:
  function _get_config_json (line 32) | def _get_config_json(model_id: str, revision: Optional[str], filename: s...
  function _get_quantizer_config (line 47) | def _get_quantizer_config(model_id, revision):
  function get_loader (line 124) | def get_loader(

FILE: server/text_generation_server/utils/segments.py
  function find_segments (line 11) | def find_segments(
  class SegmentConcatBuilder (line 31) | class SegmentConcatBuilder:
    method __init__ (line 32) | def __init__(self):
    method concat (line 36) | def concat(self, adapter_segments: torch.Tensor, segment_indices: List...
    method build (line 61) | def build(self) -> Tuple[torch.Tensor, List[int]]:

FILE: server/text_generation_server/utils/speculate.py
  function get_speculate (line 4) | def get_speculate() -> int:
  function set_speculate (line 9) | def set_speculate(speculate: int):

FILE: server/text_generation_server/utils/tokens.py
  class NextTokenChooser (line 24) | class NextTokenChooser:
    method __init__ (line 25) | def __init__(
    method __call__ (line 81) | def __call__(self, input_ids, scores):
    method advance_grammar (line 100) | def advance_grammar(self, next_id: int):
    method from_pb (line 108) | def from_pb(
  class StopSequenceCriteria (line 131) | class StopSequenceCriteria:
    method __init__ (line 132) | def __init__(self, stop_sequence: str):
    method __call__ (line 136) | def __call__(self, output: str) -> bool:
  class StoppingCriteria (line 142) | class StoppingCriteria:
    method __init__ (line 143) | def __init__(
    method __call__ (line 167) | def __call__(self, last_token: int, last_output: str) -> Tuple[bool, O...
    method from_pb (line 191) | def from_pb(
  function create_n_gram_speculation (line 209) | def create_n_gram_speculation(
  class HeterogeneousNextTokenChooser (line 233) | class HeterogeneousNextTokenChooser:
    method __init__ (line 234) | def __init__(
    method __call__ (line 326) | def __call__(
    method advance_grammar (line 415) | def advance_grammar(self, next_ids: List[int]):
    method advance_grammar_single (line 423) | def advance_grammar_single(self, grammar_state_index: int, next_id: int):
    method filter (line 434) | def filter(self, indices):
    method from_pb (line 477) | def from_pb(
  class Sampling (line 506) | class Sampling:
    method __init__ (line 507) | def __init__(self, seed: int, device: str = "cpu"):
    method __call__ (line 512) | def __call__(self, logits):
  class Greedy (line 520) | class Greedy:
    method __call__ (line 521) | def __call__(self, logits):
  class HeterogeneousSampling (line 525) | class HeterogeneousSampling:
    method __init__ (line 530) | def __init__(self, do_sample: List[bool], seeds: List[int], device: to...
    method __call__ (line 543) | def __call__(self, logits):
    method filter (line 553) | def filter(self, indices):
  function batch_top_tokens (line 567) | def batch_top_tokens(

FILE: server/text_generation_server/utils/watermark.py
  class WatermarkLogitsProcessor (line 26) | class WatermarkLogitsProcessor(LogitsProcessor):
    method __init__ (line 27) | def __init__(
    method _seed_rng (line 40) | def _seed_rng(self, input_ids: Union[List[int], torch.LongTensor]):
    method _get_greenlist_ids (line 55) | def _get_greenlist_ids(
    method _calc_greenlist_mask (line 70) | def _calc_greenlist_mask(
    method _bias_greenlist_logits (line 79) | def _bias_greenlist_logits(
    method __call__ (line 85) | def __call__(

FILE: server/text_generation_server/utils/weights.py
  class WeightsLoader (line 13) | class WeightsLoader(ABC):
    method get_weights (line 25) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 32) | def get_weights_col_packed(
    method get_weights_col (line 52) | def get_weights_col(self, weights: "Weights", prefix: str):
    method get_multi_weights_col (line 60) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_weights_row (line 68) | def get_weights_row(self, weights: "Weights", prefix: str):
  class Weight (line 76) | class Weight(ABC):
    method get_linear (line 81) | def get_linear(self, bias: torch.Tensor):
  class UnquantizedWeight (line 87) | class UnquantizedWeight(Weight):
    method get_linear (line 90) | def get_linear(self, bias: torch.Tensor):
  class DefaultWeightsLoader (line 99) | class DefaultWeightsLoader(WeightsLoader):
    method __init__ (line 102) | def __init__(self, weight_class: Type[UnquantizedWeight]):
    method get_weights (line 114) | def get_weights(self, weights: "Weights", prefix: str):
    method get_weights_col_packed (line 117) | def get_weights_col_packed(
    method get_multi_weights_col (line 129) | def get_multi_weights_col(self, weights: "Weights", prefixes: List[str...
    method get_weights_row (line 133) | def get_weights_row(self, weights: "Weights", prefix: str):
  class Weights (line 139) | class Weights:
    method __init__ (line 140) | def __init__(
    method _get_handle (line 170) | def _get_handle(self, filename):
    method get_filename (line 177) | def get_filename(self, tensor_name: str) -> (str, str):
    method _get_slice (line 194) | def _get_slice(self, tensor_name: str):
    method has_tensor (line 200) | def has_tensor(self, tensor_name: str):
    method get_shape (line 207) | def get_shape(self, tensor_name: str):
    method get_tensor (line 210) | def get_tensor(
    method get_partial_sharded (line 235) | def get_partial_sharded(
    method get_sharded (line 268) | def get_sharded(self, tensor_name: str, dim: int, to_device=True, to_d...
    method get_packed_sharded (line 281) | def get_packed_sharded(
    method get_weights (line 347) | def get_weights(self, prefix: str):
    method get_weights_col_packed_qkv (line 350) | def get_weights_col_packed_qkv(
    method get_weights_col_packed_gate_up (line 360) | def get_weights_col_packed_gate_up(self, prefix: str):
    method get_weights_col_packed (line 363) | def get_weights_col_packed(self, prefix: str, block_sizes: Union[int, ...
    method get_weights_col (line 373) | def get_weights_col(self, prefix: str):
    method get_multi_weights_col (line 376) | def get_multi_weights_col(self, prefixes: List[str], dim: int):
    method get_tensor_shard (line 379) | def get_tensor_shard(self, var, dim):
    method get_weights_row (line 395) | def get_weights_row(self, prefix: str):
    method use_loader (line 399) | def use_loader(self, weights_loader: WeightsLoader):
    method loader (line 413) | def loader(self):
  function _blocks_to_block_sizes (line 417) | def _blocks_to_block_sizes(total_size: int, blocks: Union[int, List[int]...

FILE: update_doc.py
  function check_cli (line 32) | def check_cli(check: bool):
  function check_supported_models (line 85) | def check_supported_models(check: bool):
  function get_openapi_schema (line 126) | def get_openapi_schema():
  function check_openapi (line 138) | def check_openapi(check: bool):
  function main (line 191) | def main():