SYMBOL INDEX (6547 symbols across 355 files)

FILE: examples/experiments/deepseek_v3_pretrain/config/configuration.py
  class DeepseekV2FastConfig (line 23) | class DeepseekV2FastConfig(PretrainedConfig):
    method __init__ (line 132) | def __init__(

FILE: examples/experiments/deepseek_v3_pretrain/convert_ckpt_to_sft.py
  function paddle_name_to_hf_names (line 45) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
  function _handle_expert_weights (line 134) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s...
  function _handle_shared_expert_weights (line 149) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional...
  function _handle_mlp_weights (line 162) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
  function _is_need_transpose (line 172) | def _is_need_transpose(key):
  function prepare_tensor (line 191) | def prepare_tensor(key, value):
  function load_pretrained_ckpt (line 218) | def load_pretrained_ckpt(ckpt_path, output_path):

FILE: examples/experiments/deepseek_v3_pretrain/fp8_linear.py
  function fp8_linear (line 54) | def fp8_linear(
  function register_scale (line 95) | def register_scale(self):
  class Linear (line 109) | class Linear(PD_Linear):
    method __init__ (line 110) | def __init__(self, *args, **kwargs):
  class ColumnParallelLinear (line 116) | class ColumnParallelLinear(PD_ColumnParallelLinear):
    method __init__ (line 117) | def __init__(self, *args, **kwargs):
  class RowParallelLinear (line 123) | class RowParallelLinear(PD_RowParallelLinear):
    method __init__ (line 124) | def __init__(self, *args, **kwargs):
  class ColumnSequenceParallelLinear (line 130) | class ColumnSequenceParallelLinear(PD_ColumnSequenceParallelLinear):
    method __init__ (line 131) | def __init__(self, *args, **kwargs):
  class RowSequenceParallelLinear (line 137) | class RowSequenceParallelLinear(PD_RowSequenceParallelLinear):
    method __init__ (line 138) | def __init__(self, *args, **kwargs):

FILE: examples/experiments/deepseek_v3_pretrain/kernel.py
  function act_quant_kernel (line 30) | def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
  function act_quant (line 51) | def act_quant(x: paddle.Tensor, block_size: int = 128) -> Tuple[paddle.T...
  function weight_dequant_kernel (line 74) | def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.cons...
  function weight_dequant (line 100) | def weight_dequant(x: paddle.Tensor, s: paddle.Tensor, block_size: int =...
  function fp8_gemm_kernel (line 130) | def fp8_gemm_kernel(
  function fp8_gemm (line 190) | def fp8_gemm(a: paddle.Tensor, a_s: paddle.Tensor, b: paddle.Tensor, b_s...

FILE: examples/experiments/deepseek_v3_pretrain/load_hf_ckpt.py
  function paddle_name_to_hf_names_ds_v2 (line 53) | def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]:
  function paddle_name_to_hf_names (line 128) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
  function _get_hf_prefix (line 196) | def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str:
  function _handle_expert_weights (line 206) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s...
  function _handle_shared_expert_weights (line 221) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional...
  function _handle_mlp_weights (line 234) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
  function prepare_tensor (line 244) | def prepare_tensor(tensor, dst_shape, *, force_transpose=False):
  function load_huggingface_ckpt (line 274) | def load_huggingface_ckpt(model, huggingface_ckpt_path):

FILE: examples/experiments/deepseek_v3_pretrain/modeling.py
  function swiglu (line 109) | def swiglu(x, y=None):
  function get_use_casual_mask (line 133) | def get_use_casual_mask():
  function set_global_step (line 138) | def set_global_step(cur_step):
  function get_global_step (line 143) | def get_global_step():
  function rms_norm_fused (line 148) | def rms_norm_fused(x_in, w, eps, use_fast_ln=False):
  function cast_if_needed (line 156) | def cast_if_needed(x, dtype):
  function fusion_rms_norm (line 163) | def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln...
  class LMHeadFunction (line 186) | class LMHeadFunction(paddle.autograd.PyLayer):
    method forward (line 188) | def forward(ctx, x, weight, transpose_y):
    method backward (line 195) | def backward(ctx, dout):
  function parallel_matmul (line 225) | def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_para...
  class DeepseekV2MLP (line 255) | class DeepseekV2MLP(nn.Layer):
    method __init__ (line 256) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, int...
    method forward (line 302) | def forward(self, x):
  class MoEGate (line 308) | class MoEGate(PretrainedMoEGate):
    method __init__ (line 309) | def __init__(
    method forward (line 355) | def forward(self, hidden_states):
  class DeepseekV2MoE (line 407) | class DeepseekV2MoE(MoELayer):
    method __init__ (line 412) | def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, nor...
    method fp8_quant_weight (line 491) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
    method forward (line 554) | def forward(self, hidden_states):
    method post_process (line 579) | def post_process(self, hidden_states, final_hidden_states, l_aux):
  class DeepseekV2RotaryEmbedding (line 590) | class DeepseekV2RotaryEmbedding(nn.Layer):
    method __init__ (line 591) | def __init__(self, dim, max_position_embeddings=2048, base=10000):
    method _set_cos_sin_cache (line 606) | def _set_cos_sin_cache(self, seq_len):
    method forward (line 619) | def forward(self, x, seq_len=None):
  class DeepseekV2Attention (line 632) | class DeepseekV2Attention(nn.Layer):
    method __init__ (line 635) | def __init__(self, config: DeepseekV2FastConfig, layerwise_recompute: ...
    method fp8_quant_weight (line 745) | def fp8_quant_weight(self, quant_transpose=None):
    method _init_rope (line 752) | def _init_rope(self):
    method _shape (line 784) | def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
    method forward (line 787) | def forward(
  class DeepseekV2DecoderLayer (line 932) | class DeepseekV2DecoderLayer(nn.Layer):
    method __init__ (line 933) | def __init__(
    method fp8_quant_weight (line 974) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
    method forward (line 983) | def forward(
    method self_attn_compute (line 1081) | def self_attn_compute(self, hidden_states, **kwargs):
    method pre_dispatch_compute (line 1131) | def pre_dispatch_compute(self, hidden_states):
    method expert_forward_compute (line 1138) | def expert_forward_compute(self, intermediate_hidden_states, dispatche...
    method post_combine_compute (line 1151) | def post_combine_compute(self, residual, hidden_states, final_hidden_s...
  class DeepseekV2MTPLayer (line 1166) | class DeepseekV2MTPLayer(DeepseekV2DecoderLayer):
    method __init__ (line 1167) | def __init__(
    method forward (line 1179) | def forward(
  class DeepseekV2PretrainedModelFast (line 1216) | class DeepseekV2PretrainedModelFast(PretrainedModel):
    method _get_model_flops (line 1221) | def _get_model_flops(self, batch_size=1, seq_length=None, **kwargs):
    method _get_hardware_flops (line 1234) | def _get_hardware_flops(self, *args, **kwargs):
    method _get_name_mappings (line 1238) | def _get_name_mappings(cls, config: DeepseekV2FastConfig) -> list[Stat...
    method _get_tensor_parallel_mappings (line 1298) | def _get_tensor_parallel_mappings(cls, config: DeepseekV2FastConfig, i...
    method _init_weights (line 1398) | def _init_weights(self, layer):
    method step_flex_token (line 1459) | def step_flex_token(self, cur_step):
  class DeepseekV2ModelFast (line 1464) | class DeepseekV2ModelFast(DeepseekV2PretrainedModelFast):
    method __init__ (line 1472) | def __init__(self, config: DeepseekV2FastConfig):
    method get_input_embeddings (line 1502) | def get_input_embeddings(self):
    method set_input_embeddings (line 1505) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1509) | def _prepare_decoder_attention_mask(attention_mask, input_shape, past_...
    method recompute_training_full (line 1544) | def recompute_training_full(
    method forward (line 1575) | def forward(
  class DeepseekV2PretrainingCriterionFast (line 1786) | class DeepseekV2PretrainingCriterionFast(nn.Layer):
    method __init__ (line 1792) | def __init__(self, config: DeepseekV2FastConfig):
    method forward (line 1803) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  function yarn_find_correction_dim (line 1853) | def yarn_find_correction_dim(num_rotations, dim, base=10000, max_positio...
  function yarn_find_correction_range (line 1858) | def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_p...
  function yarn_linear_ramp_mask (line 1864) | def yarn_linear_ramp_mask(min, max, dim):
  class DeepseekV2YarnRotaryEmbedding (line 1873) | class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
    method __init__ (line 1874) | def __init__(
    method _set_cos_sin_cache (line 1894) | def _set_cos_sin_cache(self, seq_len):
  class RmsNormFunction (line 1928) | class RmsNormFunction(paddle.autograd.PyLayer):
    method forward (line 1930) | def forward(ctx, x, scale, epsilon):
    method backward (line 1939) | def backward(ctx, grad_output):
  class DeepseekV2RMSNorm (line 1953) | class DeepseekV2RMSNorm(nn.Layer):
    method __init__ (line 1954) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps...
    method forward (line 1978) | def forward(self, hidden_states):
    method extra_repr (line 1991) | def extra_repr(self):
  function apply_rotary_pos_emb (line 1995) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion...
  class FusedNormGateFunc (line 2049) | class FusedNormGateFunc(paddle.autograd.PyLayer):
    method set_temporary_vars (line 2056) | def set_temporary_vars(cls, norm_output, invar):
    method clear_temporary_vars (line 2061) | def clear_temporary_vars(cls):
    method forward (line 2066) | def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps):
    method backward (line 2076) | def backward(ctx, d_gate_logits, d_norm_output):
  class TemporaryVarContext (line 2100) | class TemporaryVarContext:
    method __init__ (line 2101) | def __init__(self, norm_output, invar):
    method __enter__ (line 2105) | def __enter__(self):
    method __exit__ (line 2108) | def __exit__(self, exc_type, exc_val, exc_tb):
  function balance_expert_assignment (line 2112) | def balance_expert_assignment(n, m, k):
  class FakeGate (line 2123) | class FakeGate(paddle.autograd.PyLayer):
    method forward (line 2125) | def forward(ctx, hidden_states, weight, fakse_gate_restrict_balance=Fa...
    method backward (line 2141) | def backward(ctx, grad_output):
  class AddAuxiliaryLoss (line 2145) | class AddAuxiliaryLoss(paddle.autograd.PyLayer):
    method forward (line 2152) | def forward(ctx, x, loss):
    method backward (line 2158) | def backward(ctx, grad_output):
  function qkv_pre_process_no_fuse (line 2166) | def qkv_pre_process_no_fuse(
  function rearrange_kv (line 2203) | def rearrange_kv(kv, k_pe, qk_nope_head_dim, num_heads):
  function enable_to_static (line 2214) | def enable_to_static(value):
  function qkv_pre_process (line 2223) | def qkv_pre_process(
  function manul_fwd (line 2266) | def manul_fwd(
  class MemroyRecomputeAttnFunc (line 2318) | class MemroyRecomputeAttnFunc(paddle.autograd.PyLayer):
    method forward (line 2320) | def forward(
    method backward (line 2515) | def backward(ctx, dout):
  class MemroyRecomputeAttn (line 2828) | class MemroyRecomputeAttn(paddle.nn.Layer):
    method __init__ (line 2829) | def __init__(
    method fp8_quant_weight (line 2907) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 2911) | def forward(self, q_init, kv_init, position_ids):
  class FusedRMSLinearFunc (line 2941) | class FusedRMSLinearFunc(paddle.autograd.PyLayer):
    method forward (line 2943) | def forward(ctx, x, rms_norm_weight, q_down_weight, kv_down_weight, eps):
    method backward (line 2964) | def backward(ctx, d_q, d_kv):
  class FusedRMSLinear (line 3014) | class FusedRMSLinear(paddle.nn.Layer):
    method __init__ (line 3015) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
    method fp8_quant_weight (line 3039) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 3042) | def forward(self, x):
  class FusedRMSLinearSingleFunc (line 3047) | class FusedRMSLinearSingleFunc(paddle.autograd.PyLayer):
    method forward (line 3049) | def forward(ctx, x, rms_norm_weight, linear_weight, eps):
    method backward (line 3058) | def backward(ctx, d_q, d_kv):
  class FusedRMSLinearSingle (line 3069) | class FusedRMSLinearSingle(paddle.nn.Layer):
    method __init__ (line 3070) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
    method forward (line 3087) | def forward(self, x):
  class FastCrossEntropyFunction (line 3092) | class FastCrossEntropyFunction(paddle.autograd.PyLayer):
    method forward (line 3094) | def forward(ctx, preds, labels):
    method backward (line 3102) | def backward(ctx, dout):
  class DeepseekV2LMHead (line 3112) | class DeepseekV2LMHead(nn.Layer):
    method __init__ (line 3113) | def __init__(self, config: DeepseekV2FastConfig, embedding_weight=None):
    method forward (line 3149) | def forward(self, hidden_states, tensor_parallel_output=None):
    method extra_repr (line 3171) | def extra_repr(self):

FILE: examples/experiments/deepseek_v3_pretrain/modeling_pp.py
  function check_accept_none_grad (line 81) | def check_accept_none_grad():
  function parse_args (line 97) | def parse_args(args):
  function return_args (line 127) | def return_args(hidden_states, attention_mask=None, attn_mask_startend_r...
  function get_attr (line 142) | def get_attr(layer, name):
  function calc_stream_wait (line 149) | def calc_stream_wait(group_id):
  class TensorMeta (line 154) | class TensorMeta:
    method __init__ (line 157) | def __init__(self, tensor):
  class PostProcessNode (line 162) | class PostProcessNode(ScheduleNode):
    method __init__ (line 163) | def __init__(
    method forward_without_residual (line 187) | def forward_without_residual(self, inputs):
    method forward (line 231) | def forward(self, inputs):
    method backward (line 277) | def backward(self, output_grad):
  class DecoderLayerNode (line 333) | class DecoderLayerNode(ScheduleNode):
    method __init__ (line 334) | def __init__(
    method dispatch_forward (line 363) | def dispatch_forward(self, inputs, previous_event=None, allocate_on_co...
    method combine_forward (line 409) | def combine_forward(self, inputs, previous_event=None):
    method dispatch_backward (line 425) | def dispatch_backward(self, output_grad):
    method combine_backward (line 464) | def combine_backward(self, output_grad):
    method forward (line 491) | def forward(self, inputs):
    method backward (line 511) | def backward(self, output_grad=None, scaler=None):
  class OverlapedScheduleChunk (line 534) | class OverlapedScheduleChunk:
    method __init__ (line 535) | def __init__(self, forward_nodes, backward_nodes, use_fuion=True):
    method forward_backward (line 546) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  class DecoderBackwardScheduleChunk (line 559) | class DecoderBackwardScheduleChunk:
    method __init__ (line 560) | def __init__(self, nodes):
    method backward (line 563) | def backward(self, output_grad, combine_bw_event_to_wait=None, pp_stre...
  class OverlapedScheduleNode (line 573) | class OverlapedScheduleNode:
    method __init__ (line 574) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 580) | def forward_backward(self, inputs, output_grad, event_to_wait=None):
  class FusionFp8DecoderLayerNode (line 608) | class FusionFp8DecoderLayerNode(ScheduleNode):
    method __init__ (line 609) | def __init__(
    method attn_forward (line 633) | def attn_forward(self, inputs):
    method dispatch_forward (line 669) | def dispatch_forward(self, inputs, previous_event=None, async_finish=F...
    method mlp_forward (line 697) | def mlp_forward(self, inputs):
    method combine_forward (line 736) | def combine_forward(self, inputs, async_finish=False, previous_event=N...
    method post_process_forward (line 762) | def post_process_forward(self, inputs, with_residual=True):
    method post_process_backward (line 785) | def post_process_backward(self, output_grad, event_to_wait=None):
    method combine_backward (line 819) | def combine_backward(self, output_grad, previous_event=None, async_fin...
    method mlp_backward (line 877) | def mlp_backward(self, output_grad):
    method dispatch_backward (line 909) | def dispatch_backward(self, output_grad, async_finish=False, previous_...
    method attn_backward (line 958) | def attn_backward(self, output_grad):
    method backward_for_fusion (line 1016) | def backward_for_fusion(self, output_grad, combine_bw_event_to_wait=No...
    method forward (line 1087) | def forward(self, inputs):
    method backward (line 1097) | def backward(self, output_grad=None, scaler=None):
  class DenseDecoderLayerNode (line 1108) | class DenseDecoderLayerNode(ScheduleNode):
    method __init__ (line 1109) | def __init__(
    method forward (line 1119) | def forward(self, inputs):
    method backward (line 1124) | def backward(self, output_grad=None, scaler=None):
  class OverlapedFUsionScheduleNode (line 1131) | class OverlapedFUsionScheduleNode:
    method __init__ (line 1132) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 1140) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  class OverlapedDenseFusionScheduleNode (line 1276) | class OverlapedDenseFusionScheduleNode:
    method __init__ (line 1277) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 1286) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  function build_overlapped_nodes (line 1372) | def build_overlapped_nodes(config: DeepseekV2FastConfig, forward_chunk, ...
  class EmbeddingFunction (line 1436) | class EmbeddingFunction(paddle.autograd.PyLayer):
    method forward (line 1438) | def forward(ctx, x, weight):
    method backward (line 1447) | def backward(ctx, dout):
  class DeepseekV2EmbeddingPipe (line 1458) | class DeepseekV2EmbeddingPipe(nn.Layer):
    method __init__ (line 1459) | def __init__(self, config: DeepseekV2FastConfig):
    method embedding_weight (line 1474) | def embedding_weight(self):
    method forward (line 1477) | def forward(self, args):
    method build_schedule_node (line 1557) | def build_schedule_node(self):
  class DeepseekV2DecoderLayerPipe (line 1561) | class DeepseekV2DecoderLayerPipe(DeepseekV2DecoderLayer):
    method forward (line 1562) | def forward(self, args):
    method attn_compute (line 1621) | def attn_compute(self, args):
    method attn_compute_for_fusion (line 1656) | def attn_compute_for_fusion(self, args):
    method mlp_compute (line 1694) | def mlp_compute(self, inputs):
    method post_process_compute (line 1741) | def post_process_compute(self, inputs):
    method post_process_compute_for_fusion (line 1777) | def post_process_compute_for_fusion(self, inputs):
    method attn_compute_dense (line 1802) | def attn_compute_dense(self, args):
    method mlp_compute_dense (line 1820) | def mlp_compute_dense(self, inputs):
    method build_schedule_node (line 1834) | def build_schedule_node(self):
  class DeepseekV2MTPLayerPipe (line 1900) | class DeepseekV2MTPLayerPipe(DeepseekV2MTPLayer):
    method forward (line 1901) | def forward(self, args):
    method attn_compute_for_fusion (line 1969) | def attn_compute_for_fusion(self, args):
    method build_schedule_node (line 2016) | def build_schedule_node(self):
  class DeepseekV2RMSNormPipe (line 2035) | class DeepseekV2RMSNormPipe(nn.Layer):
    method __init__ (line 2036) | def __init__(self, config):
    method forward (line 2041) | def forward(self, args):
    method build_schedule_node (line 2056) | def build_schedule_node(self):
  class DeepseekV2LMHeadPipe (line 2060) | class DeepseekV2LMHeadPipe(DeepseekV2LMHead):
    method __init__ (line 2061) | def __init__(self, config, embedding_weight=None):
    method embedding_weight (line 2065) | def embedding_weight(self):
    method forward (line 2068) | def forward(self, args: Union[Tuple, paddle.Tensor]):
    method build_schedule_node (line 2078) | def build_schedule_node(self):
  class DeepseekV2PretrainingCriterionPipe (line 2082) | class DeepseekV2PretrainingCriterionPipe(DeepseekV2PretrainingCriterionF...
    method forward (line 2083) | def forward(self, logits, labels):
    method build_schedule_node (line 2094) | def build_schedule_node(self):
  class DeepseekV2ForCausalLMPipe (line 2098) | class DeepseekV2ForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method step_flex_token (line 2117) | def step_flex_token(self, cur_step):
    method _prepare_pipeline_inputs_func (line 2121) | def _prepare_pipeline_inputs_func(cls, inputs):
    method __init__ (line 2144) | def __init__(self, config: DeepseekV2FastConfig):
    method fp8_quant_weight (line 2309) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
    method get_loss_fn (line 2322) | def get_loss_fn(self, config):
    method overlapped_forward_backward (line 2325) | def overlapped_forward_backward(

FILE: examples/experiments/deepseek_v3_pretrain/moe_gate.py
  class PretrainedMoEGate (line 29) | class PretrainedMoEGate(nn.Layer, MoEGateMixin):
    method __init__ (line 30) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
    method _priority (line 69) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle....
    method _topk_greedy (line 91) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle....
    method _topk_group_limited_greedy (line 106) | def _topk_group_limited_greedy(
    method _topk_noaux_tc (line 138) | def _topk_noaux_tc(
    method top1gating (line 175) | def top1gating(
    method top2gating (line 245) | def top2gating(
    method _cal_seq_aux_loss (line 321) | def _cal_seq_aux_loss(self, gates, top_k, topk_idx) -> paddle.Tensor:
    method topkgating (line 359) | def topkgating(
    method topkgating_nodrop (line 438) | def topkgating_nodrop(self, gates: paddle.Tensor):

FILE: examples/experiments/deepseek_v3_pretrain/moe_layer.py
  function record_stream_for_multi_input (line 58) | def record_stream_for_multi_input(x):
  function stop_gradient_for_multi_input (line 66) | def stop_gradient_for_multi_input(x):
  class MoELayer (line 73) | class MoELayer(nn.Layer):
    method __init__ (line 74) | def __init__(
    method update_flex_token (line 149) | def update_flex_token(self):
    method _parse_moe_expert_parallel (line 165) | def _parse_moe_expert_parallel(self, n_routed_experts, expert_model_pa...
    method _post_init (line 175) | def _post_init(self):
    method forward (line 186) | def forward(
    method forward_drop_token (line 207) | def forward_drop_token(
    method expert_forward (line 326) | def expert_forward(self, dispatched_input):
    method forward_flex_token (line 337) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,...
    method get_tokens_per_expert (line 380) | def get_tokens_per_expert(self):
    method set_tokens_per_expert (line 383) | def set_tokens_per_expert(self, tokens_per_expert_list):
    method pre_dispatch_compute (line 386) | def pre_dispatch_compute(self, hidden_states):
    method post_dispatch_compute (line 394) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis...
    method pre_combine_compute (line 400) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p...
    method post_combine_compute (line 406) | def post_combine_compute(self, hidden_states):
  class MoEFlexTokenLayer (line 411) | class MoEFlexTokenLayer(nn.Layer):
    method __init__ (line 412) | def __init__(self, config, n_routed_experts, expert_class, expert_kwar...
    method expert_forward (line 428) | def expert_forward(self, dispatched_input, tokens_per_expert):
    method forward (line 440) | def forward(self, hidden_states: paddle.Tensor):
    method forward_flex_token (line 451) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,...
    method get_tokens_per_expert (line 494) | def get_tokens_per_expert(self):
    method set_tokens_per_expert (line 497) | def set_tokens_per_expert(self, tokens_per_expert_list):
    method pre_dispatch_compute (line 500) | def pre_dispatch_compute(self, hidden_states):
    method post_dispatch_compute (line 508) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis...
    method pre_combine_compute (line 514) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p...
    method post_combine_compute (line 520) | def post_combine_compute(self, hidden_states):
  class Fp8DispatchQuantNode (line 525) | class Fp8DispatchQuantNode:
    method __init__ (line 526) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, name="fp8_...
    method forward (line 533) | def forward(self, hidden_states, probs, routing_map):
    method backward (line 561) | def backward(self, hs_grad, token_probs_grad):
  class Fp8DispatchNode (line 573) | class Fp8DispatchNode:
    method __init__ (line 574) | def __init__(self, token_dispatcher, name="fp8_dispatch_node"):
    method forward (line 580) | def forward(
    method backward (line 609) | def backward(
  class Fp8CombineNode (line 628) | class Fp8CombineNode:
    method __init__ (line 629) | def __init__(self, token_dispatcher, name="fp8_combine_node"):
    method forward (line 635) | def forward(self, hidden_states_out, previous_event=None, async_finish...
    method backward (line 650) | def backward(self, output_combine_grad, previous_event=None, async_fin...
  class Fp8CombineQuantNode (line 661) | class Fp8CombineQuantNode:
    method __init__ (line 662) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, moe_group=...
    method forward (line 669) | def forward(self, output_combine):
    method backward (line 678) | def backward(self, output_grad, event_to_wait=None):
  class FusionMlpNode (line 704) | class FusionMlpNode:
    method __init__ (line 709) | def __init__(
    method set_recompute_fwd_gate_up (line 745) | def set_recompute_fwd_gate_up(self, recompute_fwd_gate_up):
    method reset_statue (line 748) | def reset_statue(self):
    method prepare_env_subbatch (line 773) | def prepare_env_subbatch(self, unzipped_tokens=None, unzipped_tokens_s...
    method gemm_forward_subbatch (line 790) | def gemm_forward_subbatch(
    method gemm_backward_subbatch (line 825) | def gemm_backward_subbatch(
    method forward (line 886) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro...
    method backward (line 1024) | def backward(self, hidden_states_out_grad):
  class FusionMoeNode (line 1127) | class FusionMoeNode:
    method __init__ (line 1128) | def __init__(
    method forward (line 1161) | def forward(self, hidden_states, probs, routing_map):
    method backward (line 1189) | def backward(self, output_grad):
  class FusionMoe (line 1204) | class FusionMoe(paddle.autograd.PyLayer):
    method forward (line 1206) | def forward(
    method backward (line 1225) | def backward(ctx, output_grad):

FILE: examples/experiments/deepseek_v3_pretrain/moe_utils.py
  function _clear_to_zero_allocation (line 29) | def _clear_to_zero_allocation(self):
  function _holder_size (line 45) | def _holder_size(self):
  function topk_to_permuted_indices (line 57) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk):
  function permute_fast (line 69) | def permute_fast(
  function unpermute_fast (line 90) | def unpermute_fast(
  class UnZipNode (line 132) | class UnZipNode:
    method __init__ (line 133) | def __init__(self, name="unzip"):
    method reset_statue (line 138) | def reset_statue(self):
    method forward (line 143) | def forward(
    method backward (line 189) | def backward(self, dx, total_zipped_tokens, probs_grad, dispatched_ind...
  class ZipNode (line 203) | class ZipNode:
    method __init__ (line 204) | def __init__(self, name="zip"):
    method forward (line 208) | def forward(
    method backward (line 218) | def backward(
  class PermuteNode (line 264) | class PermuteNode:
    method __init__ (line 265) | def __init__(self, token_dispatcher, name="permute"):
    method reset_status (line 269) | def reset_status(self):
    method forward (line 273) | def forward(self, hidden_states, hidden_states_scale, dispatched_indic...
    method backward (line 287) | def backward(self, out_grad, dispatched_probs):
  class UnPermuteNode (line 300) | class UnPermuteNode:
    method __init__ (line 301) | def __init__(self, token_dispatcher, name="unpermute"):
    method reset_status (line 305) | def reset_status(self):
    method forward (line 314) | def forward(
    method backward (line 352) | def backward(self, out_grad, out_grad_scale):
  function tokens_zip_unique_add_with_subbatch (line 383) | def tokens_zip_unique_add_with_subbatch(zipped, unzipped, index_unzipped...
  function merge_subbatch_cast (line 409) | def merge_subbatch_cast(x, dtype):
  function get_env_device (line 420) | def get_env_device():

FILE: examples/experiments/deepseek_v3_pretrain/run_pretrain.py
  class PreTrainingArguments (line 65) | class PreTrainingArguments(TrainingArguments):
    method __post_init__ (line 122) | def __post_init__(self):
  class DataArguments (line 180) | class DataArguments:
  class ModelArguments (line 213) | class ModelArguments:
  function create_pretrained_dataset (line 244) | def create_pretrained_dataset(
  function get_train_data_file (line 318) | def get_train_data_file(args):
  class PretrainingTrainer (line 343) | class PretrainingTrainer(Trainer):
    method __init__ (line 344) | def __init__(self, *args, **kwargs):
    method evaluate (line 348) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method _get_eval_sampler (line 388) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 398) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
  function main (line 409) | def main():

FILE: examples/experiments/deepseek_v3_pretrain/token_dispatcher.py
  class _DeepepManager (line 30) | class _DeepepManager(_DispatchManager):
    method __init__ (line 52) | def __init__(
    method setup_metadata (line 73) | def setup_metadata(self, routing_map: paddle.Tensor, probs: paddle.Ten...
    method dispatch (line 81) | def dispatch(
    method _indices_to_multihot (line 93) | def _indices_to_multihot(self, indices, probs):
    method get_dispatched_metadata (line 118) | def get_dispatched_metadata(self) -> paddle.Tensor:
    method get_number_of_tokens_per_expert (line 121) | def get_number_of_tokens_per_expert(self) -> paddle.Tensor:
    method combine (line 127) | def combine(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
    method get_permuted_hidden_states_by_experts (line 133) | def get_permuted_hidden_states_by_experts(self, hidden_states: paddle....
    method get_permuted_hidden_states_by_experts_fast (line 145) | def get_permuted_hidden_states_by_experts_fast(
    method get_restored_hidden_states_by_experts (line 155) | def get_restored_hidden_states_by_experts(self, hidden_states: paddle....
    method get_restored_hidden_states_by_experts_fast (line 167) | def get_restored_hidden_states_by_experts_fast(
  class MoETokenDispatcher (line 186) | class MoETokenDispatcher:
    method __init__ (line 191) | def __init__(self, ep_group) -> None:
    method ep_group (line 198) | def ep_group(self):
    method ep_size (line 203) | def ep_size(self):
    method token_permutation (line 208) | def token_permutation(self, tokens: paddle.Tensor, probs: paddle.Tenso...
    method token_unpermutation (line 222) | def token_unpermutation(self, expert_output: paddle.Tensor, bias: padd...
  class MoEFlexTokenDispatcher (line 235) | class MoEFlexTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 240) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m...
    method token_permutation (line 252) | def token_permutation(
    method token_unpermutation (line 265) | def token_unpermutation(
  class MoEFlexTokenDispatcherFast (line 276) | class MoEFlexTokenDispatcherFast:
    method __init__ (line 281) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m...
    method ep_group (line 294) | def ep_group(self):
    method ep_size (line 299) | def ep_size(self):
    method pre_dispatch (line 303) | def pre_dispatch(self, hidden_states, probs, routing_map):
    method post_dispatch (line 313) | def post_dispatch(self, hidden_states, dispatched_indices):
    method pre_combine (line 321) | def pre_combine(self, hidden_states, token_permuted_indices, prob_perm...
    method post_combine (line 327) | def post_combine(self, hidden_states):
    method token_permutation (line 331) | def token_permutation(
    method token_unpermutation (line 349) | def token_unpermutation(
  class PreDispatchNode (line 367) | class PreDispatchNode:
    method __init__ (line 368) | def __init__(self, token_dispatcher):
    method reset_status (line 372) | def reset_status(self):
    method forward (line 378) | def forward(self, routing_map, probs):
    method backward (line 393) | def backward(self, token_probs_g):

FILE: examples/experiments/ernie_pretrain/ernie/config.py
  function get_config (line 24) | def get_config(verbose=False):

FILE: examples/experiments/ernie_pretrain/ernie/model_config.py
  class ModelConfig (line 22) | class ModelConfig:

FILE: examples/experiments/ernie_pretrain/ernie/pretrain.py
  function log_trainer_start (line 76) | def log_trainer_start():
  function load_huggingface_checkpoint (line 86) | def load_huggingface_checkpoint(model, args):
  function get_expected_state_dict (line 181) | def get_expected_state_dict(model, **kwargs):
  function update_model_config_from_args (line 249) | def update_model_config_from_args(config: ErnieMoEConfig, model_args: di...
  function get_tp_split_ckpt (line 259) | def get_tp_split_ckpt(args, path):
  class AllArguments (line 271) | class AllArguments(PreTrainingArguments):
    method __post_init__ (line 272) | def __post_init__(self):
  class ExpConfig (line 277) | class ExpConfig:
  function create_pretrained_dataset (line 283) | def create_pretrained_dataset(args):
  function main (line 329) | def main():

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/fp8_quant_weight_callback.py
  function enable_in_dict_config (line 25) | def enable_in_dict_config(config, key):
  class FP8QuantWeightCallback (line 32) | class FP8QuantWeightCallback(TrainerCallback):
    method on_step_begin (line 33) | def on_step_begin(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/gc_callback.py
  class GCCallback (line 20) | class GCCallback(TrainerCallback):
    method on_train_begin (line 21) | def on_train_begin(self, args, state, control, **kwargs):
    method on_step_end (line 25) | def on_step_end(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/logging_callback.py
  class LoggingCallback (line 22) | class LoggingCallback(TrainerCallback):
    method __init__ (line 23) | def __init__(
    method on_log (line 28) | def on_log(self, args, state, control, logs=None, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/moe_correction_bias_adjust_callback.py
  class MoECorrectionBiasAdjustCallback (line 28) | class MoECorrectionBiasAdjustCallback(TrainerCallback):
    method __init__ (line 29) | def __init__(self, lr, use_sp):
    method on_optimizer_end (line 34) | def on_optimizer_end(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/moe_logging_callback.py
  function tensor_md5 (line 45) | def tensor_md5(tensor):
  class GlobalRNGCallback (line 51) | class GlobalRNGCallback(TrainerCallback):
    method on_step_end (line 52) | def on_step_end(self, args, state, control, model, **kwargs):
  class MoeLoggingCallback (line 57) | class MoeLoggingCallback(TrainerCallback):
    method __init__ (line 58) | def __init__(self, optimizer):
    method on_log (line 69) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_step_end (line 75) | def on_step_end(self, args, state, control, model, **kwargs):
    method on_save (line 114) | def on_save(self, args, state, control, model, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/ortho_loss_callback.py
  class OrthogonalCallback (line 22) | class OrthogonalCallback(TrainerCallback):
    method __init__ (line 23) | def __init__(self, ortho_loss_lambda):
    method on_optimizer_end (line 26) | def on_optimizer_end(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/sp_grad_sync_callback.py
  class SPGradSyncCallback (line 28) | class SPGradSyncCallback(TrainerCallback):
    method __init__ (line 29) | def __init__(self, model):
    method on_optimizer_begin (line 41) | def on_optimizer_begin(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/tensorboard_callback.py
  function is_tensorboard_available (line 29) | def is_tensorboard_available():
  function rewrite_logs (line 33) | def rewrite_logs(d):
  class TensorBoardCallback (line 49) | class TensorBoardCallback(TrainerCallback):
    method __init__ (line 50) | def __init__(
    method _init_summary_writer (line 91) | def _init_summary_writer(self, args, log_dir=None):
    method on_train_begin (line 96) | def on_train_begin(self, args, state, control, **kwargs):
    method on_log (line 120) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_train_end (line 183) | def on_train_end(self, args, state, control, **kwargs):

FILE: examples/experiments/ernie_pretrain/ernie/src/clip/moe_clip.py
  class ClipGradForMOEByGlobalNorm (line 28) | class ClipGradForMOEByGlobalNorm(ClipGradBase):
    method __init__ (line 29) | def __init__(
    method __str__ (line 49) | def __str__(self):
    method get_l2_norm_pow (line 53) | def get_l2_norm_pow(params_grads, sum_dtype=None):
    method _dygraph_clip (line 101) | def _dygraph_clip(self, params_grads):

FILE: examples/experiments/ernie_pretrain/ernie/src/lr_schedulers/cosine_lr.py
  function get_cosine_schedule_with_warmup (line 24) | def get_cosine_schedule_with_warmup(

FILE: examples/experiments/ernie_pretrain/ernie/src/lr_schedulers/wsd_lr.py
  function get_wsd_schedule_with_warmup (line 20) | def get_wsd_schedule_with_warmup(

FILE: examples/experiments/ernie_pretrain/ernie/src/tokenizers/tokenization_eb_v2.py
  class ErnieBotTokenizer (line 29) | class ErnieBotTokenizer(PretrainedTokenizer):
    method __init__ (line 40) | def __init__(
    method space_token (line 72) | def space_token(self):
    method space_token_id (line 76) | def space_token_id(self):
    method gend_token (line 80) | def gend_token(self):
    method gend_token_id (line 84) | def gend_token_id(self):
    method im_start_id (line 88) | def im_start_id(self):
    method im_end_id (line 92) | def im_end_id(self):
    method vocab_size (line 96) | def vocab_size(self):
    method get_vocab (line 99) | def get_vocab(self):
    method _tokenize (line 104) | def _tokenize(self, text):
    method _convert_token_to_id (line 107) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 110) | def _convert_id_to_token(self, id):
    method convert_tokens_to_string (line 113) | def convert_tokens_to_string(self, tokens):
    method prepare_for_model (line 126) | def prepare_for_model(self, *args, **kwargs):
    method save_vocabulary (line 131) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st...
    method tokenize (line 147) | def tokenize(self, text: TextInput, **kwargs) -> List[str]:
    method _decode (line 169) | def _decode(self, *args, **kwargs):
    method _pad (line 179) | def _pad(
  function add_special_tokens (line 239) | def add_special_tokens(

FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/data_parallel.py
  class DataParallel (line 22) | class DataParallel(paddle.DataParallel):
    method init_reducer (line 23) | def init_reducer(self):
  function sync_dp_moe_params_across_sharding (line 74) | def sync_dp_moe_params_across_sharding(model: paddle.nn.Layer) -> None:

FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/dygraph_optimizer/hybrid_parallel_optimizer.py
  class HybridParallelClipGrad (line 37) | class HybridParallelClipGrad:
    method __init__ (line 38) | def __init__(self, clip, hcg, timers=None):
    method _global_norm (line 53) | def _global_norm(
    method _dygraph_clip (line 142) | def _dygraph_clip(self, params_grads):
    method _comm_and_clip (line 277) | def _comm_and_clip(
    method __getattr__ (line 330) | def __getattr__(self, item):
    method __call__ (line 333) | def __call__(self, params_grads):
  class HybridParallelOptimizer (line 337) | class HybridParallelOptimizer(HPBase):
    method __init__ (line 338) | def __init__(self, optimizer, hcg, strategy):

FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/pretraining_trainer.py
  function distributed_optimizer_maybe_overwrite (line 97) | def distributed_optimizer_maybe_overwrite(
  class PreTrainingArguments (line 122) | class PreTrainingArguments(TrainingArguments):
    method use_moe (line 274) | def use_moe(self):  # noqa: F811
    method use_moe (line 278) | def use_moe(self, value):
    method need_data (line 283) | def need_data(self):
    method combine_batch (line 287) | def combine_batch(self):
    method reeao_dataset_rank (line 291) | def reeao_dataset_rank(self):
    method reeao_dataset_world_size (line 295) | def reeao_dataset_world_size(self):
    method __post_init__ (line 298) | def __post_init__(self):
  class WeightedDistributedSampler (line 412) | class WeightedDistributedSampler(PaddleNLPDistributedBatchSampler):
    method __init__ (line 413) | def __init__(
    method set_epoch (line 459) | def set_epoch(self, epoch=0, consumed_samples=0):
    method gen_data_seq (line 464) | def gen_data_seq(self):
    method load_data_seq_from_cache (line 477) | def load_data_seq_from_cache(self):
    method gen_data_seq_weighted (line 490) | def gen_data_seq_weighted(self, num_examples, data_type=None):
    method roundup_and_shard (line 568) | def roundup_and_shard(self, indices):
    method __len__ (line 590) | def __len__(self):
    method __iter__ (line 593) | def __iter__(self):
  class DummySampler (line 661) | class DummySampler(PaddleNLPDistributedBatchSampler):
    method __init__ (line 662) | def __init__(self, dataset, batch_size=1, **kwargs):
    method __len__ (line 665) | def __len__(self):
    method __iter__ (line 668) | def __iter__(self):
  class PretrainingTrainer (line 673) | class PretrainingTrainer(Trainer):
    method __init__ (line 674) | def __init__(self, args=None, model=None, callbacks=[], **kwargs):
    method autocast_smart_context_manager (line 695) | def autocast_smart_context_manager(self):
    method _load_optimizer_state (line 727) | def _load_optimizer_state(self, checkpoint):
    method _save_moe_weights (line 776) | def _save_moe_weights(self, output_dir):
    method _wrap_model (line 823) | def _wrap_model(self, model, training=True):
    method _new_gradclip (line 990) | def _new_gradclip(self):
    method evaluate (line 1036) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method prediction_pipeline_step (line 1067) | def prediction_pipeline_step(self, model, inputs, prediction_loss_only...
    method restore_dataloader_status (line 1073) | def restore_dataloader_status(self):
    method _get_eval_sampler (line 1118) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 1128) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
    method _maybe_log_save_evaluate (line 1138) | def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_...
    method create_scheduler (line 1302) | def create_scheduler(self, num_training_steps):
    method create_optimizer (line 1326) | def create_optimizer(self, lr_scheduler=None):
    method save_model (line 1388) | def save_model(self, output_dir=None):
    method _load_rng_state (line 1394) | def _load_rng_state(self, checkpoint):

FILE: examples/experiments/ernie_pretrain/ernie/src/utils/logging.py
  function setup_logger_output_file (line 41) | def setup_logger_output_file(outputpath, local_rank):

FILE: examples/experiments/ernie_pretrain/ernie/src/utils/misc.py
  class SmoothedValue (line 39) | class SmoothedValue:
    method __init__ (line 40) | def __init__(
    method update (line 49) | def update(self, value):
    method global_avg (line 60) | def global_avg(self):
    method reset (line 63) | def reset(self):
  class TrainingLogs (line 68) | class TrainingLogs:
    method __new__ (line 71) | def __new__(cls, *args, **kw):
    method __init__ (line 76) | def __init__(self):
    method set_trainer_interval (line 84) | def set_trainer_interval(self, trainer, logging_interval):
    method global_meters_keys (line 89) | def global_meters_keys(self):
    method global_meters_keys (line 93) | def global_meters_keys(self, lst):
    method enable_skip_zero (line 96) | def enable_skip_zero(self, keys=[]):
    method update (line 104) | def update(self, **kwargs):
    method is_enabled (line 108) | def is_enabled(self):
    method __setitem__ (line 111) | def __setitem__(self, k, v):
    method __getitem__ (line 119) | def __getitem__(self, v):
    method __getattr__ (line 122) | def __getattr__(self, attr):
    method dict (line 129) | def dict(self, use_async=False):
    method reset (line 183) | def reset(self):
    method take_snapshot (line 188) | def take_snapshot(self):
    method restore_snapshot (line 191) | def restore_snapshot(self):

FILE: examples/experiments/ernie_pretrain/ernie/src/utils/seed_utils.py
  function set_seed (line 26) | def set_seed(seed):

FILE: examples/experiments/ernie_pretrain/ernie/src/utils/training_utils.py
  function reset_per_device_batch_size (line 20) | def reset_per_device_batch_size(global_batch_size, per_device_train_batc...

FILE: examples/experiments/ernie_pretrain/models/comm_utils.py
  function scatter (line 33) | def scatter(input, group=None, axis=0):
  function mp_slice (line 51) | def mp_slice(x, indices=None, group=None, axis=0):
  function all_gather_varlen (line 68) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True):
  function scatter_varlen (line 90) | def scatter_varlen(x, recv_tensor, indices, src_rank, group, sync_op=True):
  function all_gather (line 112) | def all_gather(input, group=None, axis=0):
  function reduce_scatter (line 131) | def reduce_scatter(input, group=None):
  function subbatch (line 148) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar...
  function gather_varlen (line 193) | def gather_varlen(input, dst, group, offload_pp_data_chunk_size=0, all_s...
  function profile (line 293) | def profile(name, use_event=True):

FILE: examples/experiments/ernie_pretrain/models/ernie/configuration.py
  class ErnieMoEConfig (line 60) | class ErnieMoEConfig(PretrainedConfig):
    method __init__ (line 72) | def __init__(
    method __setattr__ (line 402) | def __setattr__(self, name: str, value):
    method register_nonsaveable_keys (line 413) | def register_nonsaveable_keys(self, keys):
    method use_moe (line 422) | def use_moe(self) -> bool:
    method to_json_string (line 425) | def to_json_string(self, use_diff: bool = True) -> str:

FILE: examples/experiments/ernie_pretrain/models/ernie/modeling.py
  function get_triangle_upper_mask (line 127) | def get_triangle_upper_mask(x, mask=None):
  function gqa_qkv_split_func (line 139) | def gqa_qkv_split_func(
  function gqa_qkv_merge_func (line 169) | def gqa_qkv_merge_func(weight_list, num_attention_heads, num_key_value_h...
  function parallel_matmul (line 190) | def parallel_matmul(
  function calc_lm_head_logits (line 231) | def calc_lm_head_logits(config, hidden_states, weight, bias, tensor_para...
  function finfo (line 261) | def finfo(dtype: paddle.dtype = None):
  function masked_fill (line 279) | def masked_fill(x, mask, value):
  function mem_eff_attn (line 284) | def mem_eff_attn(query, key, value, pack_offset, drop_prob=0.0, dtype=pa...
  function inbatch_pack_offset_to_attn_mask_start_row_indices (line 321) | def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offs...
  function scaled_dot_product_attention (line 336) | def scaled_dot_product_attention(
  function _make_causal_mask (line 454) | def _make_causal_mask(input_ids_shape, past_key_values_length, dtype):
  function _expand_mask (line 468) | def _expand_mask(mask, dtype, tgt_length):
  class FusedDropoutImpl (line 483) | class FusedDropoutImpl(nn.Layer):
    method __init__ (line 484) | def __init__(self, prob, mode):
    method forward (line 491) | def forward(self, x, y):
  class RMSNorm (line 499) | class RMSNorm(nn.Layer):
    method __init__ (line 500) | def __init__(self, config):
    method forward (line 514) | def forward(self, hidden_states):
  class RotaryEmbedding (line 530) | class RotaryEmbedding(nn.Layer):
    method __init__ (line 531) | def __init__(self, dim, max_position_embeddings=4096, base=10000):
    method forward (line 547) | def forward(self, x, seq_len=None):
    method rotate_half (line 555) | def rotate_half(cls, x):
    method apply_rotary_pos_emb (line 562) | def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, positio...
  class RopeEmbeddingLegacy (line 580) | class RopeEmbeddingLegacy(nn.Layer):
    method __init__ (line 581) | def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_a...
    method forward (line 588) | def forward(self, seq_length, position_ids=None):
    method apply_rotary (line 604) | def apply_rotary(self, rp, q, k):
    method apply_rotary_3d (line 626) | def apply_rotary_3d(self, rp, q, k, position_ids):
    method forward_single (line 694) | def forward_single(self, position_ids):
    method apply_rotary_single (line 709) | def apply_rotary_single(x, rope_emb):
  class ErnieMLP (line 717) | class ErnieMLP(nn.Layer):
    method __init__ (line 718) | def __init__(self, config):
    method forward (line 798) | def forward(self, x):
  class ErnieAttention (line 835) | class ErnieAttention(nn.Layer):
    method __init__ (line 836) | def __init__(self, config, layer_idx=0):
    method forward (line 997) | def forward(
    method rope_attn (line 1095) | def rope_attn(
  class ErnieDecoderLayer (line 1195) | class ErnieDecoderLayer(nn.Layer):
    method __init__ (line 1196) | def __init__(self, config, layer_idx=0):
    method forward (line 1210) | def forward(
  class ErniePretrainedModel (line 1265) | class ErniePretrainedModel(PretrainedModel):
    method _get_name_mappings (line 1270) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa...
    method _get_tensor_parallel_mappings (line 1341) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _init_weights (line 1426) | def _init_weights(self, layer):
  class ErnieModel (line 1474) | class ErnieModel(ErniePretrainedModel):
    method __init__ (line 1475) | def __init__(self, config: ErnieMoEConfig):
    method get_input_embeddings (line 1502) | def get_input_embeddings(self):
    method set_input_embeddings (line 1505) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1509) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ...
    method recompute_training (line 1528) | def recompute_training(
    method forward (line 1558) | def forward(
  class FusedHeadParallelCrossEntropy (line 1692) | class FusedHeadParallelCrossEntropy(PyLayer):
    method forward (line 1694) | def forward(
    method backward (line 1812) | def backward(ctx, loss_all_grad, labels_all_grad):
  class ErniePretrainingCriterion (line 1930) | class ErniePretrainingCriterion(paddle.nn.Layer):
    method __init__ (line 1931) | def __init__(self, config, return_tuple=True):
    method forward (line 1946) | def forward(self, prediction_scores, masked_lm_labels):
    method forward_impl_with_fused_head_loss_fn (line 2002) | def forward_impl_with_fused_head_loss_fn(self, masked_lm_labels, hidde...
    method forward_impl_with_calc_logits (line 2037) | def forward_impl_with_calc_logits(self, masked_lm_labels, hidden_state...
    method loss_impl (line 2049) | def loss_impl(self, prediction_scores, masked_lm_labels):
    method forward_impl (line 2055) | def forward_impl(self, prediction_scores, masked_lm_labels):
  class ErnieLMHead (line 2110) | class ErnieLMHead(nn.Layer):
    method __init__ (line 2111) | def __init__(self, config):
    method forward (line 2150) | def forward(self, hidden_states, tensor_parallel_output=None):
    method sharded_state_dict (line 2169) | def sharded_state_dict(
  class ErnieForCausalLM (line 2178) | class ErnieForCausalLM(ErniePretrainedModel):
    method __init__ (line 2181) | def __init__(self, config):
    method _post_init (line 2213) | def _post_init(self, original_init, *args, **kwargs):
    method get_input_embeddings (line 2222) | def get_input_embeddings(self):
    method set_input_embeddings (line 2225) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 2228) | def get_output_embeddings(self):
    method set_output_embeddings (line 2231) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 2234) | def set_decoder(self, decoder):
    method get_decoder (line 2237) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 2241) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method prepare_inputs_for_generation (line 2254) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 2285) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 2323) | def forward(
    method sharded_state_dict (line 2382) | def sharded_state_dict(self, *args, **kwargs):

FILE: examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py
  class BaseModelOutputWithPastAndCrossAttentions (line 96) | class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput):
  class CausalLMOutputWithCrossAttentions (line 103) | class CausalLMOutputWithCrossAttentions(_CausalLMOutput):
  function get_gate (line 123) | def get_gate(
  function build_mpdp_group (line 177) | def build_mpdp_group():
  function _parse_moe_group (line 198) | def _parse_moe_group(
  function moe_ep2mp (line 245) | def moe_ep2mp(state_dict: Dict[str, paddle.Tensor], config: ErnieMoEConf...
  function moe_statedict_cherry_pick (line 294) | def moe_statedict_cherry_pick(state_dict: Dict[str, paddle.Tensor], conf...
  function moe_statedict_upcycle (line 319) | def moe_statedict_upcycle(
  class ErnieMoeMLP (line 491) | class ErnieMoeMLP(ErnieMLP):
    method __init__ (line 492) | def __init__(self, config, is_shared_expert=False):
    method forward (line 504) | def forward(self, x, use_comm=True):
  class ErnieMoeDenseExpert (line 566) | class ErnieMoeDenseExpert(nn.Layer):
    method __init__ (line 567) | def __init__(self, config):
    method forward (line 615) | def forward(self, x):
  class BMMLinear (line 642) | class BMMLinear(nn.Layer):
    method __init__ (line 643) | def __init__(self, experts, d_in, d_out, use_bias=False):
    method forward (line 651) | def forward(self, x):
  class ErnieMoeMLPFused (line 657) | class ErnieMoeMLPFused(nn.Layer):
    method __init__ (line 658) | def __init__(self, config):
    method __len__ (line 676) | def __len__(self):
    method __iter__ (line 679) | def __iter__(self):
    method forward (line 682) | def forward(self, x):
  class FusedLinearAddNormFunc (line 692) | class FusedLinearAddNormFunc(paddle.autograd.PyLayer):
    method forward (line 694) | def forward(ctx, x, residual, linear_weight, rms_norm_weight, eps):
    method backward (line 704) | def backward(ctx, d_rms_norm_out, d_residual_out):
  class FusedLinearAddNorm (line 723) | class FusedLinearAddNorm(paddle.nn.Layer):
    method __init__ (line 724) | def __init__(self, hidden_size, eps=1e-6) -> None:
    method forward (line 742) | def forward(self, x, residual):
  class FusedRMSLinearFunc (line 746) | class FusedRMSLinearFunc(paddle.autograd.PyLayer):
    method forward (line 748) | def forward(ctx, x, rms_norm_weight, linear_weight, eps):
    method backward (line 756) | def backward(ctx, d_qkv):
  class FusedRMSLinear (line 766) | class FusedRMSLinear(paddle.nn.Layer):
    method __init__ (line 767) | def __init__(self, hidden_size, eps=1e-6, num_heads=1, num_key_value_h...
    method forward (line 786) | def forward(self, x):
  class ErnieMoEAttention (line 790) | class ErnieMoEAttention(ErnieAttention):
    method __init__ (line 791) | def __init__(self, config, layer_idx):
    method forward (line 814) | def forward(
  class FakeMoERouterLoss (line 934) | class FakeMoERouterLoss(PyLayer):
    method forward (line 936) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss):
    method backward (line 944) | def backward(ctx, out_grad):
  class ErnieDecoderLayer (line 953) | class ErnieDecoderLayer(nn.Layer):
    method __init__ (line 954) | def __init__(self, config, layer_idx):
    method training (line 1040) | def training(self):
    method training (line 1044) | def training(self, new):
    method fp8_quant_weight (line 1050) | def fp8_quant_weight(self):
    method _init_gate_and_experts (line 1055) | def _init_gate_and_experts(self, layer_idx):
    method _init_shared_experts (line 1106) | def _init_shared_experts(self):
    method _init_dense_experts (line 1124) | def _init_dense_experts(self, layer_idx):
    method forward (line 1147) | def forward(
    method model_parallel_dropout (line 1231) | def model_parallel_dropout(self):
  class ErniePretrainedModel (line 1238) | class ErniePretrainedModel(PretrainedModel):
    method _get_name_mappings (line 1243) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa...
    method _get_tensor_parallel_mappings (line 1313) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _init_weights (line 1431) | def _init_weights(self, layer):
  class ErnieModel (line 1510) | class ErnieModel(ErniePretrainedModel):
    method __init__ (line 1511) | def __init__(self, config: ErnieMoEConfig):
    method get_input_embeddings (line 1576) | def get_input_embeddings(self):
    method set_input_embeddings (line 1579) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1583) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ...
    method recompute_training (line 1602) | def recompute_training(
    method forward (line 1657) | def forward(
  class ErniePretrainingCriterion (line 1890) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase):
    method __init__ (line 1891) | def __init__(self, config, return_tuple=True):
    method forward (line 1906) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  class ErnieMoEForCausalLM (line 1959) | class ErnieMoEForCausalLM(ErniePretrainedModel):
    method __init__ (line 1962) | def __init__(self, config):
    method _post_init (line 1987) | def _post_init(self, original_init, *args, **kwargs):
    method set_state_dict (line 2019) | def set_state_dict(self, state_dict, *args, **kwargs):
    method get_input_embeddings (line 2037) | def get_input_embeddings(self):
    method set_input_embeddings (line 2040) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 2043) | def get_output_embeddings(self):
    method set_output_embeddings (line 2046) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 2049) | def set_decoder(self, decoder):
    method get_decoder (line 2052) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 2056) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method prepare_inputs_for_generation (line 2069) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 2101) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 2140) | def forward(
    method sharded_state_dict (line 2209) | def sharded_state_dict(self, *args, **kwargs):

FILE: examples/experiments/ernie_pretrain/models/ernie/modeling_pp.py
  class ErnieEmbeddingPipe (line 90) | class ErnieEmbeddingPipe(nn.Layer):
    method __init__ (line 91) | def __init__(self, config):
    method embedding_weight (line 107) | def embedding_weight(self):
    method forward (line 110) | def forward(self, args):
  class MTPEmbeddingPipe (line 204) | class MTPEmbeddingPipe(ErnieEmbeddingPipe):
    method __init__ (line 205) | def __init__(self, config):
    method embedding_weight (line 209) | def embedding_weight(self):
    method forward (line 212) | def forward(self, args):
  class EmptyLayer (line 231) | class EmptyLayer(nn.Layer):
    method __init__ (line 232) | def __init__(self):
    method forward (line 235) | def forward(self, x):
  class ErnieDecoderLayerPipe (line 239) | class ErnieDecoderLayerPipe(ErnieDecoderLayer):
    method __init__ (line 240) | def __init__(self, config, layer_idx, use_full_recompute=False):
    method forward (line 247) | def forward(self, args):
  class RMSNormPipe (line 351) | class RMSNormPipe(RMSNorm):
    method __init__ (line 352) | def __init__(self, config):
    method forward (line 357) | def forward(self, args):
  class ErnieMoELMHeadPipe (line 385) | class ErnieMoELMHeadPipe(ErnieMoELMHead):
    method forward (line 386) | def forward(self, args):
  class MTPLayer (line 397) | class MTPLayer(nn.Layer):
    method __init__ (line 398) | def __init__(self, config):
    method forward (line 432) | def forward(self, args):
    method forward_impl (line 441) | def forward_impl(self, *args):
  class ErniePretrainingCriterionPipe (line 518) | class ErniePretrainingCriterionPipe(ErniePretrainingCriterion):
    method __init__ (line 519) | def __init__(self, config):
    method forward (line 522) | def forward(self, logits, labels):
  class PipelinePretrainedModel (line 536) | class PipelinePretrainedModel(PretrainedModel):
    method __init__ (line 537) | def __init__(self, config, *args, **kwargs):
    method init (line 541) | def init(self, config, *args, **kwargs):
    method add_sequential_layer (line 546) | def add_sequential_layer(self, layer_desc, name_prefix=""):
    method get_sequential_layers (line 549) | def get_sequential_layers(self):
    method get_sequential_name_prefixs (line 552) | def get_sequential_name_prefixs(self):
    method get_shardlayer_prefix (line 555) | def get_shardlayer_prefix(self, name_splited):
    method _set_pipeline_name_mapping (line 566) | def _set_pipeline_name_mapping(self, mappings=None):
    method _check_shared_model_state (line 626) | def _check_shared_model_state(self):
    method state_dict (line 647) | def state_dict(self, *args, **kwargs):
    method _init_weights (line 659) | def _init_weights(self, layer):
    method sharded_state_dict (line 731) | def sharded_state_dict(self, *args, **kwargs):
  function get_pp_vp_split_layers (line 765) | def get_pp_vp_split_layers(config):
  class ErnieMoEForCausalLMPipe (line 797) | class ErnieMoEForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method _prepare_pipeline_inputs_func (line 810) | def _prepare_pipeline_inputs_func(cls, data):
    method __init__ (line 844) | def __init__(
    method get_loss_fn (line 995) | def get_loss_fn(self, config):
    method rename_model_params (line 998) | def rename_model_params(self, func):
    method fp8_quant_weight (line 1005) | def fp8_quant_weight(self):
    method _post_init (line 1011) | def _post_init(self, original_init, *args, **kwargs):
    method set_state_dict (line 1028) | def set_state_dict(self, state_dict, *args, **kwargs):

FILE: examples/experiments/ernie_pretrain/models/fp8_linear.py
  function fp8_gemm (line 44) | def fp8_gemm(
  function padding (line 94) | def padding(x, axis):
  class Fp8FusedMlpFunc (line 118) | class Fp8FusedMlpFunc(paddle.autograd.PyLayer):
    method forward (line 128) | def forward(ctx, x, w1, w2):
    method backward (line 208) | def backward(ctx, do3):
  class MemEfficientFp8FusedMlpFunc (line 320) | class MemEfficientFp8FusedMlpFunc(paddle.autograd.PyLayer):
    method forward (line 333) | def forward(ctx, x, w1, w2):
    method backward (line 393) | def backward(ctx, do3):
  class Fp8FusedMlp (line 515) | class Fp8FusedMlp(paddle.nn.Layer):
    method __init__ (line 526) | def __init__(self, config):
    method forward (line 557) | def forward(self, x):

FILE: examples/experiments/ernie_pretrain/models/moe/moe_layer.py
  function set_grad_in_dtype_non_consistent (line 65) | def set_grad_in_dtype_non_consistent(ctx):
  class Fp8MoeGateDispatchAndQuant (line 71) | class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer):
    method forward (line 75) | def forward(
    method backward (line 127) | def backward(ctx, *grads):
  function recompute_fwd_gate_up_func (line 146) | def recompute_fwd_gate_up_func(config, layer_idx):
  class MoEStatics (line 156) | class MoEStatics(nn.Layer):
    method __init__ (line 157) | def __init__(self, config, layer_idx):
  class GateCombine (line 188) | class GateCombine(PyLayer):
    method forward (line 190) | def forward(ctx, x, combine_weights, scatter_index):
    method backward (line 198) | def backward(ctx, grad_y, *_):
  class FusionFP8Expert (line 207) | class FusionFP8Expert(paddle.autograd.PyLayer):
    method forward (line 209) | def forward(ctx, hidden_states, custom_map):
    method backward (line 226) | def backward(ctx, output_grad):
  class AlltoAll (line 237) | class AlltoAll(PyLayer):
    method forward (line 239) | def forward(ctx, x, group, sync_op=True):
    method backward (line 252) | def backward(ctx, *dx):
  class AlltoAllExpertOverlap (line 256) | class AlltoAllExpertOverlap(PyLayer):
    method forward (line 258) | def forward(ctx, input, group, num_local_experts, forward_func_dict, i...
    method backward (line 294) | def backward(ctx, out_grad):
  class AlltoAllAsync (line 313) | class AlltoAllAsync(PyLayer):
    method forward (line 315) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False):
    method backward (line 336) | def backward(ctx, dx_out, *fn_out_grads):
  function dispatching (line 356) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity):
  function combining_fused (line 383) | def combining_fused(x, combine_weights, scatter_index, hard_gate=False):
  class ReshapeKeepGradDtype (line 392) | class ReshapeKeepGradDtype(PyLayer):
    method forward (line 394) | def forward(ctx, x, shape):
    method backward (line 400) | def backward(ctx, grad):
  class MOELayer (line 404) | class MOELayer(nn.Layer):
    method __init__ (line 412) | def __init__(
    method forward_experts (line 515) | def forward_experts(self, dispatched_input):
    method fp8_quant_weight (line 551) | def fp8_quant_weight(self):
    method fused_gate_logits_process (line 582) | def fused_gate_logits_process(self, gate_logits, token_type_ids, offlo...
    method gate_distpach_and_quant (line 595) | def gate_distpach_and_quant(self, input, token_type_ids):
    method gate_and_distpach (line 680) | def gate_and_distpach(self, input, token_type_ids):
    method _calc_router_loss (line 792) | def _calc_router_loss(
    method calc_router_loss_and_logging (line 823) | def calc_router_loss_and_logging(
    method combine_expert_output (line 846) | def combine_expert_output(self, expert_output, combine_weights, scatte...
    method forward_single_stage (line 854) | def forward_single_stage(self, dispatched_input, stage_id):
    method all2all_expert_overlap (line 858) | def all2all_expert_overlap(self, x, group):
    method forward (line 885) | def forward(
    method sharded_state_dict (line 1059) | def sharded_state_dict(
  class FP8FusedWLCHFunc (line 1071) | class FP8FusedWLCHFunc(paddle.autograd.PyLayer):
    method forward (line 1073) | def forward(
    method backward (line 1126) | def backward(ctx, output_grad):
  class MlpNode (line 1141) | class MlpNode:
    method __init__ (line 1142) | def __init__(self, custom_map, max_topk, recompute_fwd_gate_up=False, ...
    method reset_status (line 1157) | def reset_status(self):
    method release_mem (line 1165) | def release_mem(self):
    method forward (line 1170) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro...
    method backward (line 1211) | def backward(self, hidden_states_out_grad):
  class Fp8FusedMoeFunc (line 1235) | class Fp8FusedMoeFunc(paddle.autograd.PyLayer):
    method forward (line 1237) | def forward(
    method backward (line 1261) | def backward(ctx, output_grad):

FILE: examples/experiments/ernie_pretrain/models/moe/token_dispatcher/fp8_utils.py
  function _get_fp8_weight_and_scale (line 43) | def _get_fp8_weight_and_scale(weight, stacked=False, transpose=False):
  function fused_stack_transpose_quant (line 63) | def fused_stack_transpose_quant(weight_list, transpose=False):
  function split_group_gemm (line 81) | def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, ...
  function has_config (line 119) | def has_config(config_map, key):
  class ExpertsGroupGemmNode (line 123) | class ExpertsGroupGemmNode:
    method __init__ (line 136) | def __init__(self, experts, custom_map, name="moe_experts_node"):
    method reset_status (line 160) | def reset_status(self):
    method fwd_gate_up (line 166) | def fwd_gate_up(self, x_bf16, expert_w1, expert_w_count, tokens_per_ex...
    method fwd_swiglu (line 223) | def fwd_swiglu(self, o1):
    method fwd_down (line 239) | def fwd_down(self, o1, unzipped_probs, expert_w_count, tokens_per_expe...
    method fwd_down_no_probs (line 300) | def fwd_down_no_probs(self, o1, expert_w2, expert_w_count, tokens_per_...
    method bwd_down_input (line 358) | def bwd_down_input(self, expert_w2, unzipped_grad, tokens_per_expert, ...
    method bwd_down_input_no_prob (line 428) | def bwd_down_input_no_prob(self, expert_w2, unzipped_grad, tokens_per_...
    method bwd_swiglu (line 468) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 485) | def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, expecte...
    method bwd_down_weight (line 543) | def bwd_down_weight(self, out_grad, o2, expert_w2):
    method bwd_gate_up_weight (line 627) | def bwd_gate_up_weight(self, do1, input_x, expert_w1):
    method forward (line 706) | def forward(self, hs_out, unzipped_probs, tokens_per_expert):
    method backward (line 725) | def backward(self, out_grad, tokens_per_expert, dispatched_indices, ex...
    method forward_no_prob (line 740) | def forward_no_prob(self, hs_out, tokens_per_expert):
    method backward_no_prob (line 752) | def backward_no_prob(self, out_grad, tokens_per_expert):
  class ExpertsGroupGemmContiguousNode (line 774) | class ExpertsGroupGemmContiguousNode:
    method __init__ (line 787) | def __init__(
    method reset_status (line 832) | def reset_status(self):
    method gen_m_indices (line 841) | def gen_m_indices(self, tokens_per_expert):
    method fwd_gate_up (line 862) | def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, sca...
    method fwd_swiglu (line 936) | def fwd_swiglu(self, o1):
    method fwd_down (line 940) | def fwd_down(self, o1, unzipped_probs, expert_w2, num_expert):
    method bwd_down_input (line 1006) | def bwd_down_input(self, expert_w2, unzipped_grad, o1):
    method bwd_swiglu (line 1083) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 1087) | def bwd_gate_up_input(self, do1, expert_w1):
    method fused_transpose_split_quant (line 1144) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal...
    method bwd_down_weight (line 1169) | def bwd_down_weight(self, do3, o2, expert_w2):
    method bwd_gate_up_weight (line 1245) | def bwd_gate_up_weight(self, do1, input_x, expert_w1):
    method forward (line 1309) | def forward(
    method backward (line 1333) | def backward(self, out_grad, a2a_async_fn=None):
  class ExpertsGroupGemmWLCHNode (line 1426) | class ExpertsGroupGemmWLCHNode(ExpertsGroupGemmContiguousNode):
    method __init__ (line 1442) | def __init__(
    method gen_m_indices (line 1478) | def gen_m_indices(self, tokens_per_expert):
    method fused_transpose_split_quant (line 1498) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal...

FILE: examples/experiments/ernie_pretrain/models/moe/token_dispatcher/moe_utils.py
  function inplace_offload (line 24) | def inplace_offload(x):
  function inplace_offload_if_needed (line 41) | def inplace_offload_if_needed(x, threshold=2 * 1024 * 1024 * 1024):
  function topk_to_permuted_indices_single (line 61) | def topk_to_permuted_indices_single(x, num_tokens, expert_id, topk):
  function topk_to_permuted_indices (line 81) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk):
  function permute (line 105) | def permute(
  function unpermute (line 128) | def unpermute(
  class UnZipNode (line 163) | class UnZipNode:
    method __init__ (line 178) | def __init__(self, token_dispatcher, name="unzip"):
    method reset_status (line 190) | def reset_status(self):
    method forward (line 196) | def forward(
    method backward (line 240) | def backward(self, dx, hidden_states_out_grad, probs_grad, dispatched_...
  class ZipNode (line 268) | class ZipNode:
    method __init__ (line 281) | def __init__(self, token_dispatcher, name="zip"):
    method forward (line 292) | def forward(
    method backward (line 326) | def backward(

FILE: examples/experiments/ernie_pretrain/models/moe/top2_gate.py
  function cal_aux_loss_func (line 35) | def cal_aux_loss_func(
  function masked_fill (line 85) | def masked_fill(x, mask, value):
  class CalAuxLossFunctor (line 90) | class CalAuxLossFunctor(paddle.autograd.PyLayer):
    method forward (line 92) | def forward(
    method backward (line 122) | def backward(ctx, out_grad):
  function cast_if_needed (line 130) | def cast_if_needed(x, dtype):
  class FusedGateDetachMatmul (line 134) | class FusedGateDetachMatmul(paddle.autograd.PyLayer):
    method forward (line 136) | def forward(ctx, x, w):
    method backward (line 142) | def backward(ctx, y_grad):
  function gate_detach_matmul (line 155) | def gate_detach_matmul(x, weight, use_fuse):
  function compute_optimal_transport (line 164) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:...
  class Top2Gate (line 178) | class Top2Gate(nn.Layer):
    method __init__ (line 191) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->...
    method _create_gate_parameter (line 259) | def _create_gate_parameter(self):
    method forward (line 269) | def forward(
    method get_capacity (line 301) | def get_capacity(self, num_tokens, cap_factor=None):
    method top2_gating (line 316) | def top2_gating(self, logits, cap=None, correction_bias=None):
    method _cal_aux_loss (line 388) | def _cal_aux_loss(
    method _cal_orthogonal_loss (line 433) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None):
    method _cal_orthogonal_loss_opt_each_weight (line 448) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group):
  function cal_orthogonal_loss_opt_each_weight_func (line 455) | def cal_orthogonal_loss_opt_each_weight_func(weight, moe_k, use_group, e...
  class TopKGateFused (line 473) | class TopKGateFused(Top2Gate):
    method forward (line 474) | def forward(

FILE: examples/experiments/ernie_pretrain/models/sequence_parallel_utils.py
  function get_hcg (line 44) | def get_hcg():
  function get_async_loader (line 51) | def get_async_loader():
  function hack_offload_wait (line 64) | def hack_offload_wait(task):
  function hack_reload_wait (line 68) | def hack_reload_wait(task):
  class ScatterOp (line 72) | class ScatterOp(PyLayer):
    method forward (line 74) | def forward(ctx, input, axis=0, group=None):
    method backward (line 80) | def backward(ctx, grad):
  class GatherOp (line 84) | class GatherOp(PyLayer):
    method forward (line 86) | def forward(ctx, input, axis=0, group=None):
    method backward (line 92) | def backward(ctx, grad):
  class AllGatherOp (line 96) | class AllGatherOp(PyLayer):
    method forward (line 98) | def forward(ctx, input, group=None):
    method backward (line 103) | def backward(ctx, grad):
  class ReduceScatterOp (line 107) | class ReduceScatterOp(PyLayer):
    method forward (line 109) | def forward(ctx, input, group=None):
    method backward (line 115) | def backward(ctx, grad):
  class AllGatherVarlenOp (line 119) | class AllGatherVarlenOp(PyLayer):
    method forward (line 121) | def forward(ctx, input, group=None):
    method backward (line 160) | def backward(ctx, grad):
  class GemmReduceScatterOp (line 174) | class GemmReduceScatterOp(PyLayer):
    method forward (line 176) | def forward(ctx, input, weight, group):
    method backward (line 183) | def backward(ctx, grad):
  class AllGatherGemmOp (line 204) | class AllGatherGemmOp(PyLayer):
    method forward (line 206) | def forward(ctx, input, weight, group):
    method backward (line 214) | def backward(ctx, grad):
  function sequence_parallel_sparse_mask_labels (line 231) | def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100):
  function mark_as_sequence_parallel_parameter (line 247) | def mark_as_sequence_parallel_parameter(parameter):
  function is_sequence_parallel_parameter (line 251) | def is_sequence_parallel_parameter(parameter):
  function create_fused_allreduce_gradient_hook (line 255) | def create_fused_allreduce_gradient_hook(parameter_list, accumulation_st...
  function create_non_fused_allreduce_gradient_hook (line 272) | def create_non_fused_allreduce_gradient_hook(param, model, verbose=False):
  function register_sequence_parallel_allreduce_hooks (line 295) | def register_sequence_parallel_allreduce_hooks(model, fuse_sequence_para...
  function is_fused_matmul_bias_supported (line 318) | def is_fused_matmul_bias_supported():
  class ColumnSequenceParallelLinear (line 334) | class ColumnSequenceParallelLinear(Layer):
    method __init__ (line 335) | def __init__(
    method forward (line 427) | def forward(self, x, use_comm=True):
    method sharded_state_dict (line 447) | def sharded_state_dict(
  class MPScale (line 455) | class MPScale(PyLayer):
    method forward (line 457) | def forward(ctx, x, mp_degree):
    method backward (line 462) | def backward(ctx, dout):
  class RowSequenceParallelLinear (line 466) | class RowSequenceParallelLinear(Layer):
    method __init__ (line 467) | def __init__(
    method forward (line 563) | def forward(self, x):
    method sharded_state_dict (line 594) | def sharded_state_dict(

FILE: examples/experiments/ernie_pretrain/models/utils.py
  function get_global_training_logs (line 31) | def get_global_training_logs():
  function global_training_logs_enabled (line 47) | def global_training_logs_enabled():
  function inplace_offload (line 52) | def inplace_offload(tensor):
  function detach_and_requires_grad_ (line 57) | def detach_and_requires_grad_(*args):
  class FakeClone (line 65) | class FakeClone(paddle.autograd.PyLayer):
    method forward (line 67) | def forward(ctx, input):
    method backward (line 76) | def backward(ctx, grad_output):
  function manual_backward (line 80) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]):
  class FakeGather (line 118) | class FakeGather(paddle.autograd.PyLayer):
    method forward (line 120) | def forward(ctx, input, indices):
    method backward (line 130) | def backward(ctx, grad_output):
  class FusedUnpermutation (line 139) | class FusedUnpermutation(paddle.autograd.PyLayer):
    method forward (line 141) | def forward(
    method backward (line 178) | def backward(ctx, output_tokens_grad):

FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/convert_sharded_to_uc.py
  function parse_args (line 35) | def parse_args():
  function convert_ckpt (line 53) | def convert_ckpt(args):

FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/gather_all_ckpt.py
  function parse_args (line 20) | def parse_args():
  function parse_path (line 39) | def parse_path(args):
  function get_ip_list (line 50) | def get_ip_list(args):
  function gather_ckpt (line 63) | def gather_ckpt(org_path, tgt_path, hostnames, local_host):

FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/merge_sharding_ep.py
  class Timer (line 53) | class Timer:
    method __init__ (line 54) | def __init__(self, name="name"):
    method __enter__ (line 57) | def __enter__(self):
    method __exit__ (line 61) | def __exit__(self, exc_type, exc_val, exc_tb):
  function strtobool (line 66) | def strtobool(s):
  function execute_cmd (line 76) | def execute_cmd(cmd, ignore_error=False):
  function parse_args (line 85) | def parse_args():
  function save_ckpt (line 100) | def save_ckpt(ckpt, save_dir, rank_info, mp_degree, pp_degree=0, ep_degr...
  class Client (line 127) | class Client:
    method __init__ (line 128) | def __init__(self, args, base_path, nproc_per_node=8, nnodes=1, node_r...
    method _get_expert_param_shape (line 174) | def _get_expert_param_shape(self, meta):
    method _expert_id (line 183) | def _expert_id(self, s_name):
    method _global_expert_id (line 191) | def _global_expert_id(self, local_id, ep_rank):
    method _get_num_experts_per_rank (line 194) | def _get_num_experts_per_rank(self):
    method _gen_node_id_map (line 205) | def _gen_node_id_map(self):
    method _modify_expert_id (line 233) | def _modify_expert_id(self, s_name, new_id):
    method merge_and_save (line 243) | def merge_and_save(
    method _merge_sharding_for_dense_params (line 311) | def _merge_sharding_for_dense_params(self, parallel_2_ckpt_map, ignore...
    method _replicate_fused_param (line 334) | def _replicate_fused_param(self, local_params, indices_or_sections, co...
    method _replicate_dense_params (line 346) | def _replicate_dense_params(self, dense_params):
    method _merge_sharding_for_expert_params (line 359) | def _merge_sharding_for_expert_params(self, parallel_2_ckpt_map, ignor...
    method _extend_ep_degree_for_expert_params (line 383) | def _extend_ep_degree_for_expert_params(self, expert_params, dst_ep_de...
    method _get_final_ckpts (line 404) | def _get_final_ckpts(
    method _read_ckpts (line 467) | def _read_ckpts(self, args):
    method _read_ckpt (line 480) | def _read_ckpt(self, mp, pp, sd, include_opt_state):
    method _read_all_ckpts_by_pp_stage (line 483) | def _read_all_ckpts_by_pp_stage(self, pp_stage, include_opt_state=False):
    method _merge_and_save (line 504) | def _merge_and_save(self, mp_rank, save_dir, include_opt_state, ignore...
    method _merge_pp_ckpts (line 549) | def _merge_pp_ckpts(self, rank_info, ckpts, is_opt):
    method _get_param_meta (line 564) | def _get_param_meta(self, mp_rank, ep_rank=None):
    method _merge_sharding_param_ckpts (line 584) | def _merge_sharding_param_ckpts(
    method _concat_crop_reshape (line 620) | def _concat_crop_reshape(self, arrs, shape, name, ignore_sharding_padd...
    method _get_opt_state_key_and_type (line 639) | def _get_opt_state_key_and_type(self, name):
    method _merge_sharding_opt_ckpts (line 664) | def _merge_sharding_opt_ckpts(self, mp_rank, ckpts, ignore_sharding_pa...
    method _cal_ep_rank (line 759) | def _cal_ep_rank(self, sd_rank, mp_rank):
    method load_ckpt (line 764) | def load_ckpt(self, mp_rank, pp_rank, sharding_rank, include_opt_state):
    method weight_suffix (line 821) | def weight_suffix(self, mp_rank, pp_rank, sharding_rank):
    method load_model_meta (line 834) | def load_model_meta(self):
    method move_useful_file (line 841) | def move_useful_file(self, save_dir):
  function merge_and_save (line 851) | def merge_and_save(args):

FILE: examples/experiments/ernie_pretrain/tools/uc_to_sharded/convert_uc_to_sharded.py
  function parse_args (line 31) | def parse_args():
  function find_files (line 40) | def find_files(path, suffixes):
  class Checkpoint (line 56) | class Checkpoint:
    method __init__ (line 57) | def __init__(self, args):
    method map_to_org_model (line 95) | def map_to_org_model(self, layer_name):
    method load_from_org_model (line 104) | def load_from_org_model(self, layer_name):
    method process_one_pdparam (line 122) | def process_one_pdparam(self, pdparam_path):
    method process_pdparams (line 134) | def process_pdparams(self):
    method load_from_org_model_with_tensor_name (line 138) | def load_from_org_model_with_tensor_name(self, tensor_name, structure_...
    method process_one_pdopt (line 184) | def process_one_pdopt(self, pdopt_path):
    method process_pdopts (line 246) | def process_pdopts(self):
  function convert_ckpt (line 251) | def convert_ckpt(args):

FILE: examples/experiments/paddlefleet/glm45_provider.py
  class GLMMoEModelProvider (line 31) | class GLMMoEModelProvider(GPTModelProvider):
  class GLM45ModelProvider355B (line 88) | class GLM45ModelProvider355B(GLMMoEModelProvider):
  class GLM45AirModelProvider106B (line 107) | class GLM45AirModelProvider106B(GLMMoEModelProvider):
  class GLM45AirModelDebugProvider (line 127) | class GLM45AirModelDebugProvider(GLM45AirModelProvider106B):
  class GLM45AirModelDebugProviderFP8 (line 148) | class GLM45AirModelDebugProviderFP8(GLM45AirModelDebugProvider):
  class GLM45AirModelSingleCardDebugProvider (line 154) | class GLM45AirModelSingleCardDebugProvider(GLMMoEModelProvider):

FILE: examples/experiments/paddlefleet/qwen_provider.py
  class Qwen3MoEModelProvider (line 31) | class Qwen3MoEModelProvider(GPTModelProvider):
  class Qwen3MoEModelProvider30B_A3B (line 75) | class Qwen3MoEModelProvider30B_A3B(Qwen3MoEModelProvider):
  class Qwen3MoEModelSingleCardProvider (line 89) | class Qwen3MoEModelSingleCardProvider(Qwen3MoEModelProvider):

FILE: examples/experiments/paddlefleet/run_pretrain.py
  class PreTrainingArguments (line 65) | class PreTrainingArguments(TrainingArguments):
    method __post_init__ (line 118) | def __post_init__(self):
  class DataArguments (line 175) | class DataArguments:
  class ModelArguments (line 208) | class ModelArguments:
  function create_pretrained_dataset (line 246) | def create_pretrained_dataset(
  function get_train_data_file (line 357) | def get_train_data_file(args):
  class PretrainingTrainer (line 382) | class PretrainingTrainer(Trainer):
    method __init__ (line 383) | def __init__(self, *args, **kwargs):
    method evaluate (line 387) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method _get_eval_sampler (line 427) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 437) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
  function _set_random_seed (line 448) | def _set_random_seed(
  function main (line 474) | def main():

FILE: examples/tools/create_pretraining_data.py
  function print_datetime (line 40) | def print_datetime(string):
  function get_args (line 45) | def get_args():
  function lexical_analysis_fn (line 100) | def lexical_analysis_fn():
  function chinese_segmentation_fn (line 112) | def chinese_segmentation_fn():
  function jieba_segmentation_fn (line 124) | def jieba_segmentation_fn():
  function get_whole_word_mask_tokens (line 134) | def get_whole_word_mask_tokens(tokens, words, max_word_length=6):
  class IdentitySplitter (line 199) | class IdentitySplitter(object):
    method tokenize (line 200) | def tokenize(self, *text):
  class NewlineSplitter (line 204) | class NewlineSplitter:
    method tokenize (line 205) | def tokenize(self, text):
  class Converter (line 209) | class Converter(object):
    method __init__ (line 210) | def __init__(self, args):
    method initializer (line 213) | def initializer(self):
    method remove_repeated_chars (line 269) | def remove_repeated_chars(text, max_repeated_len=100):
    method encode (line 284) | def encode(self, json_line):
  function main (line 306) | def main():

FILE: examples/tools/gpt-oss_weight_change/change_weight_dtype.py
  function find_safetensors_files (line 46) | def find_safetensors_files(directory):
  function endswith (line 55) | def endswith(key, prefix_list):
  function save_single_safetenors (line 62) | def save_single_safetenors(save_path, state_dict, rank, total_files_size...
  function fp4_to_bf16 (line 73) | def fp4_to_bf16(load_path, save_path):
  function bf16_to_fp4 (line 96) | def bf16_to_fp4(load_path, save_path):

FILE: examples/tools/merge.py
  function print_datetime (line 25) | def print_datetime(string):
  function merge_sft_datasets (line 30) | def merge_sft_datasets(input_dirs, output_dir):
  function main (line 120) | def main(args):

FILE: examples/tools/trans_paddlenlp2hf.py
  function parse_arguments (line 28) | def parse_arguments():
  function load_safetensors_state_dict (line 44) | def load_safetensors_state_dict(input_dir):
  function trans_paddlenlp2hf (line 61) | def trans_paddlenlp2hf():

FILE: paddleformers/__init__.py
  function compare_version (line 32) | def compare_version(v1, v2):
  function _check_dependency_versions (line 42) | def _check_dependency_versions():

FILE: paddleformers/cli/cli.py
  function main (line 57) | def main():

FILE: paddleformers/cli/export/export.py
  function check_download_repo (line 33) | def check_download_repo(model_name_or_path, download_hub=None):
  function logger_merge_config (line 52) | def logger_merge_config(merge_config, lora_merge):
  function run_export (line 83) | def run_export(args: Optional[dict[str, Any]] = None) -> None:

FILE: paddleformers/cli/hparams/data_args.py
  class DataArguments (line 19) | class DataArguments:

FILE: paddleformers/cli/hparams/export_args.py
  class ExportArguments (line 19) | class ExportArguments:

FILE: paddleformers/cli/hparams/finetuning_args.py
  class PreTrainingArguments (line 28) | class PreTrainingArguments(TrainingArguments):
    method need_data (line 102) | def need_data(self):
    method reeao_dataset_rank (line 120) | def reeao_dataset_rank(self):
    method reeao_dataset_world_size (line 138) | def reeao_dataset_world_size(self):
  class VLSFTTrainingArguments (line 148) | class VLSFTTrainingArguments(PreTrainingArguments):
  class SFTTrainingArguments (line 156) | class SFTTrainingArguments(TrainingArguments):
  class DPOTrainingArguments (line 169) | class DPOTrainingArguments(TrainingArguments):
  class FinetuningArguments (line 232) | class FinetuningArguments(
    method __post_init__ (line 300) | def __post_init__(self):

FILE: paddleformers/cli/hparams/generating_args.py
  class StreamOptions (line 18) | class StreamOptions:
    method __init__ (line 25) | def __init__(self, max_count: int = 100):
  class GeneratingArguments (line 35) | class GeneratingArguments:

FILE: paddleformers/cli/hparams/model_args.py
  class VisionArguments (line 20) | class VisionArguments:
  class FP8MemConfigs (line 38) | class FP8MemConfigs:
  class FP8FusedOpsConfigs (line 47) | class FP8FusedOpsConfigs:
  class ErniePretrainArgument (line 56) | class ErniePretrainArgument:
  class ModelArguments (line 80) | class ModelArguments:
    method __post_init__ (line 234) | def __post_init__(self):

FILE: paddleformers/cli/hparams/parser.py
  function _load_custom_template (line 84) | def _load_custom_template(custom_path):
  function read_args (line 94) | def read_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -...
  function _parse_args (line 115) | def _parse_args(
  function _parse_train_args (line 152) | def _parse_train_args(args: Optional[Union[dict[str, Any], list[str]]] =...
  function _parse_eval_args (line 166) | def _parse_eval_args(args: Optional[Union[dict[str, Any], list[str]]] = ...
  function _parse_server_args (line 180) | def _parse_server_args(args: Optional[Union[dict[str, Any], list[str]]] ...
  function _parse_export_args (line 194) | def _parse_export_args(args: Optional[Union[dict[str, Any], list[str]]] ...
  function get_train_args (line 208) | def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = No...
  function get_eval_args (line 260) | def get_eval_args(args: Optional[Union[dict[str, Any], list[str]]] = Non...
  function get_server_args (line 273) | def get_server_args(args: Optional[Union[dict[str, Any], list[str]]] = N...
  function get_export_args (line 286) | def get_export_args(args: Optional[Union[dict[str, Any], list[str]]] = N...

FILE: paddleformers/cli/hparams/preprocess_args.py
  class BasePreprocessArguments (line 25) | class BasePreprocessArguments:
    method __post_init__ (line 26) | def __post_init__(self):
  class UtteranceProcessorArguments (line 31) | class UtteranceProcessorArguments(BasePreprocessArguments):
    method __post_init__ (line 39) | def __post_init__(self):
  class CoarseProcessorArguments (line 46) | class CoarseProcessorArguments(BasePreprocessArguments):
    method __post_init__ (line 57) | def __post_init__(self):
  class InputIdsMassageArguments (line 64) | class InputIdsMassageArguments(BasePreprocessArguments):
    method __post_init__ (line 92) | def __post_init__(self):
  class ImageModificationProcessorArguments (line 102) | class ImageModificationProcessorArguments(BasePreprocessArguments):
    method __post_init__ (line 112) | def __post_init__(self):
  class End2EndProcessorArgumentsHelper (line 117) | class End2EndProcessorArgumentsHelper(BasePreprocessArguments):
    method __post_init__ (line 124) | def __post_init__(self):
  class End2EndProcessorArguments (line 129) | class End2EndProcessorArguments(
    method __post_init__ (line 136) | def __post_init__(self):

FILE: paddleformers/cli/hparams/server_args.py
  class ServerArguments (line 19) | class ServerArguments:

FILE: paddleformers/cli/launcher.py
  function launch (line 21) | def launch():

FILE: paddleformers/cli/train/auto_parallel/workflow.py
  function create_pretrained_dataset (line 42) | def create_pretrained_dataset(
  function get_train_data_file (line 116) | def get_train_data_file(args):
  class PretrainingTrainer (line 141) | class PretrainingTrainer(Trainer):
    method __init__ (line 142) | def __init__(self, *args, **kwargs):
  function run_auto_parallel (line 147) | def run_auto_parallel(model_args, data_args, generating_args, training_a...

FILE: paddleformers/cli/train/deepseek_v3_pretrain/configuration.py
  class DeepseekV2FastConfig (line 22) | class DeepseekV2FastConfig(PretrainedConfig):
    method __init__ (line 131) | def __init__(

FILE: paddleformers/cli/train/deepseek_v3_pretrain/fp8_linear.py
  function fp8_linear (line 54) | def fp8_linear(
  function register_scale (line 95) | def register_scale(self):
  class Linear (line 109) | class Linear(PD_Linear):
    method __init__ (line 110) | def __init__(self, *args, **kwargs):
  class ColumnParallelLinear (line 116) | class ColumnParallelLinear(PD_ColumnParallelLinear):
    method __init__ (line 117) | def __init__(self, *args, **kwargs):
  class RowParallelLinear (line 123) | class RowParallelLinear(PD_RowParallelLinear):
    method __init__ (line 124) | def __init__(self, *args, **kwargs):
  class ColumnSequenceParallelLinear (line 130) | class ColumnSequenceParallelLinear(PD_ColumnSequenceParallelLinear):
    method __init__ (line 131) | def __init__(self, *args, **kwargs):
  class RowSequenceParallelLinear (line 137) | class RowSequenceParallelLinear(PD_RowSequenceParallelLinear):
    method __init__ (line 138) | def __init__(self, *args, **kwargs):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/kernel.py
  function act_quant_kernel (line 30) | def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
  function act_quant (line 51) | def act_quant(x: paddle.Tensor, block_size: int = 128) -> Tuple[paddle.T...
  function weight_dequant_kernel (line 74) | def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.cons...
  function weight_dequant (line 100) | def weight_dequant(x: paddle.Tensor, s: paddle.Tensor, block_size: int =...
  function fp8_gemm_kernel (line 130) | def fp8_gemm_kernel(
  function fp8_gemm (line 190) | def fp8_gemm(a: paddle.Tensor, a_s: paddle.Tensor, b: paddle.Tensor, b_s...

FILE: paddleformers/cli/train/deepseek_v3_pretrain/modeling.py
  function swiglu (line 110) | def swiglu(x, y=None):
  function get_use_casual_mask (line 134) | def get_use_casual_mask():
  function set_global_step (line 139) | def set_global_step(cur_step):
  function get_global_step (line 144) | def get_global_step():
  function rms_norm_fused (line 149) | def rms_norm_fused(x_in, w, eps, use_fast_ln=False):
  function cast_if_needed (line 157) | def cast_if_needed(x, dtype):
  function fusion_rms_norm (line 164) | def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln...
  class LMHeadFunction (line 187) | class LMHeadFunction(paddle.autograd.PyLayer):
    method forward (line 189) | def forward(ctx, x, weight, transpose_y):
    method backward (line 196) | def backward(ctx, dout):
  function parallel_matmul (line 226) | def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_para...
  class DeepseekV2MLP (line 256) | class DeepseekV2MLP(nn.Layer):
    method __init__ (line 257) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, int...
    method forward (line 303) | def forward(self, x):
  class MoEGate (line 309) | class MoEGate(PretrainedMoEGate):
    method __init__ (line 310) | def __init__(
    method forward (line 356) | def forward(self, hidden_states):
  class DeepseekV2MoE (line 408) | class DeepseekV2MoE(MoELayer):
    method __init__ (line 413) | def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, nor...
    method fp8_quant_weight (line 492) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
    method forward (line 555) | def forward(self, hidden_states):
    method post_process (line 580) | def post_process(self, hidden_states, final_hidden_states, l_aux):
  class DeepseekV2RotaryEmbedding (line 591) | class DeepseekV2RotaryEmbedding(nn.Layer):
    method __init__ (line 592) | def __init__(self, dim, max_position_embeddings=2048, base=10000):
    method _set_cos_sin_cache (line 607) | def _set_cos_sin_cache(self, seq_len):
    method forward (line 620) | def forward(self, x, seq_len=None):
  class DeepseekV2Attention (line 633) | class DeepseekV2Attention(nn.Layer):
    method __init__ (line 636) | def __init__(self, config: DeepseekV2FastConfig, layerwise_recompute: ...
    method fp8_quant_weight (line 746) | def fp8_quant_weight(self, quant_transpose=None):
    method _init_rope (line 753) | def _init_rope(self):
    method _shape (line 785) | def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
    method forward (line 788) | def forward(
  class DeepseekV2DecoderLayer (line 933) | class DeepseekV2DecoderLayer(nn.Layer):
    method __init__ (line 934) | def __init__(
    method fp8_quant_weight (line 975) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
    method forward (line 984) | def forward(
    method self_attn_compute (line 1082) | def self_attn_compute(self, hidden_states, **kwargs):
    method pre_dispatch_compute (line 1132) | def pre_dispatch_compute(self, hidden_states):
    method expert_forward_compute (line 1139) | def expert_forward_compute(self, intermediate_hidden_states, dispatche...
    method post_combine_compute (line 1152) | def post_combine_compute(self, residual, hidden_states, final_hidden_s...
  class DeepseekV2MTPLayer (line 1167) | class DeepseekV2MTPLayer(DeepseekV2DecoderLayer):
    method __init__ (line 1168) | def __init__(
    method forward (line 1180) | def forward(
  class DeepseekV2PretrainedModelFast (line 1217) | class DeepseekV2PretrainedModelFast(PretrainedModel):
    method _get_model_flops (line 1222) | def _get_model_flops(self, batch_size=1, seq_length=None, **kwargs):
    method _get_hardware_flops (line 1235) | def _get_hardware_flops(self, *args, **kwargs):
    method _get_name_mappings (line 1239) | def _get_name_mappings(cls, config: DeepseekV2FastConfig) -> list[Stat...
    method _get_tensor_parallel_mappings (line 1299) | def _get_tensor_parallel_mappings(cls, config: DeepseekV2FastConfig, i...
    method _init_weights (line 1399) | def _init_weights(self, layer):
    method step_flex_token (line 1460) | def step_flex_token(self, cur_step):
  class DeepseekV2ModelFast (line 1465) | class DeepseekV2ModelFast(DeepseekV2PretrainedModelFast):
    method __init__ (line 1473) | def __init__(self, config: DeepseekV2FastConfig):
    method get_input_embeddings (line 1503) | def get_input_embeddings(self):
    method set_input_embeddings (line 1506) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1510) | def _prepare_decoder_attention_mask(attention_mask, input_shape, past_...
    method recompute_training_full (line 1545) | def recompute_training_full(
    method forward (line 1576) | def forward(
  class DeepseekV2PretrainingCriterionFast (line 1787) | class DeepseekV2PretrainingCriterionFast(nn.Layer):
    method __init__ (line 1793) | def __init__(self, config: DeepseekV2FastConfig):
    method forward (line 1804) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  function yarn_find_correction_dim (line 1854) | def yarn_find_correction_dim(num_rotations, dim, base=10000, max_positio...
  function yarn_find_correction_range (line 1859) | def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_p...
  function yarn_linear_ramp_mask (line 1865) | def yarn_linear_ramp_mask(min, max, dim):
  class DeepseekV2YarnRotaryEmbedding (line 1874) | class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
    method __init__ (line 1875) | def __init__(
    method _set_cos_sin_cache (line 1895) | def _set_cos_sin_cache(self, seq_len):
  class RmsNormFunction (line 1929) | class RmsNormFunction(paddle.autograd.PyLayer):
    method forward (line 1931) | def forward(ctx, x, scale, epsilon):
    method backward (line 1940) | def backward(ctx, grad_output):
  class DeepseekV2RMSNorm (line 1954) | class DeepseekV2RMSNorm(nn.Layer):
    method __init__ (line 1955) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps...
    method forward (line 1979) | def forward(self, hidden_states):
    method extra_repr (line 1992) | def extra_repr(self):
  function apply_rotary_pos_emb (line 1996) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion...
  class FusedNormGateFunc (line 2050) | class FusedNormGateFunc(paddle.autograd.PyLayer):
    method set_temporary_vars (line 2057) | def set_temporary_vars(cls, norm_output, invar):
    method clear_temporary_vars (line 2062) | def clear_temporary_vars(cls):
    method forward (line 2067) | def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps):
    method backward (line 2077) | def backward(ctx, d_gate_logits, d_norm_output):
  class TemporaryVarContext (line 2101) | class TemporaryVarContext:
    method __init__ (line 2102) | def __init__(self, norm_output, invar):
    method __enter__ (line 2106) | def __enter__(self):
    method __exit__ (line 2109) | def __exit__(self, exc_type, exc_val, exc_tb):
  function balance_expert_assignment (line 2113) | def balance_expert_assignment(n, m, k):
  class FakeGate (line 2124) | class FakeGate(paddle.autograd.PyLayer):
    method forward (line 2126) | def forward(ctx, hidden_states, weight, fakse_gate_restrict_balance=Fa...
    method backward (line 2142) | def backward(ctx, grad_output):
  class AddAuxiliaryLoss (line 2146) | class AddAuxiliaryLoss(paddle.autograd.PyLayer):
    method forward (line 2153) | def forward(ctx, x, loss):
    method backward (line 2159) | def backward(ctx, grad_output):
  function qkv_pre_process_no_fuse (line 2167) | def qkv_pre_process_no_fuse(
  function rearrange_kv (line 2204) | def rearrange_kv(kv, k_pe, qk_nope_head_dim, num_heads):
  function enable_to_static (line 2215) | def enable_to_static(value):
  function qkv_pre_process (line 2224) | def qkv_pre_process(
  function manul_fwd (line 2267) | def manul_fwd(
  class MemroyRecomputeAttnFunc (line 2319) | class MemroyRecomputeAttnFunc(paddle.autograd.PyLayer):
    method forward (line 2321) | def forward(
    method backward (line 2516) | def backward(ctx, dout):
  class MemroyRecomputeAttn (line 2829) | class MemroyRecomputeAttn(paddle.nn.Layer):
    method __init__ (line 2830) | def __init__(
    method fp8_quant_weight (line 2908) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 2912) | def forward(self, q_init, kv_init, position_ids):
  class FusedRMSLinearFunc (line 2942) | class FusedRMSLinearFunc(paddle.autograd.PyLayer):
    method forward (line 2944) | def forward(ctx, x, rms_norm_weight, q_down_weight, kv_down_weight, eps):
    method backward (line 2965) | def backward(ctx, d_q, d_kv):
  class FusedRMSLinear (line 3015) | class FusedRMSLinear(paddle.nn.Layer):
    method __init__ (line 3016) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
    method fp8_quant_weight (line 3040) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 3043) | def forward(self, x):
  class FusedRMSLinearSingleFunc (line 3048) | class FusedRMSLinearSingleFunc(paddle.autograd.PyLayer):
    method forward (line 3050) | def forward(ctx, x, rms_norm_weight, linear_weight, eps):
    method backward (line 3059) | def backward(ctx, d_q, d_kv):
  class FusedRMSLinearSingle (line 3070) | class FusedRMSLinearSingle(paddle.nn.Layer):
    method __init__ (line 3071) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
    method forward (line 3088) | def forward(self, x):
  class FastCrossEntropyFunction (line 3093) | class FastCrossEntropyFunction(paddle.autograd.PyLayer):
    method forward (line 3095) | def forward(ctx, preds, labels):
    method backward (line 3103) | def backward(ctx, dout):
  class DeepseekV2LMHead (line 3113) | class DeepseekV2LMHead(nn.Layer):
    method __init__ (line 3114) | def __init__(self, config: DeepseekV2FastConfig, embedding_weight=None):
    method forward (line 3150) | def forward(self, hidden_states, tensor_parallel_output=None):
    method extra_repr (line 3172) | def extra_repr(self):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/modeling_pp.py
  function check_accept_none_grad (line 82) | def check_accept_none_grad():
  function parse_args (line 98) | def parse_args(args):
  function return_args (line 128) | def return_args(hidden_states, attention_mask=None, attn_mask_startend_r...
  function get_attr (line 143) | def get_attr(layer, name):
  function calc_stream_wait (line 150) | def calc_stream_wait(group_id):
  class TensorMeta (line 155) | class TensorMeta:
    method __init__ (line 158) | def __init__(self, tensor):
  class PostProcessNode (line 163) | class PostProcessNode(ScheduleNode):
    method __init__ (line 164) | def __init__(
    method forward_without_residual (line 188) | def forward_without_residual(self, inputs):
    method forward (line 232) | def forward(self, inputs):
    method backward (line 278) | def backward(self, output_grad):
  class DecoderLayerNode (line 334) | class DecoderLayerNode(ScheduleNode):
    method __init__ (line 335) | def __init__(
    method dispatch_forward (line 364) | def dispatch_forward(self, inputs, previous_event=None, allocate_on_co...
    method combine_forward (line 410) | def combine_forward(self, inputs, previous_event=None):
    method dispatch_backward (line 426) | def dispatch_backward(self, output_grad):
    method combine_backward (line 465) | def combine_backward(self, output_grad):
    method forward (line 492) | def forward(self, inputs):
    method backward (line 512) | def backward(self, output_grad=None, scaler=None):
  class OverlapedScheduleChunk (line 535) | class OverlapedScheduleChunk:
    method __init__ (line 536) | def __init__(self, forward_nodes, backward_nodes, use_fuion=True):
    method forward_backward (line 547) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  class DecoderBackwardScheduleChunk (line 560) | class DecoderBackwardScheduleChunk:
    method __init__ (line 561) | def __init__(self, nodes):
    method backward (line 564) | def backward(self, output_grad, combine_bw_event_to_wait=None, pp_stre...
  class OverlapedScheduleNode (line 574) | class OverlapedScheduleNode:
    method __init__ (line 575) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 581) | def forward_backward(self, inputs, output_grad, event_to_wait=None):
  class FusionFp8DecoderLayerNode (line 609) | class FusionFp8DecoderLayerNode(ScheduleNode):
    method __init__ (line 610) | def __init__(
    method attn_forward (line 634) | def attn_forward(self, inputs):
    method dispatch_forward (line 670) | def dispatch_forward(self, inputs, previous_event=None, async_finish=F...
    method mlp_forward (line 698) | def mlp_forward(self, inputs):
    method combine_forward (line 737) | def combine_forward(self, inputs, async_finish=False, previous_event=N...
    method post_process_forward (line 763) | def post_process_forward(self, inputs, with_residual=True):
    method post_process_backward (line 786) | def post_process_backward(self, output_grad, event_to_wait=None):
    method combine_backward (line 820) | def combine_backward(self, output_grad, previous_event=None, async_fin...
    method mlp_backward (line 878) | def mlp_backward(self, output_grad):
    method dispatch_backward (line 910) | def dispatch_backward(self, output_grad, async_finish=False, previous_...
    method attn_backward (line 959) | def attn_backward(self, output_grad):
    method backward_for_fusion (line 1017) | def backward_for_fusion(self, output_grad, combine_bw_event_to_wait=No...
    method forward (line 1088) | def forward(self, inputs):
    method backward (line 1098) | def backward(self, output_grad=None, scaler=None):
  class DenseDecoderLayerNode (line 1109) | class DenseDecoderLayerNode(ScheduleNode):
    method __init__ (line 1110) | def __init__(
    method forward (line 1120) | def forward(self, inputs):
    method backward (line 1125) | def backward(self, output_grad=None, scaler=None):
  class OverlapedFUsionScheduleNode (line 1132) | class OverlapedFUsionScheduleNode:
    method __init__ (line 1133) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 1141) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  class OverlapedDenseFusionScheduleNode (line 1277) | class OverlapedDenseFusionScheduleNode:
    method __init__ (line 1278) | def __init__(self, forward_node, backward_node, name=""):
    method forward_backward (line 1287) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa...
  function build_overlapped_nodes (line 1373) | def build_overlapped_nodes(config: DeepseekV2FastConfig, forward_chunk, ...
  class EmbeddingFunction (line 1437) | class EmbeddingFunction(paddle.autograd.PyLayer):
    method forward (line 1439) | def forward(ctx, x, weight):
    method backward (line 1448) | def backward(ctx, dout):
  class DeepseekV2EmbeddingPipe (line 1459) | class DeepseekV2EmbeddingPipe(nn.Layer):
    method __init__ (line 1460) | def __init__(self, config: DeepseekV2FastConfig):
    method embedding_weight (line 1475) | def embedding_weight(self):
    method forward (line 1478) | def forward(self, args):
    method build_schedule_node (line 1558) | def build_schedule_node(self):
  class DeepseekV2DecoderLayerPipe (line 1562) | class DeepseekV2DecoderLayerPipe(DeepseekV2DecoderLayer):
    method forward (line 1563) | def forward(self, args):
    method attn_compute (line 1622) | def attn_compute(self, args):
    method attn_compute_for_fusion (line 1657) | def attn_compute_for_fusion(self, args):
    method mlp_compute (line 1695) | def mlp_compute(self, inputs):
    method post_process_compute (line 1742) | def post_process_compute(self, inputs):
    method post_process_compute_for_fusion (line 1778) | def post_process_compute_for_fusion(self, inputs):
    method attn_compute_dense (line 1803) | def attn_compute_dense(self, args):
    method mlp_compute_dense (line 1821) | def mlp_compute_dense(self, inputs):
    method build_schedule_node (line 1835) | def build_schedule_node(self):
  class DeepseekV2MTPLayerPipe (line 1901) | class DeepseekV2MTPLayerPipe(DeepseekV2MTPLayer):
    method forward (line 1902) | def forward(self, args):
    method attn_compute_for_fusion (line 1970) | def attn_compute_for_fusion(self, args):
    method build_schedule_node (line 2017) | def build_schedule_node(self):
  class DeepseekV2RMSNormPipe (line 2036) | class DeepseekV2RMSNormPipe(nn.Layer):
    method __init__ (line 2037) | def __init__(self, config):
    method forward (line 2042) | def forward(self, args):
    method build_schedule_node (line 2057) | def build_schedule_node(self):
  class DeepseekV2LMHeadPipe (line 2061) | class DeepseekV2LMHeadPipe(DeepseekV2LMHead):
    method __init__ (line 2062) | def __init__(self, config, embedding_weight=None):
    method embedding_weight (line 2066) | def embedding_weight(self):
    method forward (line 2069) | def forward(self, args: Union[Tuple, paddle.Tensor]):
    method build_schedule_node (line 2079) | def build_schedule_node(self):
  class DeepseekV2PretrainingCriterionPipe (line 2083) | class DeepseekV2PretrainingCriterionPipe(DeepseekV2PretrainingCriterionF...
    method forward (line 2084) | def forward(self, logits, labels):
    method build_schedule_node (line 2095) | def build_schedule_node(self):
  class DeepseekV2ForCausalLMPipe (line 2099) | class DeepseekV2ForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method step_flex_token (line 2118) | def step_flex_token(self, cur_step):
    method _prepare_pipeline_inputs_func (line 2122) | def _prepare_pipeline_inputs_func(cls, inputs):
    method __init__ (line 2145) | def __init__(self, config: DeepseekV2FastConfig):
    method fp8_quant_weight (line 2310) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
    method get_loss_fn (line 2323) | def get_loss_fn(self, config):
    method overlapped_forward_backward (line 2326) | def overlapped_forward_backward(

FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_gate.py
  class PretrainedMoEGate (line 29) | class PretrainedMoEGate(nn.Layer, MoEGateMixin):
    method __init__ (line 30) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
    method _priority (line 69) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle....
    method _topk_greedy (line 91) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle....
    method _topk_group_limited_greedy (line 106) | def _topk_group_limited_greedy(
    method _topk_noaux_tc (line 138) | def _topk_noaux_tc(
    method top1gating (line 175) | def top1gating(
    method top2gating (line 245) | def top2gating(
    method _cal_seq_aux_loss (line 321) | def _cal_seq_aux_loss(self, gates, top_k, topk_idx) -> paddle.Tensor:
    method topkgating (line 359) | def topkgating(
    method topkgating_nodrop (line 438) | def topkgating_nodrop(self, gates: paddle.Tensor):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_layer.py
  function record_stream_for_multi_input (line 59) | def record_stream_for_multi_input(x):
  function stop_gradient_for_multi_input (line 67) | def stop_gradient_for_multi_input(x):
  class MoELayer (line 74) | class MoELayer(nn.Layer):
    method __init__ (line 75) | def __init__(
    method update_flex_token (line 150) | def update_flex_token(self):
    method _parse_moe_expert_parallel (line 166) | def _parse_moe_expert_parallel(self, n_routed_experts, expert_model_pa...
    method _post_init (line 176) | def _post_init(self):
    method forward (line 187) | def forward(
    method forward_drop_token (line 208) | def forward_drop_token(
    method expert_forward (line 327) | def expert_forward(self, dispatched_input):
    method forward_flex_token (line 338) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,...
    method get_tokens_per_expert (line 381) | def get_tokens_per_expert(self):
    method set_tokens_per_expert (line 384) | def set_tokens_per_expert(self, tokens_per_expert_list):
    method pre_dispatch_compute (line 387) | def pre_dispatch_compute(self, hidden_states):
    method post_dispatch_compute (line 395) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis...
    method pre_combine_compute (line 401) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p...
    method post_combine_compute (line 407) | def post_combine_compute(self, hidden_states):
  class MoEFlexTokenLayer (line 412) | class MoEFlexTokenLayer(nn.Layer):
    method __init__ (line 413) | def __init__(self, config, n_routed_experts, expert_class, expert_kwar...
    method expert_forward (line 429) | def expert_forward(self, dispatched_input, tokens_per_expert):
    method forward (line 441) | def forward(self, hidden_states: paddle.Tensor):
    method forward_flex_token (line 452) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,...
    method get_tokens_per_expert (line 495) | def get_tokens_per_expert(self):
    method set_tokens_per_expert (line 498) | def set_tokens_per_expert(self, tokens_per_expert_list):
    method pre_dispatch_compute (line 501) | def pre_dispatch_compute(self, hidden_states):
    method post_dispatch_compute (line 509) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis...
    method pre_combine_compute (line 515) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p...
    method post_combine_compute (line 521) | def post_combine_compute(self, hidden_states):
  class Fp8DispatchQuantNode (line 526) | class Fp8DispatchQuantNode:
    method __init__ (line 527) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, name="fp8_...
    method forward (line 534) | def forward(self, hidden_states, probs, routing_map):
    method backward (line 562) | def backward(self, hs_grad, token_probs_grad):
  class Fp8DispatchNode (line 574) | class Fp8DispatchNode:
    method __init__ (line 575) | def __init__(self, token_dispatcher, name="fp8_dispatch_node"):
    method forward (line 581) | def forward(
    method backward (line 610) | def backward(
  class Fp8CombineNode (line 629) | class Fp8CombineNode:
    method __init__ (line 630) | def __init__(self, token_dispatcher, name="fp8_combine_node"):
    method forward (line 636) | def forward(self, hidden_states_out, previous_event=None, async_finish...
    method backward (line 651) | def backward(self, output_combine_grad, previous_event=None, async_fin...
  class Fp8CombineQuantNode (line 662) | class Fp8CombineQuantNode:
    method __init__ (line 663) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, moe_group=...
    method forward (line 670) | def forward(self, output_combine):
    method backward (line 679) | def backward(self, output_grad, event_to_wait=None):
  class FusionMlpNode (line 705) | class FusionMlpNode:
    method __init__ (line 710) | def __init__(
    method set_recompute_fwd_gate_up (line 746) | def set_recompute_fwd_gate_up(self, recompute_fwd_gate_up):
    method reset_statue (line 749) | def reset_statue(self):
    method prepare_env_subbatch (line 774) | def prepare_env_subbatch(self, unzipped_tokens=None, unzipped_tokens_s...
    method gemm_forward_subbatch (line 791) | def gemm_forward_subbatch(
    method gemm_backward_subbatch (line 826) | def gemm_backward_subbatch(
    method forward (line 887) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro...
    method backward (line 1025) | def backward(self, hidden_states_out_grad):
  class FusionMoeNode (line 1128) | class FusionMoeNode:
    method __init__ (line 1129) | def __init__(
    method forward (line 1162) | def forward(self, hidden_states, probs, routing_map):
    method backward (line 1190) | def backward(self, output_grad):
  class FusionMoe (line 1205) | class FusionMoe(paddle.autograd.PyLayer):
    method forward (line 1207) | def forward(
    method backward (line 1226) | def backward(ctx, output_grad):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_utils.py
  function _clear_to_zero_allocation (line 29) | def _clear_to_zero_allocation(self):
  function _holder_size (line 45) | def _holder_size(self):
  function topk_to_permuted_indices (line 57) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk):
  function permute_fast (line 69) | def permute_fast(
  function unpermute_fast (line 90) | def unpermute_fast(
  class UnZipNode (line 132) | class UnZipNode:
    method __init__ (line 133) | def __init__(self, name="unzip"):
    method reset_statue (line 138) | def reset_statue(self):
    method forward (line 143) | def forward(
    method backward (line 189) | def backward(self, dx, total_zipped_tokens, probs_grad, dispatched_ind...
  class ZipNode (line 203) | class ZipNode:
    method __init__ (line 204) | def __init__(self, name="zip"):
    method forward (line 208) | def forward(
    method backward (line 218) | def backward(
  class PermuteNode (line 264) | class PermuteNode:
    method __init__ (line 265) | def __init__(self, token_dispatcher, name="permute"):
    method reset_status (line 269) | def reset_status(self):
    method forward (line 273) | def forward(self, hidden_states, hidden_states_scale, dispatched_indic...
    method backward (line 287) | def backward(self, out_grad, dispatched_probs):
  class UnPermuteNode (line 300) | class UnPermuteNode:
    method __init__ (line 301) | def __init__(self, token_dispatcher, name="unpermute"):
    method reset_status (line 305) | def reset_status(self):
    method forward (line 314) | def forward(
    method backward (line 352) | def backward(self, out_grad, out_grad_scale):
  function tokens_zip_unique_add_with_subbatch (line 383) | def tokens_zip_unique_add_with_subbatch(zipped, unzipped, index_unzipped...
  function merge_subbatch_cast (line 409) | def merge_subbatch_cast(x, dtype):
  function get_env_device (line 420) | def get_env_device():

FILE: paddleformers/cli/train/deepseek_v3_pretrain/token_dispatcher.py
  class _DeepepManager (line 31) | class _DeepepManager(_DispatchManager):
    method __init__ (line 53) | def __init__(
    method setup_metadata (line 74) | def setup_metadata(self, routing_map: paddle.Tensor, probs: paddle.Ten...
    method dispatch (line 82) | def dispatch(
    method _indices_to_multihot (line 94) | def _indices_to_multihot(self, indices, probs):
    method get_dispatched_metadata (line 119) | def get_dispatched_metadata(self) -> paddle.Tensor:
    method get_number_of_tokens_per_expert (line 122) | def get_number_of_tokens_per_expert(self) -> paddle.Tensor:
    method combine (line 128) | def combine(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
    method get_permuted_hidden_states_by_experts (line 134) | def get_permuted_hidden_states_by_experts(self, hidden_states: paddle....
    method get_permuted_hidden_states_by_experts_fast (line 146) | def get_permuted_hidden_states_by_experts_fast(
    method get_restored_hidden_states_by_experts (line 156) | def get_restored_hidden_states_by_experts(self, hidden_states: paddle....
    method get_restored_hidden_states_by_experts_fast (line 168) | def get_restored_hidden_states_by_experts_fast(
  class MoETokenDispatcher (line 187) | class MoETokenDispatcher:
    method __init__ (line 192) | def __init__(self, ep_group) -> None:
    method ep_group (line 199) | def ep_group(self):
    method ep_size (line 204) | def ep_size(self):
    method token_permutation (line 209) | def token_permutation(self, tokens: paddle.Tensor, probs: paddle.Tenso...
    method token_unpermutation (line 223) | def token_unpermutation(self, expert_output: paddle.Tensor, bias: padd...
  class MoEFlexTokenDispatcher (line 236) | class MoEFlexTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 241) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m...
    method token_permutation (line 253) | def token_permutation(
    method token_unpermutation (line 266) | def token_unpermutation(
  class MoEFlexTokenDispatcherFast (line 277) | class MoEFlexTokenDispatcherFast:
    method __init__ (line 282) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m...
    method ep_group (line 295) | def ep_group(self):
    method ep_size (line 300) | def ep_size(self):
    method pre_dispatch (line 304) | def pre_dispatch(self, hidden_states, probs, routing_map):
    method post_dispatch (line 314) | def post_dispatch(self, hidden_states, dispatched_indices):
    method pre_combine (line 322) | def pre_combine(self, hidden_states, token_permuted_indices, prob_perm...
    method post_combine (line 328) | def post_combine(self, hidden_states):
    method token_permutation (line 332) | def token_permutation(
    method token_unpermutation (line 350) | def token_unpermutation(
  class PreDispatchNode (line 368) | class PreDispatchNode:
    method __init__ (line 369) | def __init__(self, token_dispatcher):
    method reset_status (line 373) | def reset_status(self):
    method forward (line 379) | def forward(self, routing_map, probs):
    method backward (line 394) | def backward(self, token_probs_g):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/utils/convert_ckpt_to_sft.py
  function paddle_name_to_hf_names (line 45) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
  function _handle_expert_weights (line 134) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s...
  function _handle_shared_expert_weights (line 149) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional...
  function _handle_mlp_weights (line 162) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
  function _is_need_transpose (line 172) | def _is_need_transpose(key):
  function prepare_tensor (line 191) | def prepare_tensor(key, value):
  function load_pretrained_ckpt (line 218) | def load_pretrained_ckpt(ckpt_path, output_path):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/utils/load_hf_ckpt.py
  function paddle_name_to_hf_names_ds_v2 (line 57) | def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]:
  function paddle_name_to_hf_names (line 132) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
  function _get_hf_prefix (line 200) | def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str:
  function _handle_expert_weights (line 210) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s...
  function _handle_shared_expert_weights (line 225) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional...
  function _handle_mlp_weights (line 238) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
  function prepare_tensor (line 248) | def prepare_tensor(tensor, dst_shape, *, force_transpose=False):
  function load_huggingface_ckpt (line 278) | def load_huggingface_ckpt(model, huggingface_ckpt_path):

FILE: paddleformers/cli/train/deepseek_v3_pretrain/workflow.py
  class PreTrainingArguments (line 64) | class PreTrainingArguments(TrainingArguments):
    method __post_init__ (line 91) | def __post_init__(self):
  class DataArguments (line 111) | class DataArguments:
  class ModelArguments (line 144) | class ModelArguments:
  function create_pretrained_dataset (line 175) | def create_pretrained_dataset(
  function get_train_data_file (line 249) | def get_train_data_file(args):
  class PretrainingTrainer (line 274) | class PretrainingTrainer(Trainer):
    method __init__ (line 275) | def __init__(self, *args, **kwargs):
    method evaluate (line 279) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method _get_eval_sampler (line 319) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 329) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
  function run_dsv3_pretrain (line 340) | def run_dsv3_pretrain(model_args, data_args, generating_args, training_a...

FILE: paddleformers/cli/train/dpo/data_config.py
  class DataConfig (line 21) | class DataConfig:

FILE: paddleformers/cli/train/dpo/dpo_argument.py
  class DPOTrainingArguments (line 30) | class DPOTrainingArguments(TrainingArguments):
    method __post_init__ (line 59) | def __post_init__(self):
  class DPOConfig (line 93) | class DPOConfig:
  class DPODataArgument (line 112) | class DPODataArgument(DataConfig):
  class DPOModelArgument (line 122) | class DPOModelArgument:

FILE: paddleformers/cli/train/dpo/dpo_estimate_training.py
  function calculate_acc_steps (line 31) | def calculate_acc_steps(num_samples, train_batch, dataset_world_size, pe...
  function dpo_estimate_training (line 59) | def dpo_estimate_training(tokenizer, data_args, training_args, dataset_c...

FILE: paddleformers/cli/train/dpo/dpo_trainer.py
  function disable_dropout_in_model (line 44) | def disable_dropout_in_model(model: paddle.nn.Layer) -> None:
  class DPOTrainer (line 51) | class DPOTrainer(Trainer):
    method __init__ (line 56) | def __init__(
    method get_batch_metrics (line 124) | def get_batch_metrics(self, ref_model, model, batch, train_eval="train"):
    method compute_loss (line 193) | def compute_loss(self, model, inputs):
    method _wrap_ref_model (line 206) | def _wrap_ref_model(self, model):
    method _wrap_model (line 227) | def _wrap_model(self, model, training=True):
    method evaluate (line 245) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method prediction_step (line 252) | def prediction_step(self, model, inputs, prediction_loss_only=False, i...
    method store_metrics (line 278) | def store_metrics(self, metrics, train_eval="train"):
    method log (line 283) | def log(self, logs, **kwargs):
    method fleet_prediction_pipeline_step (line 301) | def fleet_prediction_pipeline_step(
    method prediction_pipeline_step (line 407) | def prediction_pipeline_step(
    method log_metric (line 492) | def log_metric(
    method training_pipeline_step (line 542) | def training_pipeline_step(self, model, inputs):
    method disable_lora (line 624) | def disable_lora(self, model):
    method enable_lora (line 630) | def enable_lora(self, model):
    method reset_dpo_infohub (line 636) | def reset_dpo_infohub(self):
    method broadcast_last_stage_infohub_tensor (line 641) | def broadcast_last_stage_infohub_tensor(self):
  function prepare_pipeline_dpo_inputs_func (line 688) | def prepare_pipeline_dpo_inputs_func(inputs):
  function _prepare_pipeline_dpo_inputs_func_fleet (line 732) | def _prepare_pipeline_dpo_inputs_func_fleet(inputs):
  function fleet_merge_dpo_labels (line 771) | def fleet_merge_dpo_labels(labels, logprobs):

FILE: paddleformers/cli/train/dpo/workflow.py
  function run_dpo (line 65) | def run_dpo(

FILE: paddleformers/cli/train/ernie_pretrain/model_config.py
  class ModelConfig (line 22) | class ModelConfig:

FILE: paddleformers/cli/train/ernie_pretrain/models/comm_utils.py
  function scatter (line 33) | def scatter(input, group=None, axis=0):
  function mp_slice (line 51) | def mp_slice(x, indices=None, group=None, axis=0):
  function all_gather_varlen (line 68) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True):
  function scatter_varlen (line 90) | def scatter_varlen(x, recv_tensor, indices, src_rank, group, sync_op=True):
  function all_gather (line 112) | def all_gather(input, group=None, axis=0):
  function reduce_scatter (line 131) | def reduce_scatter(input, group=None):
  function subbatch (line 148) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar...
  function gather_varlen (line 193) | def gather_varlen(input, dst, group, offload_pp_data_chunk_size=0, all_s...
  function profile (line 293) | def profile(name, use_event=True):

FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py
  class ErnieMoEConfig (line 60) | class ErnieMoEConfig(PretrainedConfig):
    method __init__ (line 72) | def __init__(
    method __setattr__ (line 398) | def __setattr__(self, name: str, value):
    method register_nonsaveable_keys (line 409) | def register_nonsaveable_keys(self, keys):
    method use_moe (line 418) | def use_moe(self) -> bool:
    method to_json_string (line 421) | def to_json_string(self, use_diff: bool = True) -> str:

FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling.py
  function get_triangle_upper_mask (line 131) | def get_triangle_upper_mask(x, mask=None):
  function gqa_qkv_split_func (line 143) | def gqa_qkv_split_func(
  function gqa_qkv_merge_func (line 173) | def gqa_qkv_merge_func(weight_list, num_attention_heads, num_key_value_h...
  function parallel_matmul (line 194) | def parallel_matmul(
  function calc_lm_head_logits (line 235) | def calc_lm_head_logits(config, hidden_states, weight, bias, tensor_para...
  function finfo (line 265) | def finfo(dtype: paddle.dtype = None):
  function masked_fill (line 283) | def masked_fill(x, mask, value):
  function mem_eff_attn (line 288) | def mem_eff_attn(query, key, value, pack_offset, drop_prob=0.0, dtype=pa...
  function inbatch_pack_offset_to_attn_mask_start_row_indices (line 325) | def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offs...
  function scaled_dot_product_attention (line 340) | def scaled_dot_product_attention(
  function _make_causal_mask (line 458) | def _make_causal_mask(input_ids_shape, past_key_values_length, dtype):
  function _expand_mask (line 472) | def _expand_mask(mask, dtype, tgt_length):
  class FusedDropoutImpl (line 487) | class FusedDropoutImpl(nn.Layer):
    method __init__ (line 488) | def __init__(self, prob, mode):
    method forward (line 495) | def forward(self, x, y):
  class RMSNorm (line 503) | class RMSNorm(nn.Layer):
    method __init__ (line 504) | def __init__(self, config):
    method forward (line 518) | def forward(self, hidden_states):
  class RotaryEmbedding (line 534) | class RotaryEmbedding(nn.Layer):
    method __init__ (line 535) | def __init__(self, dim, max_position_embeddings=4096, base=10000):
    method forward (line 551) | def forward(self, x, seq_len=None):
    method rotate_half (line 559) | def rotate_half(cls, x):
    method apply_rotary_pos_emb (line 566) | def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, positio...
  class RopeEmbeddingLegacy (line 584) | class RopeEmbeddingLegacy(nn.Layer):
    method __init__ (line 585) | def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_a...
    method forward (line 592) | def forward(self, seq_length, position_ids=None):
    method apply_rotary (line 608) | def apply_rotary(self, rp, q, k):
    method apply_rotary_3d (line 630) | def apply_rotary_3d(self, rp, q, k, position_ids):
    method forward_single (line 698) | def forward_single(self, position_ids):
    method apply_rotary_single (line 713) | def apply_rotary_single(x, rope_emb):
  class ErnieMLP (line 721) | class ErnieMLP(nn.Layer):
    method __init__ (line 722) | def __init__(self, config):
    method forward (line 802) | def forward(self, x):
  class ErnieAttention (line 839) | class ErnieAttention(nn.Layer):
    method __init__ (line 840) | def __init__(self, config, layer_idx=0):
    method forward (line 1001) | def forward(
    method rope_attn (line 1099) | def rope_attn(
  class ErnieDecoderLayer (line 1199) | class ErnieDecoderLayer(nn.Layer):
    method __init__ (line 1200) | def __init__(self, config, layer_idx=0):
    method forward (line 1214) | def forward(
  class ErniePretrainedModel (line 1269) | class ErniePretrainedModel(PretrainedModel):
    method _get_name_mappings (line 1274) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa...
    method _get_tensor_parallel_mappings (line 1345) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _init_weights (line 1430) | def _init_weights(self, layer):
  class ErnieModel (line 1478) | class ErnieModel(ErniePretrainedModel):
    method __init__ (line 1479) | def __init__(self, config: ErnieMoEConfig):
    method get_input_embeddings (line 1506) | def get_input_embeddings(self):
    method set_input_embeddings (line 1509) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1513) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ...
    method recompute_training (line 1532) | def recompute_training(
    method forward (line 1562) | def forward(
  class FusedHeadParallelCrossEntropy (line 1696) | class FusedHeadParallelCrossEntropy(PyLayer):
    method forward (line 1698) | def forward(
    method backward (line 1816) | def backward(ctx, loss_all_grad, labels_all_grad):
  class ErniePretrainingCriterion (line 1934) | class ErniePretrainingCriterion(paddle.nn.Layer):
    method __init__ (line 1935) | def __init__(self, config, return_tuple=True):
    method forward (line 1950) | def forward(self, prediction_scores, masked_lm_labels):
    method forward_impl_with_fused_head_loss_fn (line 2006) | def forward_impl_with_fused_head_loss_fn(self, masked_lm_labels, hidde...
    method forward_impl_with_calc_logits (line 2041) | def forward_impl_with_calc_logits(self, masked_lm_labels, hidden_state...
    method loss_impl (line 2053) | def loss_impl(self, prediction_scores, masked_lm_labels):
    method forward_impl (line 2059) | def forward_impl(self, prediction_scores, masked_lm_labels):
  class ErnieLMHead (line 2114) | class ErnieLMHead(nn.Layer):
    method __init__ (line 2115) | def __init__(self, config):
    method forward (line 2154) | def forward(self, hidden_states, tensor_parallel_output=None):
    method sharded_state_dict (line 2173) | def sharded_state_dict(
  class ErnieForCausalLM (line 2182) | class ErnieForCausalLM(ErniePretrainedModel):
    method __init__ (line 2185) | def __init__(self, config):
    method _post_init (line 2217) | def _post_init(self, original_init, *args, **kwargs):
    method get_input_embeddings (line 2226) | def get_input_embeddings(self):
    method set_input_embeddings (line 2229) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 2232) | def get_output_embeddings(self):
    method set_output_embeddings (line 2235) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 2238) | def set_decoder(self, decoder):
    method get_decoder (line 2241) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 2245) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method prepare_inputs_for_generation (line 2258) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 2289) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 2327) | def forward(
    method sharded_state_dict (line 2386) | def sharded_state_dict(self, *args, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling_moe.py
  class BaseModelOutputWithPastAndCrossAttentions (line 109) | class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput):
  class CausalLMOutputWithCrossAttentions (line 116) | class CausalLMOutputWithCrossAttentions(_CausalLMOutput):
  function get_gate (line 136) | def get_gate(
  function build_mpdp_group (line 190) | def build_mpdp_group():
  function _parse_moe_group (line 211) | def _parse_moe_group(
  function moe_ep2mp (line 258) | def moe_ep2mp(state_dict: Dict[str, paddle.Tensor], config: ErnieMoEConf...
  function moe_statedict_cherry_pick (line 307) | def moe_statedict_cherry_pick(state_dict: Dict[str, paddle.Tensor], conf...
  function moe_statedict_upcycle (line 332) | def moe_statedict_upcycle(
  class ErnieMoeMLP (line 504) | class ErnieMoeMLP(ErnieMLP):
    method __init__ (line 505) | def __init__(self, config, is_shared_expert=False):
    method forward (line 517) | def forward(self, x, use_comm=True):
  class ErnieMoeDenseExpert (line 579) | class ErnieMoeDenseExpert(nn.Layer):
    method __init__ (line 580) | def __init__(self, config):
    method forward (line 628) | def forward(self, x):
  class BMMLinear (line 655) | class BMMLinear(nn.Layer):
    method __init__ (line 656) | def __init__(self, experts, d_in, d_out, use_bias=False):
    method forward (line 664) | def forward(self, x):
  class ErnieMoeMLPFused (line 670) | class ErnieMoeMLPFused(nn.Layer):
    method __init__ (line 671) | def __init__(self, config):
    method __len__ (line 689) | def __len__(self):
    method __iter__ (line 692) | def __iter__(self):
    method forward (line 695) | def forward(self, x):
  class FusedLinearAddNormFunc (line 705) | class FusedLinearAddNormFunc(paddle.autograd.PyLayer):
    method forward (line 707) | def forward(ctx, x, residual, linear_weight, rms_norm_weight, eps):
    method backward (line 717) | def backward(ctx, d_rms_norm_out, d_residual_out):
  class FusedLinearAddNorm (line 736) | class FusedLinearAddNorm(paddle.nn.Layer):
    method __init__ (line 737) | def __init__(self, hidden_size, eps=1e-6) -> None:
    method forward (line 755) | def forward(self, x, residual):
  class FusedRMSLinearFunc (line 759) | class FusedRMSLinearFunc(paddle.autograd.PyLayer):
    method forward (line 761) | def forward(ctx, x, rms_norm_weight, linear_weight, eps):
    method backward (line 769) | def backward(ctx, d_qkv):
  class FusedRMSLinear (line 779) | class FusedRMSLinear(paddle.nn.Layer):
    method __init__ (line 780) | def __init__(self, hidden_size, eps=1e-6, num_heads=1, num_key_value_h...
    method forward (line 799) | def forward(self, x):
  class ErnieMoEAttention (line 803) | class ErnieMoEAttention(ErnieAttention):
    method __init__ (line 804) | def __init__(self, config, layer_idx):
    method forward (line 827) | def forward(
  class FakeMoERouterLoss (line 947) | class FakeMoERouterLoss(PyLayer):
    method forward (line 949) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss):
    method backward (line 957) | def backward(ctx, out_grad):
  class ErnieDecoderLayer (line 966) | class ErnieDecoderLayer(nn.Layer):
    method __init__ (line 967) | def __init__(self, config, layer_idx):
    method training (line 1053) | def training(self):
    method training (line 1057) | def training(self, new):
    method fp8_quant_weight (line 1063) | def fp8_quant_weight(self):
    method _init_gate_and_experts (line 1068) | def _init_gate_and_experts(self, layer_idx):
    method _init_shared_experts (line 1119) | def _init_shared_experts(self):
    method _init_dense_experts (line 1137) | def _init_dense_experts(self, layer_idx):
    method forward (line 1160) | def forward(
    method model_parallel_dropout (line 1244) | def model_parallel_dropout(self):
  class ErniePretrainedModel (line 1251) | class ErniePretrainedModel(PretrainedModel):
    method _get_name_mappings (line 1256) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa...
    method _get_tensor_parallel_mappings (line 1326) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _init_weights (line 1446) | def _init_weights(self, layer):
  class ErnieModel (line 1525) | class ErnieModel(ErniePretrainedModel):
    method __init__ (line 1526) | def __init__(self, config: ErnieMoEConfig):
    method get_input_embeddings (line 1591) | def get_input_embeddings(self):
    method set_input_embeddings (line 1594) | def set_input_embeddings(self, value):
    method _prepare_decoder_attention_mask (line 1598) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ...
    method recompute_training (line 1617) | def recompute_training(
    method forward (line 1672) | def forward(
  class ErniePretrainingCriterion (line 1905) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase):
    method __init__ (line 1906) | def __init__(self, config, return_tuple=True):
    method forward (line 1921) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  class ErnieMoEForCausalLM (line 1974) | class ErnieMoEForCausalLM(ErniePretrainedModel):
    method __init__ (line 1977) | def __init__(self, config):
    method _post_init (line 2002) | def _post_init(self, original_init, *args, **kwargs):
    method set_state_dict (line 2034) | def set_state_dict(self, state_dict, *args, **kwargs):
    method get_input_embeddings (line 2052) | def get_input_embeddings(self):
    method set_input_embeddings (line 2055) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 2058) | def get_output_embeddings(self):
    method set_output_embeddings (line 2061) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 2064) | def set_decoder(self, decoder):
    method get_decoder (line 2067) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 2071) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method prepare_inputs_for_generation (line 2084) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 2116) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 2155) | def forward(
    method sharded_state_dict (line 2224) | def sharded_state_dict(self, *args, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling_pp.py
  class ErnieEmbeddingPipe (line 95) | class ErnieEmbeddingPipe(nn.Layer):
    method __init__ (line 96) | def __init__(self, config):
    method embedding_weight (line 112) | def embedding_weight(self):
    method forward (line 115) | def forward(self, args):
  class MTPEmbeddingPipe (line 209) | class MTPEmbeddingPipe(ErnieEmbeddingPipe):
    method __init__ (line 210) | def __init__(self, config):
    method embedding_weight (line 214) | def embedding_weight(self):
    method forward (line 217) | def forward(self, args):
  class EmptyLayer (line 236) | class EmptyLayer(nn.Layer):
    method __init__ (line 237) | def __init__(self):
    method forward (line 240) | def forward(self, x):
  class ErnieDecoderLayerPipe (line 244) | class ErnieDecoderLayerPipe(ErnieDecoderLayer):
    method __init__ (line 245) | def __init__(self, config, layer_idx, use_full_recompute=False):
    method forward (line 252) | def forward(self, args):
  class RMSNormPipe (line 356) | class RMSNormPipe(RMSNorm):
    method __init__ (line 357) | def __init__(self, config):
    method forward (line 362) | def forward(self, args):
  class ErnieMoELMHeadPipe (line 390) | class ErnieMoELMHeadPipe(ErnieMoELMHead):
    method forward (line 391) | def forward(self, args):
  class MTPLayer (line 402) | class MTPLayer(nn.Layer):
    method __init__ (line 403) | def __init__(self, config):
    method forward (line 437) | def forward(self, args):
    method forward_impl (line 446) | def forward_impl(self, *args):
  class ErniePretrainingCriterionPipe (line 523) | class ErniePretrainingCriterionPipe(ErniePretrainingCriterion):
    method __init__ (line 524) | def __init__(self, config):
    method forward (line 527) | def forward(self, logits, labels):
  class PipelinePretrainedModel (line 541) | class PipelinePretrainedModel(PretrainedModel):
    method __init__ (line 542) | def __init__(self, config, *args, **kwargs):
    method init (line 546) | def init(self, config, *args, **kwargs):
    method add_sequential_layer (line 551) | def add_sequential_layer(self, layer_desc, name_prefix=""):
    method get_sequential_layers (line 554) | def get_sequential_layers(self):
    method get_sequential_name_prefixs (line 557) | def get_sequential_name_prefixs(self):
    method get_shardlayer_prefix (line 560) | def get_shardlayer_prefix(self, name_splited):
    method _set_pipeline_name_mapping (line 571) | def _set_pipeline_name_mapping(self, mappings=None):
    method _check_shared_model_state (line 631) | def _check_shared_model_state(self):
    method state_dict (line 652) | def state_dict(self, *args, **kwargs):
    method _init_weights (line 664) | def _init_weights(self, layer):
    method sharded_state_dict (line 736) | def sharded_state_dict(self, *args, **kwargs):
  function get_pp_vp_split_layers (line 770) | def get_pp_vp_split_layers(config):
  class ErnieMoEForCausalLMPipe (line 802) | class ErnieMoEForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method _prepare_pipeline_inputs_func (line 815) | def _prepare_pipeline_inputs_func(cls, data):
    method __init__ (line 849) | def __init__(
    method get_loss_fn (line 1000) | def get_loss_fn(self, config):
    method rename_model_params (line 1003) | def rename_model_params(self, func):
    method fp8_quant_weight (line 1010) | def fp8_quant_weight(self):
    method _post_init (line 1016) | def _post_init(self, original_init, *args, **kwargs):
    method set_state_dict (line 1033) | def set_state_dict(self, state_dict, *args, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/models/fp8_linear.py
  function fp8_gemm (line 44) | def fp8_gemm(
  function padding (line 94) | def padding(x, axis):
  class Fp8FusedMlpFunc (line 118) | class Fp8FusedMlpFunc(paddle.autograd.PyLayer):
    method forward (line 128) | def forward(ctx, x, w1, w2):
    method backward (line 208) | def backward(ctx, do3):
  class MemEfficientFp8FusedMlpFunc (line 320) | class MemEfficientFp8FusedMlpFunc(paddle.autograd.PyLayer):
    method forward (line 333) | def forward(ctx, x, w1, w2):
    method backward (line 393) | def backward(ctx, do3):
  class Fp8FusedMlp (line 515) | class Fp8FusedMlp(paddle.nn.Layer):
    method __init__ (line 526) | def __init__(self, config):
    method forward (line 557) | def forward(self, x):

FILE: paddleformers/cli/train/ernie_pretrain/models/moe/moe_layer.py
  function set_grad_in_dtype_non_consistent (line 73) | def set_grad_in_dtype_non_consistent(ctx):
  class Fp8MoeGateDispatchAndQuant (line 79) | class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer):
    method forward (line 83) | def forward(
    method backward (line 135) | def backward(ctx, *grads):
  function recompute_fwd_gate_up_func (line 154) | def recompute_fwd_gate_up_func(config, layer_idx):
  class MoEStatics (line 164) | class MoEStatics(nn.Layer):
    method __init__ (line 165) | def __init__(self, config, layer_idx):
  class GateCombine (line 196) | class GateCombine(PyLayer):
    method forward (line 198) | def forward(ctx, x, combine_weights, scatter_index):
    method backward (line 206) | def backward(ctx, grad_y, *_):
  class FusionFP8Expert (line 215) | class FusionFP8Expert(paddle.autograd.PyLayer):
    method forward (line 217) | def forward(ctx, hidden_states, custom_map):
    method backward (line 234) | def backward(ctx, output_grad):
  class AlltoAll (line 245) | class AlltoAll(PyLayer):
    method forward (line 247) | def forward(ctx, x, group, sync_op=True):
    method backward (line 260) | def backward(ctx, *dx):
  class AlltoAllExpertOverlap (line 264) | class AlltoAllExpertOverlap(PyLayer):
    method forward (line 266) | def forward(ctx, input, group, num_local_experts, forward_func_dict, i...
    method backward (line 302) | def backward(ctx, out_grad):
  class AlltoAllAsync (line 321) | class AlltoAllAsync(PyLayer):
    method forward (line 323) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False):
    method backward (line 344) | def backward(ctx, dx_out, *fn_out_grads):
  function dispatching (line 364) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity):
  function combining_fused (line 391) | def combining_fused(x, combine_weights, scatter_index, hard_gate=False):
  class ReshapeKeepGradDtype (line 400) | class ReshapeKeepGradDtype(PyLayer):
    method forward (line 402) | def forward(ctx, x, shape):
    method backward (line 408) | def backward(ctx, grad):
  class MOELayer (line 412) | class MOELayer(nn.Layer):
    method __init__ (line 420) | def __init__(
    method forward_experts (line 523) | def forward_experts(self, dispatched_input):
    method fp8_quant_weight (line 559) | def fp8_quant_weight(self):
    method fused_gate_logits_process (line 590) | def fused_gate_logits_process(self, gate_logits, token_type_ids, offlo...
    method gate_distpach_and_quant (line 603) | def gate_distpach_and_quant(self, input, token_type_ids):
    method gate_and_distpach (line 688) | def gate_and_distpach(self, input, token_type_ids):
    method _calc_router_loss (line 800) | def _calc_router_loss(
    method calc_router_loss_and_logging (line 831) | def calc_router_loss_and_logging(
    method combine_expert_output (line 854) | def combine_expert_output(self, expert_output, combine_weights, scatte...
    method forward_single_stage (line 862) | def forward_single_stage(self, dispatched_input, stage_id):
    method all2all_expert_overlap (line 866) | def all2all_expert_overlap(self, x, group):
    method forward (line 893) | def forward(
    method sharded_state_dict (line 1067) | def sharded_state_dict(
  class FP8FusedWLCHFunc (line 1079) | class FP8FusedWLCHFunc(paddle.autograd.PyLayer):
    method forward (line 1081) | def forward(
    method backward (line 1134) | def backward(ctx, output_grad):
  class MlpNode (line 1149) | class MlpNode:
    method __init__ (line 1150) | def __init__(self, custom_map, max_topk, recompute_fwd_gate_up=False, ...
    method reset_status (line 1165) | def reset_status(self):
    method release_mem (line 1173) | def release_mem(self):
    method forward (line 1178) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro...
    method backward (line 1219) | def backward(self, hidden_states_out_grad):
  class Fp8FusedMoeFunc (line 1243) | class Fp8FusedMoeFunc(paddle.autograd.PyLayer):
    method forward (line 1245) | def forward(
    method backward (line 1269) | def backward(ctx, output_grad):

FILE: paddleformers/cli/train/ernie_pretrain/models/moe/token_dispatcher/fp8_utils.py
  function _get_fp8_weight_and_scale (line 44) | def _get_fp8_weight_and_scale(weight, stacked=False, transpose=False):
  function fused_stack_transpose_quant (line 64) | def fused_stack_transpose_quant(weight_list, transpose=False):
  function split_group_gemm (line 82) | def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, ...
  function has_config (line 120) | def has_config(config_map, key):
  class ExpertsGroupGemmNode (line 124) | class ExpertsGroupGemmNode:
    method __init__ (line 137) | def __init__(self, experts, custom_map, name="moe_experts_node"):
    method reset_status (line 161) | def reset_status(self):
    method fwd_gate_up (line 167) | def fwd_gate_up(self, x_bf16, expert_w1, expert_w_count, tokens_per_ex...
    method fwd_swiglu (line 224) | def fwd_swiglu(self, o1):
    method fwd_down (line 240) | def fwd_down(self, o1, unzipped_probs, expert_w_count, tokens_per_expe...
    method fwd_down_no_probs (line 301) | def fwd_down_no_probs(self, o1, expert_w2, expert_w_count, tokens_per_...
    method bwd_down_input (line 359) | def bwd_down_input(self, expert_w2, unzipped_grad, tokens_per_expert, ...
    method bwd_down_input_no_prob (line 429) | def bwd_down_input_no_prob(self, expert_w2, unzipped_grad, tokens_per_...
    method bwd_swiglu (line 469) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 486) | def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, expecte...
    method bwd_down_weight (line 544) | def bwd_down_weight(self, out_grad, o2, expert_w2):
    method bwd_gate_up_weight (line 628) | def bwd_gate_up_weight(self, do1, input_x, expert_w1):
    method forward (line 707) | def forward(self, hs_out, unzipped_probs, tokens_per_expert):
    method backward (line 726) | def backward(self, out_grad, tokens_per_expert, dispatched_indices, ex...
    method forward_no_prob (line 741) | def forward_no_prob(self, hs_out, tokens_per_expert):
    method backward_no_prob (line 753) | def backward_no_prob(self, out_grad, tokens_per_expert):
  class ExpertsGroupGemmContiguousNode (line 775) | class ExpertsGroupGemmContiguousNode:
    method __init__ (line 788) | def __init__(
    method reset_status (line 833) | def reset_status(self):
    method gen_m_indices (line 842) | def gen_m_indices(self, tokens_per_expert):
    method fwd_gate_up (line 863) | def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, sca...
    method fwd_swiglu (line 937) | def fwd_swiglu(self, o1):
    method fwd_down (line 941) | def fwd_down(self, o1, unzipped_probs, expert_w2, num_expert):
    method bwd_down_input (line 1007) | def bwd_down_input(self, expert_w2, unzipped_grad, o1):
    method bwd_swiglu (line 1084) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 1088) | def bwd_gate_up_input(self, do1, expert_w1):
    method fused_transpose_split_quant (line 1145) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal...
    method bwd_down_weight (line 1170) | def bwd_down_weight(self, do3, o2, expert_w2):
    method bwd_gate_up_weight (line 1246) | def bwd_gate_up_weight(self, do1, input_x, expert_w1):
    method forward (line 1310) | def forward(
    method backward (line 1334) | def backward(self, out_grad, a2a_async_fn=None):
  class ExpertsGroupGemmWLCHNode (line 1427) | class ExpertsGroupGemmWLCHNode(ExpertsGroupGemmContiguousNode):
    method __init__ (line 1443) | def __init__(
    method gen_m_indices (line 1479) | def gen_m_indices(self, tokens_per_expert):
    method fused_transpose_split_quant (line 1499) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal...

FILE: paddleformers/cli/train/ernie_pretrain/models/moe/token_dispatcher/moe_utils.py
  function inplace_offload (line 24) | def inplace_offload(x):
  function inplace_offload_if_needed (line 41) | def inplace_offload_if_needed(x, threshold=2 * 1024 * 1024 * 1024):
  function topk_to_permuted_indices_single (line 61) | def topk_to_permuted_indices_single(x, num_tokens, expert_id, topk):
  function topk_to_permuted_indices (line 81) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk):
  function permute (line 105) | def permute(
  function unpermute (line 128) | def unpermute(
  class UnZipNode (line 163) | class UnZipNode:
    method __init__ (line 178) | def __init__(self, token_dispatcher, name="unzip"):
    method reset_status (line 190) | def reset_status(self):
    method forward (line 196) | def forward(
    method backward (line 240) | def backward(self, dx, hidden_states_out_grad, probs_grad, dispatched_...
  class ZipNode (line 268) | class ZipNode:
    method __init__ (line 281) | def __init__(self, token_dispatcher, name="zip"):
    method forward (line 292) | def forward(
    method backward (line 326) | def backward(

FILE: paddleformers/cli/train/ernie_pretrain/models/moe/top2_gate.py
  function cal_aux_loss_func (line 37) | def cal_aux_loss_func(
  function masked_fill (line 87) | def masked_fill(x, mask, value):
  class CalAuxLossFunctor (line 92) | class CalAuxLossFunctor(paddle.autograd.PyLayer):
    method forward (line 94) | def forward(
    method backward (line 124) | def backward(ctx, out_grad):
  function cast_if_needed (line 132) | def cast_if_needed(x, dtype):
  class FusedGateDetachMatmul (line 136) | class FusedGateDetachMatmul(paddle.autograd.PyLayer):
    method forward (line 138) | def forward(ctx, x, w):
    method backward (line 144) | def backward(ctx, y_grad):
  function gate_detach_matmul (line 157) | def gate_detach_matmul(x, weight, use_fuse):
  function compute_optimal_transport (line 166) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:...
  class Top2Gate (line 180) | class Top2Gate(nn.Layer):
    method __init__ (line 193) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->...
    method _create_gate_parameter (line 261) | def _create_gate_parameter(self):
    method forward (line 271) | def forward(
    method get_capacity (line 303) | def get_capacity(self, num_tokens, cap_factor=None):
    method top2_gating (line 318) | def top2_gating(self, logits, cap=None, correction_bias=None):
    method _cal_aux_loss (line 390) | def _cal_aux_loss(
    method _cal_orthogonal_loss (line 435) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None):
    method _cal_orthogonal_loss_opt_each_weight (line 450) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group):
  function cal_orthogonal_loss_opt_each_weight_func (line 457) | def cal_orthogonal_loss_opt_each_weight_func(weight, moe_k, use_group, e...
  class TopKGateFused (line 475) | class TopKGateFused(Top2Gate):
    method forward (line 476) | def forward(

FILE: paddleformers/cli/train/ernie_pretrain/models/sequence_parallel_utils.py
  function get_hcg (line 49) | def get_hcg():
  function get_async_loader (line 56) | def get_async_loader():
  function hack_offload_wait (line 69) | def hack_offload_wait(task):
  function hack_reload_wait (line 73) | def hack_reload_wait(task):
  class ScatterOp (line 77) | class ScatterOp(PyLayer):
    method forward (line 79) | def forward(ctx, input, axis=0, group=None):
    method backward (line 85) | def backward(ctx, grad):
  class GatherOp (line 89) | class GatherOp(PyLayer):
    method forward (line 91) | def forward(ctx, input, axis=0, group=None):
    method backward (line 97) | def backward(ctx, grad):
  class AllGatherOp (line 101) | class AllGatherOp(PyLayer):
    method forward (line 103) | def forward(ctx, input, group=None):
    method backward (line 108) | def backward(ctx, grad):
  class ReduceScatterOp (line 112) | class ReduceScatterOp(PyLayer):
    method forward (line 114) | def forward(ctx, input, group=None):
    method backward (line 120) | def backward(ctx, grad):
  class AllGatherVarlenOp (line 124) | class AllGatherVarlenOp(PyLayer):
    method forward (line 126) | def forward(ctx, input, group=None):
    method backward (line 165) | def backward(ctx, grad):
  class GemmReduceScatterOp (line 179) | class GemmReduceScatterOp(PyLayer):
    method forward (line 181) | def forward(ctx, input, weight, group):
    method backward (line 188) | def backward(ctx, grad):
  class AllGatherGemmOp (line 209) | class AllGatherGemmOp(PyLayer):
    method forward (line 211) | def forward(ctx, input, weight, group):
    method backward (line 219) | def backward(ctx, grad):
  function sequence_parallel_sparse_mask_labels (line 236) | def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100):
  function mark_as_sequence_parallel_parameter (line 252) | def mark_as_sequence_parallel_parameter(parameter):
  function is_sequence_parallel_parameter (line 256) | def is_sequence_parallel_parameter(parameter):
  function create_fused_allreduce_gradient_hook (line 260) | def create_fused_allreduce_gradient_hook(parameter_list, accumulation_st...
  function create_non_fused_allreduce_gradient_hook (line 277) | def create_non_fused_allreduce_gradient_hook(param, model, verbose=False):
  function register_sequence_parallel_allreduce_hooks (line 300) | def register_sequence_parallel_allreduce_hooks(model, fuse_sequence_para...
  function is_fused_matmul_bias_supported (line 323) | def is_fused_matmul_bias_supported():
  class ColumnSequenceParallelLinear (line 339) | class ColumnSequenceParallelLinear(Layer):
    method __init__ (line 340) | def __init__(
    method forward (line 432) | def forward(self, x, use_comm=True):
    method sharded_state_dict (line 452) | def sharded_state_dict(
  class MPScale (line 460) | class MPScale(PyLayer):
    method forward (line 462) | def forward(ctx, x, mp_degree):
    method backward (line 467) | def backward(ctx, dout):
  class RowSequenceParallelLinear (line 471) | class RowSequenceParallelLinear(Layer):
    method __init__ (line 472) | def __init__(
    method forward (line 568) | def forward(self, x):
    method sharded_state_dict (line 599) | def sharded_state_dict(

FILE: paddleformers/cli/train/ernie_pretrain/models/utils.py
  function get_global_training_logs (line 31) | def get_global_training_logs():
  function global_training_logs_enabled (line 49) | def global_training_logs_enabled():
  function inplace_offload (line 54) | def inplace_offload(tensor):
  function detach_and_requires_grad_ (line 59) | def detach_and_requires_grad_(*args):
  class FakeClone (line 67) | class FakeClone(paddle.autograd.PyLayer):
    method forward (line 69) | def forward(ctx, input):
    method backward (line 78) | def backward(ctx, grad_output):
  function manual_backward (line 82) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]):
  class FakeGather (line 120) | class FakeGather(paddle.autograd.PyLayer):
    method forward (line 122) | def forward(ctx, input, indices):
    method backward (line 132) | def backward(ctx, grad_output):
  class FusedUnpermutation (line 141) | class FusedUnpermutation(paddle.autograd.PyLayer):
    method forward (line 143) | def forward(
    method backward (line 180) | def backward(ctx, output_tokens_grad):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/fp8_quant_weight_callback.py
  function enable_in_dict_config (line 25) | def enable_in_dict_config(config, key):
  class FP8QuantWeightCallback (line 32) | class FP8QuantWeightCallback(TrainerCallback):
    method on_step_begin (line 33) | def on_step_begin(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/gc_callback.py
  class GCCallback (line 20) | class GCCallback(TrainerCallback):
    method on_train_begin (line 21) | def on_train_begin(self, args, state, control, **kwargs):
    method on_step_end (line 25) | def on_step_end(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/logging_callback.py
  class LoggingCallback (line 22) | class LoggingCallback(TrainerCallback):
    method __init__ (line 23) | def __init__(
    method on_log (line 28) | def on_log(self, args, state, control, logs=None, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/moe_correction_bias_adjust_callback.py
  class MoECorrectionBiasAdjustCallback (line 30) | class MoECorrectionBiasAdjustCallback(TrainerCallback):
    method __init__ (line 31) | def __init__(self, lr, use_sp):
    method on_optimizer_end (line 36) | def on_optimizer_end(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/moe_logging_callback.py
  function tensor_md5 (line 46) | def tensor_md5(tensor):
  class GlobalRNGCallback (line 52) | class GlobalRNGCallback(TrainerCallback):
    method on_step_end (line 53) | def on_step_end(self, args, state, control, model, **kwargs):
  class MoeLoggingCallback (line 58) | class MoeLoggingCallback(TrainerCallback):
    method __init__ (line 59) | def __init__(self, optimizer):
    method on_log (line 70) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_step_end (line 76) | def on_step_end(self, args, state, control, model, **kwargs):
    method on_save (line 115) | def on_save(self, args, state, control, model, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/ortho_loss_callback.py
  class OrthogonalCallback (line 24) | class OrthogonalCallback(TrainerCallback):
    method __init__ (line 25) | def __init__(self, ortho_loss_lambda):
    method on_optimizer_end (line 28) | def on_optimizer_end(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/sp_grad_sync_callback.py
  class SPGradSyncCallback (line 30) | class SPGradSyncCallback(TrainerCallback):
    method __init__ (line 31) | def __init__(self, model):
    method on_optimizer_begin (line 43) | def on_optimizer_begin(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/tensorboard_callback.py
  function is_tensorboard_available (line 29) | def is_tensorboard_available():
  function rewrite_logs (line 33) | def rewrite_logs(d):
  class TensorBoardCallback (line 49) | class TensorBoardCallback(TrainerCallback):
    method __init__ (line 50) | def __init__(
    method _init_summary_writer (line 91) | def _init_summary_writer(self, args, log_dir=None):
    method on_train_begin (line 96) | def on_train_begin(self, args, state, control, **kwargs):
    method on_log (line 120) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_train_end (line 183) | def on_train_end(self, args, state, control, **kwargs):

FILE: paddleformers/cli/train/ernie_pretrain/src/clip/moe_clip.py
  class ClipGradForMOEByGlobalNorm (line 28) | class ClipGradForMOEByGlobalNorm(ClipGradBase):
    method __init__ (line 29) | def __init__(
    method __str__ (line 49) | def __str__(self):
    method get_l2_norm_pow (line 53) | def get_l2_norm_pow(params_grads, sum_dtype=None):
    method _dygraph_clip (line 101) | def _dygraph_clip(self, params_grads):

FILE: paddleformers/cli/train/ernie_pretrain/src/lr_schedulers/cosine_lr.py
  function get_cosine_schedule_with_warmup (line 24) | def get_cosine_schedule_with_warmup(

FILE: paddleformers/cli/train/ernie_pretrain/src/lr_schedulers/wsd_lr.py
  function get_wsd_schedule_with_warmup (line 20) | def get_wsd_schedule_with_warmup(

FILE: paddleformers/cli/train/ernie_pretrain/src/tokenizers/tokenization_eb_v2.py
  class ErnieBotTokenizer (line 29) | class ErnieBotTokenizer(PretrainedTokenizer):
    method __init__ (line 40) | def __init__(
    method space_token (line 72) | def space_token(self):
    method space_token_id (line 76) | def space_token_id(self):
    method gend_token (line 80) | def gend_token(self):
    method gend_token_id (line 84) | def gend_token_id(self):
    method im_start_id (line 88) | def im_start_id(self):
    method im_end_id (line 92) | def im_end_id(self):
    method vocab_size (line 96) | def vocab_size(self):
    method get_vocab (line 99) | def get_vocab(self):
    method _tokenize (line 104) | def _tokenize(self, text):
    method _convert_token_to_id (line 107) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 110) | def _convert_id_to_token(self, id):
    method convert_tokens_to_string (line 113) | def convert_tokens_to_string(self, tokens):
    method prepare_for_model (line 126) | def prepare_for_model(self, *args, **kwargs):
    method save_vocabulary (line 131) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st...
    method tokenize (line 147) | def tokenize(self, text: TextInput, **kwargs) -> List[str]:
    method _decode (line 169) | def _decode(self, *args, **kwargs):
    method _pad (line 179) | def _pad(
  function add_special_tokens (line 239) | def add_special_tokens(

FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/data_parallel.py
  class DataParallel (line 22) | class DataParallel(paddle.DataParallel):
    method init_reducer (line 23) | def init_reducer(self):
  function sync_dp_moe_params_across_sharding (line 74) | def sync_dp_moe_params_across_sharding(model: paddle.nn.Layer) -> None:

FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/dygraph_optimizer/hybrid_parallel_optimizer.py
  class HybridParallelClipGrad (line 37) | class HybridParallelClipGrad:
    method __init__ (line 38) | def __init__(self, clip, hcg, timers=None):
    method _global_norm (line 53) | def _global_norm(
    method _dygraph_clip (line 142) | def _dygraph_clip(self, params_grads):
    method _comm_and_clip (line 277) | def _comm_and_clip(
    method __getattr__ (line 330) | def __getattr__(self, item):
    method __call__ (line 333) | def __call__(self, params_grads):
  class HybridParallelOptimizer (line 337) | class HybridParallelOptimizer(HPBase):
    method __init__ (line 338) | def __init__(self, optimizer, hcg, strategy):

FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/pretraining_trainer.py
  function distributed_optimizer_maybe_overwrite (line 109) | def distributed_optimizer_maybe_overwrite(
  class PreTrainingArguments (line 134) | class PreTrainingArguments(TrainingArguments):
    method use_moe (line 286) | def use_moe(self):  # noqa: F811
    method use_moe (line 290) | def use_moe(self, value):
    method need_data (line 295) | def need_data(self):
    method combine_batch (line 299) | def combine_batch(self):
    method reeao_dataset_rank (line 303) | def reeao_dataset_rank(self):
    method reeao_dataset_world_size (line 307) | def reeao_dataset_world_size(self):
    method __post_init__ (line 310) | def __post_init__(self):
  class WeightedDistributedSampler (line 424) | class WeightedDistributedSampler(PaddleNLPDistributedBatchSampler):
    method __init__ (line 425) | def __init__(
    method set_epoch (line 471) | def set_epoch(self, epoch=0, consumed_samples=0):
    method gen_data_seq (line 476) | def gen_data_seq(self):
    method load_data_seq_from_cache (line 489) | def load_data_seq_from_cache(self):
    method gen_data_seq_weighted (line 502) | def gen_data_seq_weighted(self, num_examples, data_type=None):
    method roundup_and_shard (line 580) | def roundup_and_shard(self, indices):
    method __len__ (line 602) | def __len__(self):
    method __iter__ (line 605) | def __iter__(self):
  class DummySampler (line 673) | class DummySampler(PaddleNLPDistributedBatchSampler):
    method __init__ (line 674) | def __init__(self, dataset, batch_size=1, **kwargs):
    method __len__ (line 677) | def __len__(self):
    method __iter__ (line 680) | def __iter__(self):
  class PretrainingTrainer (line 685) | class PretrainingTrainer(Trainer):
    method __init__ (line 686) | def __init__(self, args=None, model=None, callbacks=[], **kwargs):
    method autocast_smart_context_manager (line 707) | def autocast_smart_context_manager(self):
    method _load_optimizer_state (line 739) | def _load_optimizer_state(self, checkpoint):
    method _save_moe_weights (line 788) | def _save_moe_weights(self, output_dir):
    method _wrap_model (line 835) | def _wrap_model(self, model, training=True):
    method _new_gradclip (line 1004) | def _new_gradclip(self):
    method evaluate (line 1050) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre...
    method prediction_pipeline_step (line 1081) | def prediction_pipeline_step(self, model, inputs, prediction_loss_only...
    method restore_dataloader_status (line 1087) | def restore_dataloader_status(self):
    method _get_eval_sampler (line 1132) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]:
    method _get_train_sampler (line 1142) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
    method _maybe_log_save_evaluate (line 1152) | def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_...
    method create_scheduler (line 1316) | def create_scheduler(self, num_training_steps):
    method create_optimizer (line 1340) | def create_optimizer(self, lr_scheduler=None):
    method save_model (line 1404) | def save_model(self, output_dir=None):
    method _load_rng_state (line 1410) | def _load_rng_state(self, checkpoint):

FILE: paddleformers/cli/train/ernie_pretrain/src/utils/logging.py
  function setup_logger_output_file (line 41) | def setup_logger_output_file(outputpath, local_rank):

FILE: paddleformers/cli/train/ernie_pretrain/src/utils/misc.py
  class SmoothedValue (line 42) | class SmoothedValue:
    method __init__ (line 43) | def __init__(
    method update (line 52) | def update(self, value):
    method global_avg (line 63) | def global_avg(self):
    method reset (line 66) | def reset(self):
  class TrainingLogs (line 71) | class TrainingLogs:
    method __new__ (line 74) | def __new__(cls, *args, **kw):
    method __init__ (line 79) | def __init__(self):
    method set_trainer_interval (line 87) | def set_trainer_interval(self, trainer, logging_interval):
    method global_meters_keys (line 92) | def global_meters_keys(self):
    method global_meters_keys (line 96) | def global_meters_keys(self, lst):
    method enable_skip_zero (line 99) | def enable_skip_zero(self, keys=[]):
    method update (line 107) | def update(self, **kwargs):
    method is_enabled (line 111) | def is_enabled(self):
    method __setitem__ (line 114) | def __setitem__(self, k, v):
    method __getitem__ (line 122) | def __getitem__(self, v):
    method __getattr__ (line 125) | def __getattr__(self, attr):
    method dict (line 132) | def dict(self, use_async=False):
    method reset (line 186) | def reset(self):
    method take_snapshot (line 191) | def take_snapshot(self):
    method restore_snapshot (line 194) | def restore_snapshot(self):

FILE: paddleformers/cli/train/ernie_pretrain/src/utils/seed_utils.py
  function set_seed (line 26) | def set_seed(seed):

FILE: paddleformers/cli/train/ernie_pretrain/src/utils/training_utils.py
  function reset_per_device_batch_size (line 20) | def reset_per_device_batch_size(global_batch_size, per_device_train_batc...

FILE: paddleformers/cli/train/ernie_pretrain/workflow.py
  function log_trainer_start (line 84) | def log_trainer_start():
  function load_huggingface_checkpoint (line 94) | def load_huggingface_checkpoint(model, args):
  function get_expected_state_dict (line 189) | def get_expected_state_dict(model, **kwargs):
  function update_model_config_from_args (line 257) | def update_model_config_from_args(config: ErnieMoEConfig, model_args: di...
  function get_tp_split_ckpt (line 267) | def get_tp_split_ckpt(args, path):
  class AllArguments (line 279) | class AllArguments(PreTrainingArguments):
    method __post_init__ (line 280) | def __post_init__(self):
  class ExpConfig (line 285) | class ExpConfig:
  function create_pretrained_dataset (line 291) | def create_pretrained_dataset(args):
  function run_ernie_pretrain (line 337) | def run_ernie_pretrain(model_args, data_args, generating_args, training_...

FILE: paddleformers/cli/train/sft/dataset_formatting.py
  function conversations_formatting_function (line 31) | def conversations_formatting_function(tokenizer: AutoTokenizer, messages...
  function instructions_formatting_function (line 49) | def instructions_formatting_function(tokenizer: AutoTokenizer):
  function paddleformers_instructions_formatting_function (line 75) | def paddleformers_instructions_formatting_function(tokenizer: AutoTokeni...
  function get_formatting_func_from_dataset (line 101) | def get_formatting_func_from_dataset(dataset: Union[Dataset], tokenizer:...

FILE: paddleformers/cli/train/sft/make_data_utils.py
  class DataGenerator (line 18) | class DataGenerator:
    method __init__ (line 21) | def __init__(self, data_source):
    method __iter__ (line 34) | def __iter__(self):
    method __next__ (line 41) | def __next__(self):

FILE: paddleformers/cli/train/sft/sft_config.py
  class SFTConfig (line 30) | class SFTConfig(TrainingArguments):
    method __post_init__ (line 87) | def __post_init__(self):

FILE: paddleformers/cli/train/sft/sft_trainer.py
  class SFTTrainer (line 58) | class SFTTrainer(Trainer):
    method __init__ (line 59) | def __init__(
    method _prepare_dataset (line 197) | def _prepare_dataset(
    method _prepare_non_packed_dataloader (line 248) | def _prepare_non_packed_dataloader(
    method prediction_step (line 319) | def prediction_step(
    method log (line 378) | def log(self, logs: Dict[str, float], **kwargs) -> None:
    method get_ptq_dataloader (line 386) | def get_ptq_dataloader(self, ptq_ds):
    method ptq_loop (line 411) | def ptq_loop(

FILE: paddleformers/cli/train/sft/workflow.py
  function create_pretrained_dataset (line 89) | def create_pretrained_dataset(training_args, data_args, model_args):
  function run_sft (line 167) | def run_sft(
  function create_peft_model (line 735) | def create_peft_model(model_args, training_args, dtype, model):

FILE: paddleformers/cli/train/tuner.py
  function check_path (line 25) | def check_path(path):
  function _training_function (line 33) | def _training_function(config: dict[str, Any]) -> None:
  function run_tuner (line 71) | def run_tuner(args: Optional[dict[str, Any]] = None) -> None:

FILE: paddleformers/cli/utils/llm_utils.py
  function compute_metrics (line 44) | def compute_metrics(eval_preds):
  function get_lora_target_modules (line 55) | def get_lora_target_modules(model):
  function get_infer_model_path (line 424) | def get_infer_model_path(input_dir, model_prefix):
  function deserialize_from_file (line 432) | def deserialize_from_file(fp):
  function get_alibi_slopes (line 462) | def get_alibi_slopes(num_heads):
  function pad_batch_data (line 477) | def pad_batch_data(insts, masks=None, pad_id=0, return_seq_len=False, pa...
  function dybatch_preprocess (line 505) | def dybatch_preprocess(
  function load_real_time_tokens (line 735) | def load_real_time_tokens():
  function init_chat_template (line 752) | def init_chat_template(
  function get_model_max_position_embeddings (line 799) | def get_model_max_position_embeddings(config: PretrainedConfig) -> Optio...
  function read_res (line 812) | def read_res(
  function read_res_dynamic_insert (line 850) | def read_res_dynamic_insert(
  function speculate_read_res (line 899) | def speculate_read_res(
  function get_rotary_position_embedding (line 950) | def get_rotary_position_embedding(position_ids, head_dim, rope_theta=100...
  function init_dist_env (line 1000) | def init_dist_env():
  function get_eos_token_id (line 1044) | def get_eos_token_id(
  function set_triton_cache (line 1066) | def set_triton_cache(model_name_or_path, mode):

FILE: paddleformers/cli/utils/mllm_utils.py
  class MLLMModelMapping (line 27) | class MLLMModelMapping:
  class ModelKeys (line 37) | class ModelKeys:
  class MultiModelKeys (line 52) | class MultiModelKeys(ModelKeys):
    method __post_init__ (line 57) | def __post_init__(self):
  function register_multimodel_keys (line 66) | def register_multimodel_keys(multimodel_key: ModelKeys, *, exist_ok: boo...
  function get_multimodel_target_modules (line 73) | def get_multimodel_target_modules(model_type: Optional[str]) -> Optional...
  function get_multimodel_lora_target_modules (line 79) | def get_multimodel_lora_target_modules(model, target_modules, freeze_con...
  function freeze_model_parameters (line 131) | def freeze_model_parameters(model, freeze_config):

FILE: paddleformers/cli/utils/process.py
  function terminate_process_tree (line 26) | def terminate_process_tree(pid: int) -> None:
  function is_env_enabled (line 61) | def is_env_enabled(env_var: str, default: str = "0") -> bool:
  function is_valid_model_dir (line 66) | def is_valid_model_dir(directory: str) -> bool:
  function detect_device (line 75) | def detect_device() -> str:
  function set_ascend_environment (line 98) | def set_ascend_environment():
  function remove_paddle_shm_files (line 163) | def remove_paddle_shm_files():
  function set_cuda_environment (line 174) | def set_cuda_environment():
  function set_env_if_empty (line 197) | def set_env_if_empty(key, value):
  function add_new_special_tokens (line 212) | def add_new_special_tokens(tokenizer, path):

FILE: paddleformers/data/blendable_dataset.py
  function print_rank_0 (line 26) | def print_rank_0(*args, **kwargs):
  class BlendableDataset (line 31) | class BlendableDataset(paddle.io.Dataset):
    method __init__ (line 32) | def __init__(self, datasets, weights, size, share_folder, *, data_cach...
    method __len__ (line 175) | def __len__(self):
    method __getitem__ (line 178) | def __getitem__(self, idx):

FILE: paddleformers/data/causal_dataset.py
  function get_logits (line 35) | def get_logits(batch_ids, max_retries=1, timeout=1200, retry_delay=1, pr...
  function check_data_split (line 82) | def check_data_split(splits_string, do_train, do_eval, do_predict):
  function get_train_valid_test_split_ (line 102) | def get_train_valid_test_split_(splits_string, size):
  function get_datasets_weights_and_num_samples (line 129) | def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num...
  function print_rank_0 (line 160) | def print_rank_0(*args, **kwargs):
  function build_train_valid_test_datasets (line 165) | def build_train_valid_test_datasets(
  function _build_train_valid_test_datasets (line 261) | def _build_train_valid_test_datasets(
  function get_indexed_dataset_ (line 338) | def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
  class GPTDataset (line 350) | class GPTDataset(paddle.io.Dataset):
    method __init__ (line 351) | def __init__(
    method __len__ (line 425) | def __len__(self):
    method __getitem__ (line 430) | def __getitem__(self, idx):
  function _build_index_mappings (line 523) | def _build_index_mappings(
  function _num_tokens (line 713) | def _num_tokens(documents, sizes):
  function _num_epochs (line 718) | def _num_epochs(tokens_per_epoch, seq_length, num_samples):
  function _build_doc_idx (line 733) | def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
  function _build_sample_idx (line 749) | def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per...
  function _build_shuffle_idx (line 797) | def _build_shuffle_idx(num_samples, total_size, np_rng):

FILE: paddleformers/data/collate.py
  class Stack (line 26) | class Stack(object):
    method __init__ (line 38) | def __init__(self, axis=0, dtype=None):
    method __call__ (line 42) | def __call__(self, data):
  class Pad (line 72) | class Pad(object):
    method __init__ (line 95) | def __init__(self, pad_val=0, axis=0, ret_length=None, dtype=None, pad...
    method __call__ (line 102) | def __call__(self, data):
  class Tuple (line 169) | class Tuple(object):
    method __init__ (line 187) | def __init__(self, fn, *args):
    method __call__ (line 200) | def __call__(self, data):
  class Dict (line 247) | class Dict(object):
    method __init__ (line 266) | def __init__(self, fn):
    method __call__ (line 280) | def __call__(self, data):

FILE: paddleformers/data/data_collator.py
  class DataCollatorMixin (line 61) | class DataCollatorMixin:
    method __call__ (line 62) | def __call__(self, features, return_tensors=None):
  function default_data_collator (line 73) | def default_data_collator(features: List[InputDataClass], return_tensors...
  function paddle_default_data_collator (line 96) | def paddle_default_data_collator(features: List[InputDataClass]) -> Dict...
  function numpy_default_data_collator (line 128) | def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[...
  class DefaultDataCollator (line 162) | class DefaultDataCollator(DataCollatorMixin):
    method __call__ (line 179) | def __call__(self, features: List[Dict[str, Any]], return_tensors=None...
  class DataCollatorWithPadding (line 186) | class DataCollatorWithPadding:
    method __call__ (line 202) | def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
  class DataCollatorForTokenClassification (line 225) | class DataCollatorForTokenClassification(DataCollatorMixin):
    method paddle_call (line 262) | def paddle_call(self, features):
    method numpy_call (line 299) | def numpy_call(self, features):
  class DataCollatorForSeq2Seq (line 330) | class DataCollatorForSeq2Seq:
    method __call__ (line 376) | def __call__(self, features, return_tensors=None):
  class DataCollatorForEmbedding (line 434) | class DataCollatorForEmbedding:
    method __call__ (line 448) | def __call__(self, batch, return_tensors=None) -> Any:
    method process_data (line 507) | def process_data(self, data, pad_idx, max_len):
    method pad_batch_data (line 516) | def pad_batch_data(insts, pad_id=0, max_seq_len=None, return_seq_len=F...
    method gen_self_attn_mask (line 531) | def gen_self_attn_mask(batch_token_ids: List[List[int]], max_seq_len: ...
    method gen_attn_mask_start_row_indices (line 543) | def gen_attn_mask_start_row_indices(batch_token_ids: List[List[int]], ...
  function _paddle_collate_batch (line 561) | def _paddle_collate_batch(examples, tokenizer, pad_to_multiple_of: Optio...
  function _numpy_collate_batch (line 599) | def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Option...
  function tolist (line 633) | def tolist(x):
  class DataCollatorForLanguageModeling (line 642) | class DataCollatorForLanguageModeling(DataCollatorMixin):
    method paddle_call (line 671) | def paddle_call(self, examples: List[Union[List[int], Any, Dict[str, A...
    method paddle_mask_tokens (line 695) | def paddle_mask_tokens(self, inputs: Any, special_tokens_mask: Optiona...
    method numpy_call (line 736) | def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, An...
    method numpy_mask_tokens (line 758) | def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional...

FILE: paddleformers/data/dist_dataloader.py
  class DummyDataset (line 27) | class DummyDataset(paddle.io.Dataset):
    method __len__ (line 32) | def __len__(self):
  class IterableDummyDataset (line 36) | class IterableDummyDataset(paddle.io.IterableDataset):
    method __iter__ (line 37) | def __iter__(self):
  class DistDataLoader (line 41) | class DistDataLoader(paddle.io.DataLoader):
    method __init__ (line 46) | def __init__(
    method _dataloader_iter (line 132) | def _dataloader_iter(self):
    method __len__ (line 137) | def __len__(self):
    method __iter__ (line 143) | def __iter__(self):
    method _broadcast_data (line 146) | def _broadcast_data(self, data):
    method __next__ (line 201) | def __next__(self):
  function init_dataloader_comm_group (line 213) | def init_dataloader_comm_group():

FILE: paddleformers/data/indexed_dataset.py
  function print_rank_0 (line 40) | def print_rank_0(*args, **kwargs):
  function __best_fitting_dtype (line 45) | def __best_fitting_dtype(vocab_size=None):
  function get_available_dataset_impl (line 52) | def get_available_dataset_impl():
  function make_dataset (line 56) | def make_dataset(path, impl, skip_warmup=False):
  function make_sft_dataset (line 72) | def make_sft_dataset(path, dataclass, skip_warmup=False, impl="mmap"):
  function dataset_exists (line 85) | def dataset_exists(path, impl):
  function read_longs (line 92) | def read_longs(f, n):
  function write_longs (line 98) | def write_longs(f, a):
  function read_shorts (line 102) | def read_shorts(f, n):
  function write_shorts (line 108) | def write_shorts(f, a):
  function code (line 126) | def code(dtype):
  function index_file_path (line 133) | def index_file_path(prefix_path):
  function sft_index_file_path (line 137) | def sft_index_file_path(prefix_path):
  function sft_data_file_path (line 141) | def sft_data_file_path(prefix_path, dataclass):
  function data_file_path (line 149) | def data_file_path(prefix_path):
  function loss_mask_file_path (line 153) | def loss_mask_file_path(prefix_path):
  function create_doc_idx (line 157) | def create_doc_idx(sizes):
  class IndexedDataset (line 165) | class IndexedDataset(paddle.io.Dataset):
    method __init__ (line 170) | def __init__(self, path):
    method read_index (line 176) | def read_index(self, path):
    method read_data (line 193) | def read_data(self, path):
    method check_index (line 196) | def check_index(self, i):
    method __del__ (line 200) | def __del__(self):
    method __getitem__ (line 205) | def __getitem__(self, idx):
    method get (line 229) | def get(self, idx, offset=0, length=None):
    method __len__ (line 247) | def __len__(self):
    method num_tokens (line 250) | def num_tokens(self, index):
    method size (line 253) | def size(self, index):
    method exists (line 257) | def exists(path):
    method supports_prefetch (line 261) | def supports_prefetch(self):
    method doc_idx (line 265) | def doc_idx(self):
    method get_doc_idx (line 268) | def get_doc_idx(self):
    method set_doc_idx (line 271) | def set_doc_idx(self, doc_idx_):
  class IndexedDatasetBuilder (line 275) | class IndexedDatasetBuilder(object):
    method __init__ (line 287) | def __init__(self, out_file, dtype=np.int32):
    method add_item (line 296) | def add_item(self, tensor):
    method end_document (line 305) | def end_document(self):
    method merge_file_ (line 308) | def merge_file_(self, another_file):
    method finalize (line 333) | def finalize(self, index_file):
  function _warmup_mmap_file (line 354) | def _warmup_mmap_file(path):
  class MMapIndexedDataset (line 360) | class MMapIndexedDataset(paddle.io.Dataset):
    class Index (line 361) | class Index(object):
      method writer (line 365) | def writer(cls, path, dtype):
      method __init__ (line 410) | def __init__(self, path, skip_warmup=False):
      method __del__ (line 448) | def __del__(self):
      method dtype (line 453) | def dtype(self):
      method sizes (line 457) | def sizes(self):
      method doc_idx (line 461) | def doc_idx(self):
      method __getitem__ (line 465) | def __getitem__(self, i):
      method __len__ (line 468) | def __len__(self):
    method __init__ (line 471) | def __init__(self, path, skip_warmup=False):
    method __getstate__ (line 481) | def __getstate__(self):
    method __setstate__ (line 484) | def __setstate__(self, state):
    method _do_init (line 487) | def _do_init(self, path, skip_warmup):
    method __del__ (line 506) | def __del__(self):
    method __len__ (line 514) | def __len__(self):
    method __getitem__ (line 518) | def __getitem__(self, idx):
    method get (line 537) | def get(self, idx, offset=0, length=None):
    method sizes (line 555) | def sizes(self):
    method doc_idx (line 559) | def doc_idx(self):
    method get_doc_idx (line 562) | def get_doc_idx(self):
    method set_doc_idx (line 565) | def set_doc_idx(self, doc_idx_):
    method supports_prefetch (line 569) | def supports_prefetch(self):
    method exists (line 573) | def exists(path):
  class SFTMMapIndexedDataset (line 577) | class SFTMMapIndexedDataset(paddle.io.Dataset):
    class Index (line 578) | class Index(object):
      method writer (line 582) | def writer(cls, path, dtype):
      method __init__ (line 624) | def __init__(self, path, skip_warmup=False):
      method __del__ (line 662) | def __del__(self):
      method dtype (line 667) | def dtype(self):
      method sizes (line 671) | def sizes(self):
      method doc_idx (line 675) | def doc_idx(self):
      method __getitem__ (line 679) | def __getitem__(self, i):
      method __len__ (line 682) | def __len__(self):
    method __init__ (line 685) | def __init__(self, path, dataclass, skip_warmup=False):
    method __getstate__ (line 694) | def __getstate__(self):
    method __setstate__ (line 697) | def __setstate__(self, state):
    method _do_init (line 700) | def _do_init(self, path, skip_warmup):
    method __del__ (line 719) | def __del__(self):
    method __len__ (line 726) | def __len__(self):
    method __getitem__ (line 729) | def __getitem__(self, idx):
    method sizes (line 767) | def sizes(self):
    method doc_idx (line 771) | def doc_idx(self):
    method get_doc_idx (line 774) | def get_doc_idx(self):
    method set_doc_idx (line 777) | def set_doc_idx(self, doc_idx_):
    method supports_prefetch (line 781) | def supports_prefetch(self):
    method exists (line 785) | def exists(path, dataclass):
  function make_builder (line 794) | def make_builder(out_file, impl, save_dtype, loss_mask_file=None):
  class SFTMMapIndexedDatasetBuilder (line 801) | class SFTMMapIndexedDatasetBuilder(object):
    method __init__ (line 802) | def __init__(self, output_file_dict, dtype, index_file=None):
    method add_item (line 818) | def add_item(self, sequence):
    method add_item_bytes (line 827) | def add_item_bytes(self, serialized):
    method end_document (line 835) | def end_document(self):
    method finalize (line 842) | def finalize(self, index_file):
  class MMapIndexedDatasetBuilder (line 849) | class MMapIndexedDatasetBuilder(object):
    method __init__ (line 850) | def __init__(self, out_file, dtype, loss_mask_file=None):
    method flush_loss_mask_item (line 859) | def flush_loss_mask_item(self, loss_mask_lst):
    method add_item (line 864) | def add_item(self, tensor):
    method add_doc (line 869) | def add_doc(self, tensor, sizes):
    method end_document (line 875) | def end_document(self):
    method merge_file_ (line 878) | def merge_file_(self, another_file):
    method finalize (line 891) | def finalize(self, index_file):
  function get_indexed_dataset_ (line 903) | def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
  class CompatibleIndexedDataset (line 919) | class CompatibleIndexedDataset(paddle.io.Dataset):
    method __init__ (line 920) | def __init__(self, path):
    method __getstate__ (line 934) | def __getstate__(self):
    method __len__ (line 937) | def __len__(self):
    method __getitem__ (line 941) | def __getitem__(self, idx):
    method get (line 960) | def get(self, idx, offset=0, length=None):
    method sizes (line 976) | def sizes(self):
    method doc_idx (line 980) | def doc_idx(self):
    method get_doc_idx (line 983) | def get_doc_idx(self):
    method set_doc_idx (line 986) | def set_doc_idx(self, doc_idx_):
    method exists (line 990) | def exists(path):

FILE: paddleformers/data/sampler.py
  class SamplerHelper (line 22) | class SamplerHelper(object):
    method __init__ (line 46) | def __init__(self, dataset, iterable=None):
    method __iter__ (line 53) | def __iter__(self):
    method __len__ (line 63) | def __len__(self):
    method length (line 72) | def length(self):
    method length (line 86) | def length(self, length):
    method apply (line 89) | def apply(self, fn):
    method shuffle (line 105) | def shuffle(self, buffer_size=-1, seed=None):
    method sort (line 171) | def sort(self, cmp=None, key=None, reverse=False, buffer_size=-1):
    method batch (line 247) | def batch(self, batch_size, drop_last=False, batch_size_fn=None, key=N...
    method shard (line 335) | def shard(self, num_replicas=None, rank=None):
    method list (line 406) | def list(self):

FILE: paddleformers/data/tokenizer.py
  function get_idx_from_word (line 16) | def get_idx_from_word(word, word_to_idx, unk_word):
  class BaseTokenizer (line 22) | class BaseTokenizer(object):
    method __init__ (line 23) | def __init__(self, vocab):
    method get_tokenizer (line 26) | def get_tokenizer(self):
    method cut (line 29) | def cut(self, sentence):
    method encode (line 32) | def encode(self, sentence):

FILE: paddleformers/data/vocab.py
  class Vocab (line 24) | class Vocab(object):
    method __init__ (line 57) | def __init__(
    method _index_counter_keys (line 132) | def _index_counter_keys(self, counter, special_tokens, max_size, min_f...
    method _sort_index_according_to_user_specification (line 147) | def _sort_index_according_to_user_specification(self, token_to_idx):
    method to_tokens (line 172) | def to_tokens(self, indices):
    method to_indices (line 228) | def to_indices(self, tokens):
    method __getitem__ (line 259) | def __getitem__(self, tokens):
    method __len__ (line 268) | def __len__(self):
    method __contains__ (line 271) | def __contains__(self, token):
    method __call__ (line 274) | def __call__(self, tokens):
    method idx_to_token (line 284) | def idx_to_token(self):
    method token_to_idx (line 289) | def token_to_idx(self):
    method to_json (line 293) | def to_json(self, path=None):
    method from_json (line 333) | def from_json(cls, json_str):
    method from_dict (line 378) | def from_dict(cls, token_to_idx, unk_token=None, pad_token=None, bos_t...
    method build_vocab (line 431) | def build_vocab(
    method load_vocabulary (line 509) | def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_toke...
    method save_vocabulary (line 558) | def save_vocabulary(self, filepath):
    method get_unk_token_id (line 569) | def get_unk_token_id(self):
    method get_bos_token_id (line 572) | def get_bos_token_id(self):
    method get_eos_token_id (line 575) | def get_eos_token_id(self):
    method get_pad_token_id (line 578) | def get_pad_token_id(self):

FILE: paddleformers/datasets/DPODataset.py
  class Sequence (line 37) | class Sequence:
  class BaseDPODataSet (line 54) | class BaseDPODataSet:
    method __init__ (line 55) | def __init__(self, **dataset_config):
    method __len__ (line 90) | def __len__(self):
    method _generate_sequences (line 93) | def _generate_sequences(self):
    method _generate_greedy_packs (line 160) | def _generate_greedy_packs(self, sequences):
    method _preprocess_dpo_example (line 186) | def _preprocess_dpo_example(self, example):
    method __postprocess_before_concat (line 237) | def __postprocess_before_concat(self, example):
    method _postprocess_sequence (line 375) | def _postprocess_sequence(self, example):
  class IteratorDPODataset (line 495) | class IteratorDPODataset(BaseDPODataSet, IterableDataset):
    method __init__ (line 496) | def __init__(self, **dataset_config):
    method __iter__ (line 499) | def __iter__(self):
  class MapDPODataset (line 507) | class MapDPODataset(BaseDPODataSet, Dataset):
    method __init__ (line 508) | def __init__(self, **dataset_config):
    method __len__ (line 515) | def __len__(self):
    method __getitem__ (line 518) | def __getitem__(self, idx):

FILE: paddleformers/datasets/SFTDataset.py
  class TextSequence (line 39) | class TextSequence:
  class Sequence (line 49) | class Sequence:
  class BaseSFTDataset (line 62) | class BaseSFTDataset:
    method __init__ (line 63) | def __init__(self, **dataset_config):
    method __len__ (line 194) | def __len__(self):
    method _worker_loop (line 197) | def _worker_loop(self):
    method _get_processed_data_iterator (line 212) | def _get_processed_data_iterator(self, dataset_iterator, actual_exampl...
    method _process_sequence (line 317) | def _process_sequence(self, example, actual_example_num):
    method _process_pretraining_tokens (line 324) | def _process_pretraining_tokens(self, example, actual_example_num):
    method _generate_greedy_packs_from_sequences (line 328) | def _generate_greedy_packs_from_sequences(self, sequences):
    method _generate_sequences (line 357) | def _generate_sequences(self):
    method __iter__ (line 574) | def __iter__(self):
    method _encode_pretraining_messages (line 585) | def _encode_pretraining_messages(self, messages, actual_example_num):
    method _postprocess_pretraining_sequence (line 593) | def _postprocess_pretraining_sequence(self, example, actual_example_num):
    method _postprocess_sequence (line 679) | def _postprocess_sequence(self, example, actual_example_num):
    method print_max_steps_estimate_progress (line 858) | def print_max_steps_estimate_progress(self):
    method _add_dynamic_eos (line 869) | def _add_dynamic_eos(input_ids, labels, suffix_tokens_id):
    method _binpacking_process_batch (line 885) | def _binpacking_process_batch(self, iterator, batch_size):
  class IteratorSFTDataset (line 901) | class IteratorSFTDataset(BaseSFTDataset, IterableDataset):
    method __init__ (line 902) | def __init__(self, **dataset_config):
    method __iter__ (line 905) | def __iter__(self):
  class MapSFTDataset (line 913) | class MapSFTDataset(BaseSFTDataset, Dataset):
    method __init__ (line 914) | def __init__(self, **dataset_config):
    method __len__ (line 933) | def __len__(self):
    method __getitem__ (line 936) | def __getitem__(self, idx):

FILE: paddleformers/datasets/collate.py
  function calc_padding_size (line 28) | def calc_padding_size(seq_len: int, training_args) -> int:
  function dpo_collate_fn (line 48) | def dpo_collate_fn(
  function mm_dpo_collate_fn (line 193) | def mm_dpo_collate_fn(
  function collate_fn (line 442) | def collate_fn(
  function mm_collate_fn (line 523) | def mm_collate_fn(
  function pad_batch_data (line 724) | def pad_batch_data(
  function gen_self_attn_mask (line 777) | def gen_self_attn_mask(batch_token_ids: List[List[int]], max_seq_len: in...
  function gen_attn_mask_startend_row_indices (line 804) | def gen_attn_mask_startend_row_indices(

FILE: paddleformers/datasets/data_utils.py
  function round_up_to_multiple_of_8 (line 32) | def round_up_to_multiple_of_8(n):
  function print_debug_info (line 37) | def print_debug_info(tokenizer, data, label):
  function convert_to_tokens_for_pt (line 46) | def convert_to_tokens_for_pt(
  function convert_to_tokens_for_sft (line 68) | def convert_to_tokens_for_sft(
  function convert_to_input_ids (line 113) | def convert_to_input_ids(
  function function_call_chat_template (line 151) | def function_call_chat_template(tokenizer, messages, tools):
  function postprocess_fc_sequence (line 181) | def postprocess_fc_sequence(tokenizer, example):
  function estimate_training (line 188) | def estimate_training(train_dataset, data_args, training_args, model_args):
  function get_worker_sliced_iterator (line 308) | def get_worker_sliced_iterator(dataset):
  function calculate_matched_group (line 344) | def calculate_matched_group(sequences, packing_length: int, is_finished:...

FILE: paddleformers/datasets/dataset.py
  function load_from_ppnlp (line 56) | def load_from_ppnlp(path, *args, **kwargs):
  class DatasetTuple (line 69) | class DatasetTuple:
    method __init__ (line 70) | def __init__(self, splits):
    method __getitem__ (line 75) | def __getitem__(self, key):
    method __setitem__ (line 81) | def __setitem__(self, key, value):
    method _gen_identifier_map (line 84) | def _gen_identifier_map(self, splits):
    method __len__ (line 92) | def __len__(self):
  function import_main_class (line 96) | def import_main_class(module_path):
  function load_from_hf (line 117) | def load_from_hf(path, name=None, splits=None, **kwargs):
  function load_dataset (line 158) | def load_dataset(path_or_read_func, name=None, data_files=None, splits=N...
  class MapDataset (line 233) | class MapDataset(Dataset):
    method __init__ (line 246) | def __init__(self, data, **kwargs):
    method _transform (line 254) | def _transform(self, data):
    method __getitem__ (line 259) | def __getitem__(self, idx):
    method __len__ (line 266) | def __len__(self):
    method filter (line 272) | def filter(self, fn, num_workers=0):
    method _filter (line 303) | def _filter(self, fn):
    method shard (line 307) | def shard(self, num_shards=None, index=None, contiguous=False):
    method _shard (line 311) | def _shard(self, num_shards=None, index=None, contiguous=False):
    method map (line 345) | def map(self, fn, lazy=True, batched=False, num_workers=0):
    method _map (line 384) | def _map(self, fn, lazy=True, batched=False):
  class IterDataset (line 394) | class IterDataset(IterableDataset):
    method __init__ (line 407) | def __init__(self, data, **kwargs):
    method _transform (line 415) | def _transform(self, data):
    method _shard_filter (line 420) | def _shard_filter(self, num_samples):
    method _filter (line 423) | def _filter(self, data):
    method __iter__ (line 429) | def __iter__(self):
    method skip (line 451) | def skip(self, n):
    method filter (line 457) | def filter(self, fn):
    method shard (line 471) | def shard(self, num_shards=None, index=None):
    method map (line 498) | def map(self, fn):
  class DatasetBuilder (line 512) | class DatasetBuilder:
    method __init__ (line 523) | def __init__(self, lazy=None, name=None, **config):
    method read_datasets (line 529) | def read_datasets(self, splits=None, data_files=None):
    method read (line 614) | def read(self, filename, split="train"):
    method _read (line 727) | def _read(self, filename: str, *args):
    method _get_data (line 736) | def _get_data(self, mode: str):
    method get_labels (line 745) | def get_labels(self):
    method get_vocab (line 751) | def get_vocab(self):
  class SimpleBuilder (line 758) | class SimpleBuilder(DatasetBuilder):
    method __init__ (line 759) | def __init__(self, lazy, read_func):
    method read (line 763) | def read(self, **kwargs):

FILE: paddleformers/datasets/loader.py
  function create_dataset (line 21) | def create_dataset(**dataset_config: Dict[str, Any]):
  function create_indexed_dataset (line 47) | def create_indexed_dataset(data_file_prefix):

FILE: paddleformers/datasets/reader/convertor.py
  function convert_dpo_txt_data (line 18) | def convert_dpo_txt_data(data):
  function convert_txt_data (line 94) | def convert_txt_data(item):
  function convert_mm_data (line 144) | def convert_mm_data(item):
  function convert_pretraining_data (line 242) | def convert_pretraining_data(data):
  function erniekit_convertor (line 256) | def erniekit_convertor(item):
  function messages_convertor (line 272) | def messages_convertor(item):

FILE: paddleformers/datasets/reader/download_manager.py
  function HuggingFaceDownload (line 22) | def HuggingFaceDownload(repo_id, download_path, resume_download=True, ma...

FILE: paddleformers/datasets/reader/file_reader.py
  class BaseReader (line 32) | class BaseReader(IterableDataset):
    method __init__ (line 35) | def __init__(
  class FileReader (line 63) | class FileReader(BaseReader):
    method __init__ (line 64) | def __init__(
    method __iter__ (line 82) | def __iter__(self):
    method _get_extension (line 143) | def _get_extension(self):
    method _data_check (line 147) | def _data_check(self, data):
  class FileListReader (line 225) | class FileListReader(BaseReader):
    method __init__ (line 226) | def __init__(
    method __iter__ (line 246) | def __iter__(self):
    method _get_files (line 259) | def _get_files(self):
  function get_hf_dataset_config (line 268) | def get_hf_dataset_config(file_path):
  class HuggingFaceReader (line 275) | class HuggingFaceReader(BaseReader):
    method __init__ (line 276) | def __init__(
    method __iter__ (line 316) | def __iter__(self):

FILE: paddleformers/datasets/reader/io.py
  function load_json (line 23) | def load_json(file_path):
  function load_txt (line 47) | def load_txt(file_path):
  function load_csv (line 57) | def load_csv(file_path):
  function load_parquet (line 67) | def load_parquet(file_path):

FILE: paddleformers/datasets/reader/mix_datasets.py
  class BaseMixDataset (line 24) | class BaseMixDataset(IterableDataset):
    method __init__ (line 29) | def __init__(
    method __iter__ (line 60) | def __iter__(self):
    method __len__ (line 67) | def __len__(self):
  class RandomDataset (line 74) | class RandomDataset(BaseMixDataset):
    method __init__ (line 79) | def __init__(self, *args, **kwargs):
    method __iter__ (line 90) | def __iter__(self):
    method __len__ (line 116) | def __len__(self):
  class ConcatDataset (line 120) | class ConcatDataset(BaseMixDataset):
    method __init__ (line 129) | def __init__(self, *args, **kwargs):
    method __iter__ (line 140) | def __iter__(self):
    method __len__ (line 153) | def __len__(self):
  class InterLeaveDataset (line 158) | class InterLeaveDataset(BaseMixDataset):
    method __init__ (line 167) | def __init__(self, *args, **kwargs):
    method _build_dataset (line 186) | def _build_dataset(self):
    method __iter__ (line 239) | def __iter__(self):
    method __len__ (line 252) | def __len__(self):
  function create_dataset_instance (line 265) | def create_dataset_instance(class_name, *args, **kwargs):

FILE: paddleformers/datasets/reader/multi_source_datasets.py
  class InfiniteDataset (line 28) | class InfiniteDataset(IterableDataset):
    method __init__ (line 34) | def __init__(self, dataset, rng=None, random_shuffle=True):
    method __iter__ (line 49) | def __iter__(self):
  class MultiSourceDataset (line 62) | class MultiSourceDataset(IterableDataset):
    method __init__ (line 65) | def __init__(self, **dataset_config):
    method __iter__ (line 164) | def __iter__(self):

FILE: paddleformers/datasets/rlhf_datasets/protocol.py
  class TensorDict (line 35) | class TensorDict:
    method __init__ (line 36) | def __init__(self, source: dict, batch_size=None, num_batch_dims=1):
    method __setitem__ (line 44) | def __setitem__(self, key: str, tensor: paddle.Tensor):
    method __getitem__ (line 51) | def __getitem__(self, key):
    method keys (line 54) | def keys(self):
    method items (line 57) | def items(self):
    method to (line 60) | def to(self, device: str):
  function union_two_dict (line 66) | def union_two_dict(dict1: Dict, dict2: Dict):
  function pad_dataproto_to_divisor (line 84) | def pad_dataproto_to_divisor(data: "DataProto", size_divisor: int):
  function unpad_dataproto (line 110) | def unpad_dataproto(data: "DataProto", pad_size):
  function union_tensor_dict (line 116) | def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict...
  function union_numpy_dict (line 132) | def union_numpy_dict(tensor_dict1: dict[np.ndarray], tensor_dict2: dict[...
  function list_of_dict_to_dict_of_list (line 146) | def list_of_dict_to_dict_of_list(list_of_dict: list[dict]):
  function fold_batch_dim (line 158) | def fold_batch_dim(data: "DataProto", new_batch_size):
  function unfold_batch_dim (line 178) | def unfold_batch_dim(data: "DataProto", batch_dims=2):
  function collate_fn (line 197) | def collate_fn(x: list["DataProtoItem"]):
  class DataProtoItem (line 211) | class DataProtoItem:
  class DataProto (line 219) | class DataProto:
    method __post_init__ (line 228) | def __post_init__(self):
    method __len__ (line 232) | def __len__(self):
    method __getitem__ (line 241) | def __getitem__(self, item):
    method print_size (line 246) | def print_size(self, prefix=""):
    method check_consistency (line 263) | def check_consistency(self):
    method from_single_dict (line 288) | def from_single_dict(cls, data: Dict[str, Union[paddle.Tensor, np.ndar...
    method from_dict (line 303) | def from_dict(cls, tensors: Dict[str, paddle.Tensor], non_tensors=None...
    method to (line 339) | def to(self, device) -> "DataProto":
    method select (line 353) | def select(self, batch_keys=None, non_tensor_batch_keys=None, meta_inf...
    method pop (line 388) | def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_k...
    method rename (line 420) | def rename(self, old_keys=None, new_keys=None) -> "DataProto":
    method union (line 447) | def union(self, other: "DataProto") -> "DataProto":
    method make_iterator (line 465) | def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader...
    method chunk (line 498) | def chunk(self, chunks: int) -> List["DataProto"]:
    method concat (line 533) | def concat(data: List["DataProto"]) -> "DataProto":
    method reorder (line 557) | def reorder(self, indices):
    method repeat (line 565) | def repeat(self, repeat_times=2, interleave=True):

FILE: paddleformers/datasets/rlhf_datasets/rl_dataset.py
  function left_padding (line 33) | def left_padding(sequences, padding_value=0, max_length=None):
  function padding_batch_data (line 43) | def padding_batch_data(
  function collate_fn (line 63) | def collate_fn(data_list: list[dict], pad_token_id: int, requires_label:...
  class RLHFDataset (line 83) | class RLHFDataset(Dataset):
    method __init__ (line 91) | def __init__(
    method _read_files (line 124) | def _read_files(self):
    method tokenize (line 133) | def tokenize(
    method __len__ (line 155) | def __len__(self):
    method __getitem__ (line 158) | def __getitem__(self, index: int) -> dict[str, paddle.Tensor]:

FILE: paddleformers/datasets/template/augment_utils.py
  class RandomApply (line 24) | class RandomApply:
    method __init__ (line 25) | def __init__(self, transforms, p=0.5):
    method __call__ (line 29) | def __call__(self, x):
  class RandomDiscreteRotation (line 36) | class RandomDiscreteRotation:
    method __init__ (line 37) | def __init__(self, degrees, interpolation="nearest", expand=True):
    method __call__ (line 42) | def __call__(self, img):
  class JpegCompression (line 47) | class JpegCompression:
    method __init__ (line 48) | def __init__(self, quality_range=(20, 80)):
    method __call__ (line 51) | def __call__(self, img):
  class RandomScale (line 59) | class RandomScale:
    method __init__ (line 60) | def __init__(self, scale_range=(0.7, 1.3), interpolation="bicubic"):
    method __call__ (line 64) | def __call__(self, img):
  class RandomSingleSidePadding (line 75) | class RandomSingleSidePadding:
    method __init__ (line 76) | def __init__(self, padding_range=(0, 20), fill="white"):
    method __call__ (line 83) | def __call__(self, img):

FILE: paddleformers/datasets/template/formatter.py
  class Formatter (line 33) | class Formatter(ABC):
    method apply (line 38) | def apply(self, **kwargs) -> SLOTS:
    method extract (line 42) | def extract(self, content: str) -> Union[str, list["FunctionCall"]]:
  class EmptyFormatter (line 51) | class EmptyFormatter(Formatter):
    method __post_init__ (line 52) | def __post_init__(self):
    method apply (line 62) | def apply(self, **kwargs) -> SLOTS:
  class StringFormatter (line 67) | class StringFormatter(Formatter):
    method __post_init__ (line 68) | def __post_init__(self):
    method apply (line 78) | def apply(self, **kwargs) -> SLOTS:
  class FunctionFormatter (line 97) | class FunctionFormatter(StringFormatter):
    method __post_init__ (line 98) | def __post_init__(self):
    method apply (line 103) | def apply(self, **kwargs) -> SLOTS:
  class ToolFormatter (line 138) | class ToolFormatter(Formatter):
    method __post_init__ (line 139) | def __post_init__(self):
    method apply (line 143) | def apply(self, **kwargs) -> SLOTS:
  class ThinkingFormatter (line 153) | class ThinkingFormatter(StringFormatter):
    method __post_init__ (line 154) | def __post_init__(self):
    method apply (line 158) | def apply(self, **kwargs) -> SLOTS:

FILE: paddleformers/datasets/template/grounding_plugin.py
  class BaseGroundingPlugin (line 20) | class BaseGroundingPlugin:
    method normalize_bbox (line 21) | def normalize_bbox(self, bbox: List[float]) -> List[int]:
    method format_ref_object (line 24) | def format_ref_object(self, obj_name: str) -> str:
    method format_bbox (line 27) | def format_bbox(self, bbox: List[float]) -> str:
    method process_messages (line 31) | def process_messages(self, messages, objects):
  function register_grounding_plugin (line 62) | def register_grounding_plugin(name, plugin_class):
  function get_grounding_plugin (line 69) | def get_grounding_plugin(

FILE: paddleformers/datasets/template/mm_plugin.py
  function _make_batched_images (line 61) | def _make_batched_images(images, imglens: list[int]):
  function _check_video_is_nested_images (line 71) | def _check_video_is_nested_images(video) -> bool:
  class MMPluginMixin (line 77) | class MMPluginMixin:
    method _validate_input (line 83) | def _validate_input(
    method _validate_messages (line 121) | def _validate_messages(
    method _file_download (line 150) | def _file_download(self, url: str) -> bytes:
    method _img_download (line 163) | def _img_download(self, url: str) -> Image.Image:
    method _video_download (line 169) | def _video_download(self, url: str) -> VideoReader:
    method _preprocess_image (line 175) | def _preprocess_image(self, image, image_max_pixels, image_min_pixels,...
    method _get_video_sample_indices (line 192) | def _get_video_sample_indices(self, video_reader, video_fps, video_max...
    method _regularize_images (line 205) | def _regularize_images(self, images, **kwargs):
    method _regularize_videos (line 214) | def _regularize_videos(self, videos, **kwargs):
    method _regularize_audios (line 240) | def _regularize_audios(self, audios, sampling_rate: float, **kwargs):
    method _get_mm_inputs (line 251) | def _get_mm_inputs(
  class BasePlugin (line 308) | class BasePlugin(MMPluginMixin):
    method process_messages (line 309) | def process_messages(
    method process_tokens (line 322) | def process_tokens(self, tokens, processor):
    method get_mm_inputs (line 345) | def get_mm_inputs(
  class PaddleOCRVLPlugin (line 364) | class PaddleOCRVLPlugin(BasePlugin):
    method __init__ (line 368) | def __init__(self, image_token, video_token, audio_token, **kwargs):
    method get_ocr_augmentations (line 382) | def get_ocr_augmentations(
    method _preprocess_image (line 420) | def _preprocess_image(self, image, **kwargs):
    method _get_mm_inputs (line 443) | def _get_mm_inputs(
    method process_messages (line 465) | def process_messages(
  class ErnieVLPlugin (line 507) | class ErnieVLPlugin(BasePlugin):
    method convert_to_rgb (line 513) | def convert_to_rgb(self, image: Image.Image) -> Image.Image:
    method _preprocess_image (line 554) | def _preprocess_image(self, image, **kwargs):
    method _get_video_sample_indices (line 559) | def _get_video_sample_indices(self, video_reader, video_fps, video_max...
    method _regularize_videos (line 593) | def _regularize_videos(self, videos, **kwargs):
    method _get_mm_inputs (line 635) | def _get_mm_inputs(
    method process_messages (line 668) | def process_messages(
  class Qwen2VLPlugin (line 724) | class Qwen2VLPlugin(BasePlugin):
    method _preprocess_image (line 729) | def _preprocess_image(self, image, **kwargs):
    method _regularize_videos (line 746) | def _regularize_videos(self, videos, **kwargs):
    method _get_mm_inputs (line 785) | def _get_mm_inputs(
    method process_messages (line 819) | def process_messages(
  class Qwen2OmniPlugin (line 872) | class Qwen2OmniPlugin(Qwen2VLPlugin):
    method _get_mm_inputs (line 877) | def _get_mm_inputs(
    method _to_float_dtype (line 953) | def _to_float_dtype(data: Any, dtype: str) -> Any:
    method process_messages (line 976) | def process_messages(
  class Qwen3VLPlugin (line 1052) | class Qwen3VLPlugin(Qwen2VLPlugin):
    method _get_mm_inputs (line 1054) | def _get_mm_inputs(
    method process_messages (line 1095) | def process_messages(
  class GLM4VPlugin (line 1179) | class GLM4VPlugin(Qwen2VLPlugin):
    method _get_mm_inputs (line 1181) | def _get_mm_inputs(
    method process_messages (line 1217) | def process_messages(
    method get_mm_inputs (line 1296) | def get_mm_inputs(
  class Gemma3Plugin (line 1311) | class Gemma3Plugin(BasePlugin):
    method process_messages (line 1313) | def process_messages(
    method get_mm_inputs (line 1351) | def get_mm_inputs(
  class GlmOcrPlugin (line 1366) | class GlmOcrPlugin(BasePlugin):
    method process_messages (line 1379) | def process_messages(
  function register_mm_plugin (line 1470) | def register_mm_plugin(name: str, plugin_class: type["BasePlugin"]) -> N...
  function get_mm_plugin (line 1478) | def get_mm_plugin(

FILE: paddleformers/datasets/template/template.py
  class Role (line 49) | class Role(str, Enum):
  class Template (line 58) | class Template:
    method encode_oneturn (line 77) | def encode_oneturn(
    method encode_multiturn (line 93) | def encode_multiturn(
    method add_thought (line 104) | def add_thought(self, content: str = "") -> str:
    method remove_thought (line 108) | def remove_thought(self, content: str) -> str:
    method get_thought_word_ids (line 113) | def get_thought_word_ids(self, tokenizer: "PreTrainedTokenizer") -> li...
    method _convert_elements_to_ids (line 117) | def _convert_elements_to_ids(self, tokenizer: "PreTrainedTokenizer", e...
    method _encode (line 136) | def _encode(
    method _add_or_replace_eos_token (line 185) | def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_to...
    method fix_special_tokens (line 201) | def fix_special_tokens(self, tokenizer: "PreTrainedTokenizer") -> None:
  class ReasoningTemplate (line 222) | class ReasoningTemplate(Template):
    method encode_oneturn (line 226) | def encode_oneturn(
    method encode_multiturn (line 253) | def encode_multiturn(
  class Llama2Template (line 280) | class Llama2Template(Template):
    method _encode (line 284) | def _encode(
  class ErnieThinkingTemplate (line 328) | class ErnieThinkingTemplate(ReasoningTemplate):
    method _encode (line 332) | def _encode(
  function register_template (line 383) | def register_template(
  function parse_template (line 458) | def parse_template(tokenizer: "PreTrainedTokenizer") -> "Template":
  function get_template_and_fix_tokenizer (line 521) | def get_template_and_fix_tokenizer(dataset_config) -> "Template":
  function _get_gpt_oss_prefix (line 841) | def _get_gpt_oss_prefix():

FILE: paddleformers/datasets/template/tool_utils.py
  class FunctionCall (line 29) | class FunctionCall(NamedTuple):
  class ToolUtils (line 82) | class ToolUtils(ABC):
    method tool_formatter (line 87) | def tool_formatter(tools: list[dict[str, Any]]) -> str:
    method function_formatter (line 93) | def function_formatter(functions: list["FunctionCall"]) -> str:
  class DefaultToolUtils (line 98) | class DefaultToolUtils(ToolUtils):
    method tool_formatter (line 103) | def tool_formatter(tools: list[dict[str, Any]]) -> str:
    method function_formatter (line 138) | def function_formatter(functions: list["FunctionCall"]) -> str:
  class QwenToolUtils (line 142) | class QwenToolUtils(ToolUtils):
    method tool_formatter (line 147) | def tool_formatter(tools: list[dict[str, Any]]) -> str:
    method function_formatter (line 157) | def function_formatter(functions: list["FunctionCall"]) -> str:
  class GLM4ToolUtils (line 165) | class GLM4ToolUtils(ToolUtils):
    method tool_formatter (line 170) | def tool_formatter(tools: list[dict[str, Any]]) -> str:
    method function_formatter (line 182) | def function_formatter(functions: list["FunctionCall"]) -> str:
  class GLM4MOEToolUtils (line 189) | class GLM4MOEToolUtils(QwenToolUtils):
    method tool_formatter (line 194) | def tool_formatter(tools: list[dict[str, Any]]) -> str:
    method function_formatter (line 204) | def function_formatter(functions: list["FunctionCall"]) -> str:
  class Llama3ToolUtils (line 221) | class Llama3ToolUtils(ToolUtils):
    method tool_formatter (line 229) | def tool_formatter(tools: list[dict[str, Any]]) -> str:
    method function_formatter (line 240) | def function_formatter(functions: list["FunctionCall"]) -> str:
  class ERNIEToolUtils (line 245) | class ERNIEToolUtils(ToolUtils):
    method tool_formatter (line 250) | def tool_formatter(tools: list[dict[str, Any]]) -> str:
    method function_formatter (line 261) | def function_formatter(functions: list["FunctionCall"]) -> str:
  class ERNIEVLToolUtils (line 269) | class ERNIEVLToolUtils(ToolUtils):
    method tool_formatter (line 274) | def tool_formatter(tools: list[dict[str, Any]]) -> str:
    method function_formatter (line 285) | def function_formatter(functions: list["FunctionCall"]) -> str:
  function get_tool_utils (line 304) | def get_tool_utils(name: str) -> "ToolUtils":

FILE: paddleformers/generation/configuration_utils.py
  function resolve_hf_generation_config_path (line 36) | def resolve_hf_generation_config_path(repo_id: str, cache_dir: str, subf...
  class GenerationConfig (line 62) | class GenerationConfig:
    method _get_generation_mode (line 131) | def _get_generation_mode(self):
    method __init__ (line 142) | def __init__(self, **kwargs):
    method __eq__ (line 206) | def __eq__(self, other):
    method __repr__ (line 218) | def __repr__(self):
    method validate (line 221) | def validate(self, is_init=False):
    method save_pretrained (line 292) | def save_pretrained(
    method from_pretrained (line 337) | def from_pretrained(
    method _dict_from_json_file (line 434) | def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
    method dict_paddle_dtype_to_str (line 439) | def dict_paddle_dtype_to_str(self, d: Dict[str, Any]) -> None:
    method from_dict (line 452) | def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "Generati...
    method to_diff_dict (line 476) | def to_diff_dict(self) -> Dict[str, Any]:
    method to_dict (line 499) | def to_dict(self) -> Dict[str, Any]:
    method to_json_string (line 514) | def to_json_string(self, use_diff: bool = True) -> str:
    method to_json_file (line 532) | def to_json_file(self, json_file_path: Union[str, os.PathLike], use_di...
    method from_model_config (line 547) | def from_model_config(cls, model_config: PretrainedConfig) -> "Generat...
    method update (line 575) | def update(self, **kwargs):

FILE: paddleformers/generation/logits_process.py
  class LogitsProcessor (line 26) | class LogitsProcessor(ABC):
    method __call__ (line 32) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor):
  class LogitsProcessorList (line 38) | class LogitsProcessorList:
    method __init__ (line 41) | def __init__(self, processors: List[LogitsProcessor] = None) -> None:
    method __call__ (line 47) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor, **...
    method append (line 59) | def append(self, processor: LogitsProcessor):
  class MinLengthLogitsProcessor (line 63) | class MinLengthLogitsProcessor(LogitsProcessor):
    method __init__ (line 72) | def __init__(self, min_length: int, eos_token_id: Union[int, List[int]]):
    method __call__ (line 82) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor):
  class RepetitionPenaltyLogitsProcessor (line 89) | class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
    method __init__ (line 99) | def __init__(self, penalty: float):
    method __call__ (line 105) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor):
  function _get_ngrams (line 113) | def _get_ngrams(ngram_size: int, prev_input_ids: paddle.Tensor, num_hypo...
  function _get_generated_ngrams (line 140) | def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur...
  function _calc_banned_ngram_tokens (line 162) | def _calc_banned_ngram_tokens(ngram_size: int, prev_input_ids: paddle.Te...
  class NoRepeatNGramLogitsProcessor (line 177) | class NoRepeatNGramLogitsProcessor(LogitsProcessor):
    method __init__ (line 186) | def __init__(self, ngram_size: int):
    method __call__ (line 191) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor):
  class HammingDiversityLogitsProcessor (line 204) | class HammingDiversityLogitsProcessor(LogitsProcessor):
    method __init__ (line 219) | def __init__(self, diversity_rate: float, num_beams: int, num_beam_gro...
    method __call__ (line 230) | def __call__(
  class ForcedBOSTokenLogitsProcessor (line 252) | class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
    method __init__ (line 261) | def __init__(self, forced_bos_token_id: int):
    method __call__ (line 264) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor):
  class ForcedEOSTokenLogitsProcessor (line 272) | class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
    method __init__ (line 281) | def __init__(self, max_length: int, forced_eos_token_id: Union[int, Li...
    method __call__ (line 285) | def __call__(self, input_ids, scores):
  function TopKProcess (line 293) | def TopKProcess(probs: paddle.Tensor, top_k: int, min_tokens_to_keep: int):
  function TopPProcess (line 311) | def TopPProcess(probs: paddle.Tensor, top_p: float, min_tokens_to_keep: ...
  class LogitsWarper (line 347) | class LogitsWarper:
    method __call__ (line 350) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor):
  class TemperatureLogitsWarper (line 356) | class TemperatureLogitsWarper(LogitsWarper):
    method __init__ (line 364) | def __init__(self, temperature: float):
    method __call__ (line 370) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor):
  class SequenceBiasLogitsProcessor (line 375) | class SequenceBiasLogitsProcessor(LogitsProcessor):
    method __init__ (line 437) | def __init__(self, sequence_bias: Dict[Tuple[int], float]):
    method __call__ (line 446) | def __call__(self, input_ids, scores):
    method _prepare_bias_variables (line 484) | def _prepare_bias_variables(self, scores):
    method _validate_arguments (line 508) | def _validate_arguments(self):
  class NoBadWordsLogitsProcessor (line 527) | class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
    method __init__ (line 591) | def __init__(self, bad_words_ids: List[List[int]], eos_token_id: Union...
    method _validate_arguments (line 608) | def _validate_arguments(self):
  class PrefixConstrainedLogitsProcessor (line 623) | class PrefixConstrainedLogitsProcessor(LogitsProcessor):
    method __init__ (line 636) | def __init__(self, prefix_allowed_tokens_fn: Callable[[int, paddle.Ten...
    method __call__ (line 640) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor) ->...

FILE: paddleformers/generation/stopping_criteria.py
  class StoppingCriteria (line 24) | class StoppingCriteria(ABC):
    method __call__ (line 30) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor, **...
  class MaxTimeCriteria (line 34) | class MaxTimeCriteria(StoppingCriteria):
    method __init__ (line 47) | def __init__(self, max_time: float, initial_timestamp: Optional[float]...
    method __call__ (line 51) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **...
  class MaxLengthCriteria (line 55) | class MaxLengthCriteria(StoppingCriteria):
    method __init__ (line 65) | def __init__(self, max_length: int):
    method __call__ (line 68) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **...
  class StoppingCriteriaList (line 72) | class StoppingCriteriaList(list):
    method __call__ (line 73) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **...
    method max_length (line 77) | def max_length(self):
  function validate_stopping_criteria (line 84) | def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, ...

FILE: paddleformers/generation/streamers.py
  class BaseStreamer (line 26) | class BaseStreamer:
    method put (line 31) | def put(self, value):
    method end (line 35) | def end(self):
  class TextStreamer (line 40) | class TextStreamer(BaseStreamer):
    method __init__ (line 67) | def __init__(self, tokenizer: PreTrainedTokenizer, skip_prompt: bool =...
    method put (line 77) | def put(self, value):
    method end (line 111) | def end(self):
    method on_finalized_text (line 125) | def on_finalized_text(self, text: str, stream_end: bool = False):
    method _is_chinese_char (line 129) | def _is_chinese_char(self, cp):
  class TextIteratorStreamer (line 154) | class TextIteratorStreamer(TextStreamer):
    method __init__ (line 195) | def __init__(
    method on_finalized_text (line 207) | def on_finalized_text(self, text: str, stream_end: bool = False):
    method __iter__ (line 213) | def __iter__(self):
    method __next__ (line 216) | def __next__(self):

FILE: paddleformers/generation/utils.py
  function _make_sliding_window_mask (line 66) | def _make_sliding_window_mask(input_shape, past_key_values_length=0, win...
  function get_unfinished_flag (line 95) | def get_unfinished_flag(
  class BeamHypotheses (line 126) | class BeamHypotheses:
    method __init__ (line 127) | def __init__(self, num_beams, length_penalty, early_stopping):
    method __len__ (line 137) | def __len__(self):
    method add (line 143) | def add(self, hyp, sum_logprobs, origin_len=0):
    method is_done (line 157) | def is_done(self, best_sum_logprobs, cur_len, origin_len=0):
  class BeamSearchScorer (line 173) | class BeamSearchScorer(object):
    method __init__ (line 178) | def __init__(
    method is_done (line 221) | def is_done(self):
    method process (line 224) | def process(
    method finalize (line 292) | def finalize(
  class GenerationMixin (line 347) | class GenerationMixin(object):
    method prepare_input_ids_for_generation (line 357) | def prepare_input_ids_for_generation(bos_token_id, encoder_output=None):
    method prepare_attention_mask_for_generation (line 366) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos...
    method _prepare_decoder_attention_mask (line 386) | def _prepare_decoder_attention_mask(
    method prepare_seq_len_for_generation (line 467) | def prepare_seq_len_for_generation(input_ids, pad_token_id, eos_token_...
    method get_logits_processor (line 478) | def get_logits_processor(
    method expand_inputs_for_generation (line 527) | def expand_inputs_for_generation(input_ids, expand_size, attention_mas...
    method update_model_kwargs_for_generation (line 561) | def update_model_kwargs_for_generation(outputs, model_kwargs, is_encod...
    method update_scores_for_generation (line 614) | def update_scores_for_generation(scores, next_scores, length, unfinish...
    method prepare_encoder_decoder_kwargs_for_generation (line 623) | def prepare_encoder_decoder_kwargs_for_generation(self, input_ids, mod...
    method prepare_decoder_input_ids_for_generation (line 641) | def prepare_decoder_input_ids_for_generation(self, input_ids, decoder_...
    method get_decoder_start_token_id (line 651) | def get_decoder_start_token_id(self, decoder_start_token_id=None, bos_...
    method prepare_inputs_for_generation (line 669) | def prepare_inputs_for_generation(
    method adjust_logits_during_generation (line 781) | def adjust_logits_during_generation(self, logits):
    method prepare_fast_entry (line 787) | def prepare_fast_entry(self, kwargs):
    method _convert_to_fast (line 790) | def _convert_to_fast(self, kwargs):
    method _build_fast (line 794) | def _build_fast(self, kwargs):
    method set_pad_token_id (line 806) | def set_pad_token_id(self, pad_token_id, eos_token_id):
    method generate (line 818) | def generate(
    method greedy_search (line 1248) | def greedy_search(
    method sample (line 1349) | def sample(
    method _get_model_inputs_spec (line 1493) | def _get_model_inputs_spec(self, dtype: str):
    method to_static (line 1502) | def to_static(self, path: str, config: dict):
    method sample_d2s (line 1543) | def sample_d2s(
    method reorder_cache (line 1677) | def reorder_cache(self, cache, beam_idx):
    method beam_search (line 1684) | def beam_search(
    method group_beam_search (line 1849) | def group_beam_search(

FILE: paddleformers/mergekit/merge_config.py
  class MergeConfig (line 25) | class MergeConfig:
    method __post_init__ (line 81) | def __post_init__(self):
    method config_check (line 84) | def config_check(self):
    method __dict__ (line 128) | def __dict__(self):
    method to_dict (line 131) | def to_dict(self):
    method save_pretrained (line 134) | def save_pretrained(self, save_directory):
    method from_pretrained (line 155) | def from_pretrained(cls, pretrained_model_path, **kwargs):
    method from_json_file (line 180) | def from_json_file(cls, path_json_file):

FILE: paddleformers/mergekit/merge_method.py
  class MergeMethod (line 19) | class MergeMethod:
    method __init__ (line 20) | def __init__(self, merge_config, sparsify_method=None):
    method merge (line 24) | def merge(self, tensor_list):
    method linear (line 36) | def linear(self, tensor_list):
    method slerp (line 58) | def slerp(self, tensor_list):
    method ties (line 127) | def ties(self, tensor_list):
    method normalize (line 192) | def normalize(self, t):

FILE: paddleformers/mergekit/merge_model.py
  class MergeModel (line 63) | class MergeModel:
    method __init__ (line 64) | def __init__(self, merge_config):
    method reset_merge_model (line 70) | def reset_merge_model(self, merge_config=None, merge_param_dict=None):
    method merge_model (line 94) | def merge_model(self):
    method copy_file (line 107) | def copy_file(self):
    method mergekit (line 121) | def mergekit(self):
    method merge_mix_model (line 135) | def merge_mix_model(self, file_type_list):
    method get_model_state_dict (line 249) | def get_model_state_dict(self, model_path, file_type, key_list=None, f...
    method get_safetensor_index (line 304) | def get_safetensor_index(self, model_path, file_type):
    method merge_safetensor_model (line 319) | def merge_safetensor_model(self, file_type_list):
    method shard_merge_np (line 406) | def shard_merge_np(
    method shard_merge_pd (line 451) | def shard_merge_pd(
    method check_model_path (line 523) | def check_model_path(self, model_path, lora_merge=False):
    method check_lora_model_path (line 536) | def check_lora_model_path(self, model_path):
    method weight_name (line 547) | def weight_name(self):
    method safe_weight_name (line 553) | def safe_weight_name(self):
    method safe_index_name (line 559) | def safe_index_name(self):
    method merge_lora_model (line 565) | def merge_lora_model(self):
    method get_split_qkv_hidden_size (line 576) | def get_split_qkv_hidden_size(self, base_state_dict):
    method split_fuse_lora_state_dict (line 589) | def split_fuse_lora_state_dict(self, base_state_dict, lora_state_dict):
    method shard_lora_merge (line 618) | def shard_lora_merge(self, base_index, shard_file, lora_config, file_t...
    method merge_safetensor_lora_model (line 691) | def merge_safetensor_lora_model(self, file_type_list):
    method merge_pdparams_lora_model (line 792) | def merge_pdparams_lora_model(self, file_type_list):

FILE: paddleformers/mergekit/merge_utils.py
  function divide_positions (line 17) | def divide_positions(m, n):
  function divide_lora_key_list (line 35) | def divide_lora_key_list(key_list, n, lora_config):
  function divide_safetensor_key_list (line 58) | def divide_safetensor_key_list(weight_map, n):

FILE: paddleformers/mergekit/sparsify_method.py
  class SparsifyMethod (line 18) | class SparsifyMethod:
    method __init__ (line 19) | def __init__(self, merge_config):
    method sparsify (line 22) | def sparsify(self, tensor):
    method dare (line 34) | def dare(self, tensor):
    method magprune (line 47) | def magprune(self, tensor):
    method trim (line 91) | def trim(self, tensor):

FILE: paddleformers/nn/activation.py
  class ClassInstantier (line 20) | class ClassInstantier(OrderedDict):
    method __getitem__ (line 21) | def __getitem__(self, key):

FILE: paddleformers/nn/attention/eager_attention.py
  function repeat_kv (line 23) | def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor:
  function eager_attention_forward (line 31) | def eager_attention_forward(

FILE: paddleformers/nn/attention/flashmask_attention.py
  function flashmask_attention_forward (line 24) | def flashmask_attention_forward(

FILE: paddleformers/nn/attention/interface.py
  class AttentionInterface (line 22) | class AttentionInterface(GeneralInterface):

FILE: paddleformers/nn/attention/sdpa_attention.py
  function sdpa_attention_forward (line 24) | def sdpa_attention_forward(

FILE: paddleformers/nn/attention/sink_impl.py
  function _get_fa_version (line 26) | def _get_fa_version():
  function _flash_attention_forward_dispatch (line 33) | def _flash_attention_forward_dispatch(
  function _flash_attention_backward_dispatch (line 87) | def _flash_attention_backward_dispatch(
  function _flashmask_attention_forward_dispatch (line 129) | def _flashmask_attention_forward_dispatch(
  function _flashmask_attention_backward_dispatch (line 180) | def _flashmask_attention_backward_dispatch(
  class FlashMaskSinkPyLayer (line 222) | class FlashMaskSinkPyLayer(PyLayer):
    method forward (line 232) | def forward(
    method backward (line 387) | def backward(ctx, grad_output):
  function sink_attention_forward (line 550) | def sink_attention_forward(

FILE: paddleformers/nn/attention/utils.py
  function repeat_kv (line 18) | def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor:

FILE: paddleformers/nn/criterion/dpo_loss.py
  function dpo_preprocess_inputs (line 33) | def dpo_preprocess_inputs(self, logits, labels):
  function loss_impl (line 50) | def loss_impl(self, logits, labels):
  function dpo_logps (line 56) | def dpo_logps(
  function cal_dpo_loss (line 248) | def cal_dpo_loss(
  function dpo_loss_forward (line 336) | def dpo_loss_forward(

FILE: paddleformers/nn/criterion/interface.py
  class LossInterface (line 27) | class LossInterface(GeneralInterface):
  class CriterionLayer (line 40) | class CriterionLayer(nn.Layer):
    method __init__ (line 41) | def __init__(self, config, return_tuple=True, use_infohub=False, **kwa...
    method forward (line 89) | def forward(self, logits, labels, loss_mask=None, **kwargs):

FILE: paddleformers/nn/criterion/kto_loss.py
  function kto_preprocess_inputs (line 36) | def kto_preprocess_inputs(self, logits, labels):
  function _nested_gather (line 51) | def _nested_gather(self, tensors):
  function kto_logps (line 69) | def kto_logps(
  function kto_loss (line 198) | def kto_loss(
  function kto_loss_forward (line 231) | def kto_loss_forward(

FILE: paddleformers/nn/criterion/loss_utils.py
  function calc_lm_head_logits (line 23) | def calc_lm_head_logits(
  function subbatch (line 67) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar...

FILE: paddleformers/nn/criterion/sft_loss.py
  function sft_preprocess_inputs (line 29) | def sft_preprocess_inputs(self, logits, labels):
  function sft_postprocess_loss (line 44) | def sft_postprocess_loss(self, masked_lm_loss, labels, loss_mask, **kwar...
  function loss_impl (line 60) | def loss_impl(self, logits, labels):
  function sft_calculate_loss (line 66) | def sft_calculate_loss(self, logits, hidden_states, lm_head_weight, lm_h...
  function sft_loss_forward (line 128) | def sft_loss_forward(
  function mtp_sft_loss_forward (line 188) | def mtp_sft_loss_forward(

FILE: paddleformers/nn/embedding.py
  class Embedding (line 24) | class Embedding(GeneralInterface):
    method create (line 31) | def create(
    method process_kwargs (line 57) | def process_kwargs(self, embedding_type, **kwargs):
    method get_embedding_type (line 67) | def get_embedding_type(self, config: PretrainedConfig):

FILE: paddleformers/nn/general.py
  class GeneralInterface (line 19) | class GeneralInterface(MutableMapping):
    method __init__ (line 29) | def __init__(self):
    method __getitem__ (line 32) | def __getitem__(self, key):
    method __setitem__ (line 38) | def __setitem__(self, key, value):
    method __delitem__ (line 42) | def __delitem__(self, key):
    method __iter__ (line 45) | def __iter__(self):
    method __len__ (line 49) | def __len__(self):
    method register (line 53) | def register(cls, key: str, value: Callable):
    method valid_keys (line 56) | def valid_keys(self) -> list[str]:

FILE: paddleformers/nn/linear.py
  class Linear (line 29) | class Linear(GeneralInterface):
    method create (line 39) | def create(
    method get_linear_type (line 62) | def get_linear_type(self, config: PretrainedConfig, tp_plan: str = None):
    method get_linear_kwargs (line 72) | def get_linear_kwargs(self, linear_type, has_bias=False, gather_output...

FILE: paddleformers/nn/lm_head.py
  class LMHead (line 27) | class LMHead(nn.Layer):
    method __init__ (line 28) | def __init__(self, config: PretrainedConfig):
    method _set_distributed_attr (line 68) | def _set_distributed_attr(self, param):
    method forward (line 73) | def forward(self, hidden_states, tensor_parallel_output=None):
    method extra_repr (line 111) | def extra_repr(self):
    method sharded_state_dict (line 114) | def sharded_state_dict(

FILE: paddleformers/nn/mlp.py
  class MLP (line 25) | class MLP(nn.Layer):
    method __init__ (line 26) | def __init__(
    method forward (line 105) | def forward(self, x):

FILE: paddleformers/nn/moe/abstract.py
  class MOELayerBase (line 18) | class MOELayerBase(nn.Layer):

FILE: paddleformers/nn/moe/all_gather.py
  function allgather_async (line 26) | def allgather_async(input, group=None):
  function reduce_scatter_async (line 51) | def reduce_scatter_async(input, group=None):
  class AllGatherAsync (line 86) | class AllGatherAsync(PyLayer):
    method forward (line 92) | def forward(ctx, input, *fn_args, group=None, fn=None, is_first_fwd=Fa...
    method backward (line 116) | def backward(ctx, grad, *fn_out_grads):
  class AlltoAllSmart (line 137) | class AlltoAllSmart(paddle.autograd.PyLayer):
    method forward (line 143) | def forward(
    method backward (line 334) | def backward(
  class AlltoAllSmartXPU (line 415) | class AlltoAllSmartXPU(paddle.autograd.PyLayer):
    method forward (line 421) | def forward(
    method backward (line 611) | def backward(

FILE: paddleformers/nn/moe/all_to_all.py
  class AlltoAll (line 23) | class AlltoAll(PyLayer):
    method forward (line 29) | def forward(ctx, x, group, sync_op=True):
    method backward (line 53) | def backward(ctx, *dx):
  class AlltoAllAsync (line 66) | class AlltoAllAsync(PyLayer):
    method forward (line 72) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False):
    method backward (line 106) | def backward(ctx, dx_out, *fn_out_grads):

FILE: paddleformers/nn/moe/moe_allgather_layer.py
  class ReshardCombineWeight (line 52) | class ReshardCombineWeight(PyLayer):
    method forward (line 58) | def forward(ctx, input, group=None):
    method backward (line 78) | def backward(ctx, grad):
  class MOEAllGatherLayerV2 (line 95) | class MOEAllGatherLayerV2(MOEAlltoAllLayer):
    method __init__ (line 100) | def __init__(
    method forward (line 145) | def forward(
    method fused_gate_logits_process_fused (line 371) | def fused_gate_logits_process_fused(self, gate_logits_lm, gate_logits_...
    method fused_gate_and_dispatch (line 456) | def fused_gate_and_dispatch(self, input, token_type_ids=None, global_d...
    method forward_experts (line 666) | def forward_experts(self, *dispatched_input):
    method calc_router_loss_and_logging (line 748) | def calc_router_loss_and_logging(

FILE: paddleformers/nn/moe/moe_alltoall_layer.py
  class GateCombine (line 42) | class GateCombine(PyLayer):
    method forward (line 48) | def forward(ctx, x, combine_weights, scatter_index):
    method backward (line 67) | def backward(ctx, grad_y, *_):
  function combining (line 87) | def combining(x, combine_weights, scatter_index, hard_gate=False):
  class MOEAlltoAllLayer (line 113) | class MOEAlltoAllLayer(MOELayerBase):
    method __init__ (line 118) | def __init__(
    method forward_experts (line 199) | def forward_experts(self, dispatched_input):
    method fused_gate_logits_process (line 240) | def fused_gate_logits_process(self, gate_logits, token_type_ids=None, ...
    method gate_and_dispatch (line 303) | def gate_and_dispatch(self, input, token_type_ids=None):
    method _calc_router_loss (line 392) | def _calc_router_loss(
    method calc_router_loss_and_logging (line 444) | def calc_router_loss_and_logging(
    method combine_expert_output (line 540) | def combine_expert_output(self, expert_output, combine_weights, scatte...
    method forward_single_stage (line 560) | def forward_single_stage(self, dispatched_input, stage_id):
    method all2all_expert_overlap (line 574) | def all2all_expert_overlap(self, x, group):
    method forward (line 602) | def forward(

FILE: paddleformers/nn/moe/moe_block.py
  function create_moe_block (line 31) | def create_moe_block(
  class MoEStatics (line 87) | class MoEStatics(nn.Layer):
    method __init__ (line 93) | def __init__(self, config, layer_idx):

FILE: paddleformers/nn/moe/topk_gate.py
  function masked_fill (line 40) | def masked_fill(x, mask, value):
  function compute_optimal_transport (line 57) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:...
  function cast_if_needed (line 86) | def cast_if_needed(x, dtype):
  class FusedGateDetachMatmul (line 100) | class FusedGateDetachMatmul(paddle.autograd.PyLayer):
    method forward (line 107) | def forward(ctx, x, w):
    method backward (line 124) | def backward(ctx, y_grad):
  function gate_detach_matmul (line 145) | def gate_detach_matmul(x, weight, use_fuse):
  class TopKGate (line 164) | class TopKGate(nn.Layer):
    method __init__ (line 169) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->...
    method _create_gate_parameter (line 279) | def _create_gate_parameter(self):
    method get_gate_weight (line 318) | def get_gate_weight(self, transform_weight):
    method forward (line 346) | def forward(
    method get_capacity (line 380) | def get_capacity(self, num_tokens, cap_factor=None):
    method _cal_aux_loss (line 406) | def _cal_aux_loss(
    method _cal_z_loss (line 503) | def _cal_z_loss(self, logits, loss_mask=None):
    method _cal_orthogonal_loss_opt_each_weight (line 523) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group):
    method _cal_orthogonal_loss (line 553) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None):

FILE: paddleformers/nn/moe/utils.py
  function get_hcg (line 32) | def get_hcg():
  function scatter_axis (line 39) | def scatter_axis(input, group=None, axis=0):
  class ReduceScatterGroupOp (line 71) | class ReduceScatterGroupOp(PyLayer):
    method forward (line 77) | def forward(ctx, input, group=None):
    method backward (line 92) | def backward(ctx, grad):
  class AllGatherGroupOp (line 103) | class AllGatherGroupOp(PyLayer):
    method forward (line 109) | def forward(ctx, input, group=None):
    method backward (line 124) | def backward(ctx, grad):
  function get_async_loader (line 135) | def get_async_loader():
  function hack_offload_wait (line 149) | def hack_offload_wait(task):
  function all_gather_group (line 154) | def all_gather_group(input, group=None, axis=0):
  function reduce_scatter_group (line 190) | def reduce_scatter_group(input, group=None):
  class ScatterOp (line 221) | class ScatterOp(PyLayer):
    method forward (line 234) | def forward(ctx, input, axis=0, group=None):
    method backward (line 241) | def backward(ctx, grad):
  function detach_and_requires_grad_ (line 246) | def detach_and_requires_grad_(*args):
  class FakeClone (line 263) | class FakeClone(paddle.autograd.PyLayer):
    method forward (line 269) | def forward(ctx, input):
    method backward (line 287) | def backward(ctx, grad_output):
  function manual_backward (line 300) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]):
  function _parse_moe_group (line 351) | def _parse_moe_group(

FILE: paddleformers/nn/moe_deepep/modular_moe_layer.py
  class ModularMoELayer (line 40) | class ModularMoELayer(nn.Layer):
    method __init__ (line 41) | def __init__(
    method _init_expert_parallel (line 191) | def _init_expert_parallel(self):
    method forward (line 236) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
    method _forward_traditional_moe (line 287) | def _forward_traditional_moe(
    method _forward_with_ep_parallel (line 327) | def _forward_with_ep_parallel(
    method get_auxiliary_loss (line 367) | def get_auxiliary_loss(self) -> paddle.Tensor:
    method get_z_loss (line 370) | def get_z_loss(self) -> paddle.Tensor:
    method get_all_losses (line 373) | def get_all_losses(self) -> Dict[str, paddle.Tensor]:
    method get_total_loss (line 379) | def get_total_loss(self) -> paddle.Tensor:
    method remove_loss_function (line 385) | def remove_loss_function(self, name: str):
    method update_loss_weights (line 395) | def update_loss_weights(self, weights: Dict[str, float]):
    method set_loss_combiner (line 405) | def set_loss_combiner(self, combiner_name: str):
    method get_expert_info (line 415) | def get_expert_info(self) -> Dict[str, Any]:

FILE: paddleformers/nn/moe_deepep/moe_communication.py
  class MoECommunicationInterface (line 26) | class MoECommunicationInterface(ABC):
    method forward (line 28) | def forward(
  class AllToAllMoECommunication (line 70) | class AllToAllMoECommunication(nn.Layer, MoECommunicationInterface):
    method forward (line 75) | def forward(
  class DeepEPMoECommunication (line 226) | class DeepEPMoECommunication(nn.Layer, MoECommunicationInterface):
    method expert_forward (line 231) | def expert_forward(self, dispatched_input, tokens_per_expert, experts,...
    method forward (line 248) | def forward(

FILE: paddleformers/nn/moe_deepep/moe_expert.py
  class MoEExpertInterface (line 23) | class MoEExpertInterface(ABC):
    method forward (line 25) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
  class StandardMLPExpert (line 36) | class StandardMLPExpert(MLP):
    method __init__ (line 37) | def __init__(

FILE: paddleformers/nn/moe_deepep/moe_factory.py
  class QuickAccessMoEFactory (line 19) | class QuickAccessMoEFactory:
    method create_from_model_name (line 21) | def create_from_model_name(

FILE: paddleformers/nn/moe_deepep/moe_gate.py
  class MoEGateMixin (line 30) | class MoEGateMixin:
    method gate_score_func (line 31) | def gate_score_func(self, logits: paddle.Tensor) -> paddle.Tensor:
    method gumbel_rsample (line 54) | def gumbel_rsample(self, logits: paddle.Tensor) -> paddle.Tensor:
    method uniform_sample (line 58) | def uniform_sample(self, logits: paddle.Tensor) -> paddle.Tensor:
    method _one_hot_to_float (line 63) | def _one_hot_to_float(self, x, num_classes):
    method _one_hot_to_int64 (line 69) | def _one_hot_to_int64(self, x, num_classes):
    method _capacity (line 75) | def _capacity(
    method _cal_aux_loss (line 99) | def _cal_aux_loss(self, gates, mask):
    method _cal_seq_aux_loss (line 126) | def _cal_seq_aux_loss(self, probs, top_k, routing_map, max_seq_len):
    method _cal_z_loss (line 160) | def _cal_z_loss(self, logits) -> paddle.Tensor:
    method _cal_orthogonal_loss (line 173) | def _cal_orthogonal_loss(self) -> paddle.Tensor:
    method _priority (line 183) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle....
    method _probs_drop_policy (line 205) | def _probs_drop_policy(
    method _topk_greedy (line 275) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle....
    method _topk_group_limited_greedy (line 291) | def _topk_group_limited_greedy(
    method _topk_noaux_tc (line 323) | def _topk_noaux_tc(
  class StandardMoEGate (line 365) | class StandardMoEGate(nn.Layer, MoEGateMixin):
    method __init__ (line 366) | def __init__(
    method forward (line 442) | def forward(
    method topkgating (line 453) | def topkgating(

FILE: paddleformers/nn/moe_deepep/moe_loss.py
  class LossType (line 27) | class LossType(Enum):
  class LossConfig (line 37) | class LossConfig:
    method __post_init__ (line 45) | def __post_init__(self):
  class LossFunction (line 50) | class LossFunction(Protocol):
    method __call__ (line 51) | def __call__(
  class AddAuxiliaryLoss (line 61) | class AddAuxiliaryLoss(paddle.autograd.PyLayer):
    method forward (line 68) | def forward(ctx, x, loss):
    method backward (line 75) | def backward(ctx, grad_output):
  class LossCombiner (line 82) | class LossCombiner(Protocol):
    method __call__ (line 83) | def __call__(self, losses: Dict[str, paddle.Tensor], configs: Dict[str...
  class LossRegistry (line 87) | class LossRegistry:
    method __init__ (line 88) | def __init__(self):
    method _register_default_losses (line 94) | def _register_default_losses(self):
    method _register_default_combiners (line 101) | def _register_default_combiners(self):
    method register_loss (line 106) | def register_loss(self, name: str, loss_func: LossFunction):
    method register_combiner (line 110) | def register_combiner(self, name: str, combiner: LossCombiner):
    method get_loss (line 114) | def get_loss(self, name: str) -> Optional[LossFunction]:
    method get_combiner (line 117) | def get_combiner(self, name: str) -> Optional[LossCombiner]:
    method list_losses (line 120) | def list_losses(self) -> List[str]:
    method list_combiners (line 123) | def list_combiners(self) -> List[str]:
    method _auxiliary_loss (line 126) | def _auxiliary_loss(
    method _z_loss (line 145) | def _z_loss(
    method _entropy_loss (line 156) | def _entropy_loss(
    method _sparsity_loss (line 166) | def _sparsity_loss(
    method _diversity_loss (line 184) | def _diversity_loss(
    method _weighted_sum_combiner (line 207) | def _weighted_sum_combiner(
    method _adaptive_sum_combiner (line 217) | def _adaptive_sum_combiner(
    method _geometric_mean_combiner (line 239) | def _geometric_mean_combiner(

FILE: paddleformers/nn/moe_deepep/moe_loss_instance.py
  function get_global_loss_registry (line 22) | def get_global_loss_registry():
  function custom_diversity_loss (line 32) | def custom_diversity_loss(
  function custom_weighted_sum_combiner (line 56) | def custom_weighted_sum_combiner(

FILE: paddleformers/nn/norm.py
  class LayerNorm (line 29) | class LayerNorm(nn.LayerNorm):
    method __init__ (line 30) | def __init__(
    method enable_sequence_parallel (line 47) | def enable_sequence_parallel(self):
  class RMSNorm (line 54) | class RMSNorm(nn.Layer):
    method __init__ (line 55) | def __init__(self, config: PretrainedConfig, hidden_size=None, norm_ep...
    method forward (line 69) | def forward(self, hidden_states):
    method enable_sequence_parallel (line 82) | def enable_sequence_parallel(self):
  class Norm (line 86) | class Norm(GeneralInterface):
    method create (line 90) | def create(

FILE: paddleformers/nn/pp_model.py
  function parse_args (line 40) | def parse_args(args, mtp_enable=False, is_embed=False):
  function get_pp_vp_split_layers (line 113) | def get_pp_vp_split_layers(config, skip_recompute_num=-1):
  function get_attr (line 179) | def get_attr(layer, name):
  class RotaryEmbedding (line 187) | class RotaryEmbedding(nn.Layer):
    method __init__ (line 188) | def __init__(self, config):
    method forward (line 194) | def forward(self, x, position_ids):
  class EmbeddingPipe (line 219) | class EmbeddingPipe(nn.Layer):
    method __init__ (line 220) | def __init__(self, config, embed_cls=None, rotary_emb_cls=None):
    method embedding_weight (line 239) | def embedding_weight(self):
    method forward (line 247) | def forward(self, args):
  class RMSNormPipe (line 349) | class RMSNormPipe(RMSNorm):
    method __init__ (line 350) | def __init__(self, *args, **kwargs):
    method forward (line 355) | def forward(self, args):
  class LayerNormPipe (line 361) | class LayerNormPipe(LayerNorm):
    method __init__ (line 362) | def __init__(self, *args, **kwargs):
    method forward (line 367) | def forward(self, args):
  class EmptyLayer (line 373) | class EmptyLayer(nn.Layer):
    method __init__ (line 378) | def __init__(self):
    method forward (line 381) | def forward(self, x):
  class LMHeadPipe (line 385) | class LMHeadPipe(LMHead):
    method forward (line 390) | def forward(self, args):
    method embedding_weight (line 408) | def embedding_weight(self):
  function make_decoder_layer_pipe (line 413) | def make_decoder_layer_pipe(decoder_layer):
  class CriterionLayerPipe (line 497) | class CriterionLayerPipe(CriterionLayer):
    method __init__ (line 498) | def __init__(self, *args, **kwargs):
    method forward (line 502) | def forward(self, logits, labels, mtp_logits=None):
  class GeneralModelForCausalLMPipe (line 509) | class GeneralModelForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
    method __init__ (line 528) | def __init__(self, config: PretrainedConfig, **kwargs):
    method get_loss_fn (line 677) | def get_loss_fn(self, config):
    method register_cls_attr (line 686) | def register_cls_attr(cls, config_class=None, pretrained_model_class=N...
    method _prepare_pipeline_inputs_func (line 703) | def _prepare_pipeline_inputs_func(cls, inputs):

FILE: paddleformers/peft/lora/auto_lora_model.py
  class LoRAAutoLinear (line 45) | class LoRAAutoLinear(LoRALinear):
    method __init__ (line 46) | def __init__(
    method process_intermediate_api (line 76) | def process_intermediate_api(self):
    method process_base_api (line 83) | def process_base_api(self):
    method auto_dist_config (line 98) | def auto_dist_config(self, prefix=""):
  class LoRAAutoModel (line 120) | class LoRAAutoModel(nn.Layer):
    method __init__ (line 126) | def __init__(self, model, lora_config: LoRAAutoConfig) -> None:
    method from_pretrained (line 150) | def from_pretrained(cls, model, lora_path, **kwargs):
    method set_state_dict (line 227) | def set_state_dict(self, state_dict):
    method _get_tensor_parallel_convert_actions (line 271) | def _get_tensor_parallel_convert_actions(self, loaded_keys, is_split=T...
    method _convert_tensor_parallel (line 284) | def _convert_tensor_parallel(self, lora_state_dict):
    method save_pretrained (line 295) | def save_pretrained(self, save_directory: str, merge_tensor_parallel: ...
    method _find_and_replace_module (line 350) | def _find_and_replace_module(self, model, module_name, lora_config, en...
    method _find_and_restore_module (line 391) | def _find_and_restore_module(self, module_name):
    method get_trainable_state_dict (line 404) | def get_trainable_state_dict(self, concat_init_lora=False):
    method print_trainable_parameters (line 419) | def print_trainable_parameters(self) -> None:
    method mark_only_lora_as_trainable (line 431) | def mark_only_lora_as_trainable(self) -> None:
    method get_lora_model (line 454) | def get_lora_model(self, model: Union[PretrainedModel, nn.Layer], lora...
    method merge_auto_dist_configs (line 526) | def merge_auto_dist_configs(self, configs):
    method _generate_auto_dist_config (line 604) | def _generate_auto_dist_config(self, auto_dist_degree):
    method restore_original_model (line 676) | def restore_original_model(self):
    method __getattr__ (line 684) | def __getattr__(self, name: str):
    method train (line 691) | def train(self):
    method eval (line 698) | def eval(self):
    method save_to_aistudio (line 705) | def save_to_aistudio(
    method disable_lora (line 760) | def disable_lora(self):
    method enable_lora (line 765) | def enable_lora(self):
    method merge (line 770) | def merge(self):
    method unmerge (line 775) | def unmerge(self):
    method get_model_config (line 780) | def get_model_config(

FILE: paddleformers/peft/lora/lora_config.py
  class LoRAConfig (line 26) | class LoRAConfig:
    method __post_init__ (line 84) | def __post_init__(self):
    method scaling (line 92) | def scaling(self):
    method __dict__ (line 99) | def __dict__(self):
    method to_dict (line 102) | def to_dict(self):
    method save_pretrained (line 105) | def save_pretrained(self, save_directory):
    method from_pretrained (line 126) | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    method from_json_file (line 152) | def from_json_file(cls, path_json_file):
  class LoRAAutoConfig (line 166) | class LoRAAutoConfig(LoRAConfig):

FILE: paddleformers/peft/lora/lora_layers.py
  class LoRALinear (line 50) | class LoRALinear(nn.Linear):
    method __init__ (line 52) | def __init__(
    method rope_init (line 107) | def rope_init(self):
    method get_delta_weight (line 116) | def get_delta_weight(self, lora_A=None, lora_B=None):
    method merge (line 124) | def merge(self):
    method unmerge (line 131) | def unmerge(self):
    method forward (line 138) | def forward(self, input: paddle.Tensor, *args, **kwargs):
    method extra_repr (line 146) | def extra_repr(self):
  class FleetLoRALinear (line 151) | class FleetLoRALinear(LoRALinear):
    method __init__ (line 152) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
    method forward (line 156) | def forward(self, input: paddle.Tensor):
  class RowParallelLoRALinear (line 164) | class RowParallelLoRALinear(RowParallelLinear):
    method __init__ (line 165) | def __init__(
    method sharded_state_dict (line 225) | def sharded_state_dict(
    method get_delta_weight (line 232) | def get_delta_weight(self, lora_A=None, lora_B=None):
    method unmerge (line 239) | def unmerge(self):
    method merge (line 246) | def merge(self):
    method forward (line 253) | def forward(self, x: paddle.Tensor):
    method extra_repr (line 299) | def extra_repr(self):
  class FleetRowParallelLoRALinear (line 304) | class FleetRowParallelLoRALinear(RowParallelLoRALinear):
    method __init__ (line 305) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
    method forward (line 309) | def forward(self, input: paddle.Tensor):
  class RowSequenceParallelLoRALinear (line 317) | class RowSequenceParallelLoRALinear(RowSequenceParallelLinear):
    method __init__ (line 318) | def __init__(
    method sharded_state_dict (line 378) | def sharded_state_dict(
    method get_delta_weight (line 385) | def get_delta_weight(self, lora_A=None, lora_B=None):
    method unmerge (line 392) | def unmerge(self):
    method merge (line 399) | def merge(self):
    method forward (line 406) | def forward(self, x: paddle.Tensor):
    method extra_repr (line 432) | def extra_repr(self):
  class FleetRowSequenceParallelLoRALinear (line 437) | class FleetRowSequenceParallelLoRALinear(RowSequenceParallelLoRALinear):
    method __init__ (line 438) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
    method forward (line 442) | def forward(self, input: paddle.Tensor):
  class ColumnParallelLoRALinear (line 450) | class ColumnParallelLoRALinear(ColumnParallelLinear):
    method __init__ (line 451) | def __init__(
    method sharded_state_dict (line 510) | def sharded_state_dict(
    method get_delta_weight (line 517) | def get_delta_weight(self, lora_A=None, lora_B=None):
    method unmerge (line 524) | def unmerge(self):
    method merge (line 532) | def merge(self):
    method forward (line 540) | def forward(self, input: paddle.Tensor):
    method extra_repr (line 571) | def extra_repr(self):
  class FleetColumnParallelLoRALinear (line 576) | class FleetColumnParallelLoRALinear(ColumnParallelLoRALinear):
    method __init__ (line 577) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
    method forward (line 581) | def forward(self, input: paddle.Tensor):
  class ColumnSequenceParallelLoRALinear (line 589) | class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear):
    method __init__ (line 590) | def __init__(
    method sharded_state_dict (line 650) | def sharded_state_dict(
    method get_delta_weight (line 657) | def get_delta_weight(self, lora_A=None, lora_B=None):
    method unmerge (line 664) | def unmerge(self):
    method merge (line 671) | def merge(self):
    method forward (line 678) | def forward(self, x: paddle.Tensor):
    method extra_repr (line 707) | def extra_repr(self):
  class FleetColumnSequenceParallelLoRALinear (line 712) | class FleetColumnSequenceParallelLoRALinear(ColumnSequenceParallelLoRALi...
    method __init__ (line 713) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
    method forward (line 717) | def forward(self, input: paddle.Tensor):
  class LoRAConv2D (line 725) | class LoRAConv2D(nn.Conv2D):
    method __init__ (line 727) | def __init__(
    method get_delta_weight (line 780) | def get_delta_weight(self, lora_A=None, lora_B=None):
    method unmerge (line 801) | def unmerge(self):
    method merge (line 809) | def merge(self):
    method forward (line 817) | def forward(self, input: paddle.Tensor, *args, **kwargs):
    method extra_repr (line 828) | def extra_repr(self):

FILE: paddleformers/peft/lora/lora_model.py
  function get_tensor_model_parallel_group (line 53) | def get_tensor_model_parallel_group():
  function get_tensor_model_parallel_world_size (line 56) | def get_tensor_model_parallel_world_size():
  class PaddleFleetPipelineLayer (line 59) | class PaddleFleetPipelineLayer:
  class FleetColumnParallelLinear (line 62) | class FleetColumnParallelLinear:
  class FleetRowParallelLinear (line 65) | class FleetRowParallelLinear:
  function get_lora_layers (line 93) | def get_lora_layers():
  class FleetColumnParallelQuantizationLinear (line 182) | class FleetColumnParallelQuantizationLinear:
  class FleetQuantizationLinear (line 185) | class FleetQuantizationLinear:
  class FleetRowParallelQuantizationLinear (line 188) | class FleetRowParallelQuantizationLinear:
  class FleetColumnParallelQuantizationLoRALinear (line 191) | class FleetColumnParallelQuantizationLoRALinear:
  class FleetQuantizationLoRALinear (line 194) | class FleetQuantizationLoRALinear:
  class FleetRowParallelQuantizationLoRALinear (line 197) | class FleetRowParallelQuantizationLoRALinear:
  class LoRAModel (line 214) | class LoRAModel(nn.Layer):
    method __init__ (line 224) | def __init__(self, model, lora_config: LoRAConfig) -> None:
    method add_lora_split_mapping (line 260) | def add_lora_split_mapping(self, module_name, is_column=False):
    method _get_tensor_parallel_mappings (line 263) | def _get_tensor_parallel_mappings(self, config, is_split=True):
    method from_pretrained (line 352) | def from_pretrained(cls, model, lora_path, **kwargs):
    method set_state_dict (line 459) | def set_state_dict(self, state_dict):
    method _merge_trainable_tensor_parallel (line 511) | def _merge_trainable_tensor_parallel(self, trainable_state_dict, offlo...
    method _get_tensor_parallel_convert_actions (line 549) | def _get_tensor_parallel_convert_actions(self, loaded_keys, is_split=T...
    method _convert_tensor_parallel (line 562) | def _convert_tensor_parallel(self, lora_state_dict):
    method sharded_state_dict (line 573) | def sharded_state_dict(self, *args, **kwargs):
    method save_pretrained (line 580) | def save_pretrained(self, save_directory: str, merge_tensor_parallel: ...
    method _find_and_replace_module (line 701) | def _find_and_replace_module(self, model, module_name, lora_config):
    method _find_and_restore_module (line 959) | def _find_and_restore_module(self, module_name):
    method get_trainable_state_dict (line 972) | def get_trainable_state_dict(self, concat_init_lora=False):
    method print_trainable_parameters (line 987) | def print_trainable_parameters(self) -> None:
    method mark_only_lora_as_trainable (line 999) | def mark_only_lora_as_trainable(self) -> None:
    method get_lora_model (line 1052) | def get_lora_model(self, model: Union[PretrainedModel, nn.Layer], lora...
    method restore_original_model (line 1066) | def restore_original_model(self):
    method __getattr__ (line 1091) | def __getattr__(self, name: str):
    method train (line 1098) | def train(self):
    method eval (line 1105) | def eval(self):
    method save_to_aistudio (line 1112) | def save_to_aistudio(
    method disable_lora (line 1167) | def disable_lora(self):
    method enable_lora (line 1172) | def enable_lora(self):
    method merge (line 1177) | def merge(self):
    method unmerge (line 1182) | def unmerge(self):
    method get_merge_state_dict (line 1187) | def get_merge_state_dict(self, offload: bool = True):

FILE: paddleformers/peft/lora/lora_quant_layers.py
  class QuantedLoRALinear (line 22) | class QuantedLoRALinear(ConvertibleQuantedLayer):
    method __init__ (line 32) | def __init__(self, layer: nn.Layer, q_config):
    method forward (line 57) | def forward(self, input):
    method _linear_forward (line 69) | def _linear_forward(self, input, weight):
    method unmerge (line 74) | def unmerge(self):
    method merge (line 81) | def merge(self):
    method weights_to_quanters (line 88) | def weights_to_quanters(self):
    method activation_quanters (line 91) | def activation_quanters(self):
  class ColumnParallelQuantedLoRALinear (line 95) | class ColumnParallelQuantedLoRALinear(ConvertibleQuantedLayer):
    method __init__ (line 105) | def __init__(self, layer: nn.Layer, q_config):
    method forward (line 133) | def forward(self, input):
    method _linear_forward (line 147) | def _linear_forward(self, input, weight):
    method unmerge (line 161) | def unmerge(self):
    method merge (line 168) | def merge(self):
    method weights_to_quanters (line 175) | def weights_to_quanters(self):
    method activation_quanters (line 178) | def activation_quanters(self):
  class RowParallelQuantedLoRALinear (line 182) | class RowParallelQuantedLoRALinear(ConvertibleQuantedLayer):
    method __init__ (line 192) | def __init__(self, layer: nn.Layer, q_config):
    method forward (line 220) | def forward(self, input):
    method _linear_forward (line 235) | def _linear_forward(self, input, weight):
    method unmerge (line 254) | def unmerge(self):
    method merge (line 261) | def merge(self):
    method weights_to_quanters (line 268) | def weights_to_quanters(self):
    method activation_quanters (line 271) | def activation_quanters(self):

FILE: paddleformers/peft/lora/lora_quantization_layers.py
  class QuantizationLoRABaseLinear (line 34) | class QuantizationLoRABaseLinear(nn.Layer):
    method __init__ (line 35) | def __init__(self, layer, lora_config):
    method forward (line 69) | def forward(self, x, add_bias=True):
    method merge (line 85) | def merge(self):
    method unmerge (line 88) | def unmerge(self):
  class QuantizationLoRALinear (line 92) | class QuantizationLoRALinear(QuantizationLoRABaseLinear):
    method __init__ (line 101) | def __init__(self, layer, lora_config):
    method forward (line 123) | def forward(self, x):
  class FleetQuantizationLoRALinear (line 130) | class FleetQuantizationLoRALinear(QuantizationLoRALinear):
    method __init__ (line 131) | def __init__(self, layer, skip_bias_add, lora_config):
    method forward (line 135) | def forward(self, input: paddle.Tensor):
  class ColumnParallelQuantizationLoRALinear (line 143) | class ColumnParallelQuantizationLoRALinear(QuantizationLoRABaseLinear):
    method __init__ (line 152) | def __init__(self, layer, lora_config):
    method forward (line 183) | def forward(self, x):
    method sharded_state_dict (line 219) | def sharded_state_dict(
  class FleetColumnParallelQuantizationLoRALinear (line 227) | class FleetColumnParallelQuantizationLoRALinear(ColumnParallelQuantizati...
    method __init__ (line 228) | def __init__(self, layer, skip_bias_add, lora_config):
    method forward (line 232) | def forward(self, input: paddle.Tensor):
  class RowParallelQuantizationLoRALinear (line 240) | class RowParallelQuantizationLoRALinear(QuantizationLoRABaseLinear):
    method __init__ (line 249) | def __init__(self, layer, lora_config):
    method forward (line 284) | def forward(self, x):
    method sharded_state_dict (line 321) | def sharded_state_dict(
  class FleetRowParallelQuantizationLoRALinear (line 329) | class FleetRowParallelQuantizationLoRALinear(RowParallelQuantizationLoRA...
    method __init__ (line 330) | def __init__(self, layer, skip_bias_add, lora_config):
    method forward (line 334) | def forward(self, input: paddle.Tensor):

FILE: paddleformers/peft/lora/loraga_utils.py
  class LoRAGATrainer (line 36) | class LoRAGATrainer(Trainer):
    method __init__ (line 39) | def __init__(self, loraga_init_iters: int, gradient_offload: bool, **k...
    method estimate_gradient (line 53) | def estimate_gradient(self, model: PretrainedModel):
    method _wrap_model (line 86) | def _wrap_model(self, model):
  function get_module_gradient (line 137) | def get_module_gradient(
  function loraga_svd_reinit (line 205) | def loraga_svd_reinit(
  function loraga_svd_module (line 268) | def loraga_svd_module(
  function set_hook_enable (line 321) | def set_hook_enable(value=False):
  function get_hook_enable (line 326) | def get_hook_enable():
  class GradientOffloadHookContext (line 331) | class GradientOffloadHookContext:
    method __init__ (line 334) | def __init__(
    method __enter__ (line 350) | def __enter__(self):
    method __exit__ (line 355) | def __exit__(self, exc_type, exc_val, exc_tb):
    method register_gradient_hook (line 358) | def register_gradient_hook(self):
    method get_record_gradient_hook (line 365) | def get_record_gradient_hook(self, model, gradient_dict, grad_name, pa...

FILE: paddleformers/peft/lora/utils.py
  function rng_ctx (line 20) | def rng_ctx(is_mp: bool, in_dynamic_mode: bool):

FILE: paddleformers/quantization/checkpoint_quantization_utils.py
  function cal_ratio (line 20) | def cal_ratio(m, v, eps=1e-8):
  function group_wise_quant_dequant (line 34) | def group_wise_quant_dequant(
  function merge_int4 (line 164) | def merge_int4(x, y):
  function split_int8 (line 179) | def split_int8(final):
  function cal_abs_min_max_channel (line 197) | def cal_abs_min_max_channel(inputs, quant_axis=1):
  function asymmetry_qdq_weight (line 219) | def asymmetry_qdq_weight(
  function cal_abs_max_channel (line 284) | def cal_abs_max_channel(inputs, quant_axis=1):
  function qdq_weight (line 305) | def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False...

FILE: paddleformers/quantization/hadamard_utils.py
  function matmul_hadU (line 20) | def matmul_hadU(X):
  function create_hadamard_matrix (line 36) | def create_hadamard_matrix(block_size, dtype):
  function hadamard_matmul (line 42) | def hadamard_matmul(input, side, hadamard_matrix, block_size):
  function apply_hadamard_matmul (line 59) | def apply_hadamard_matmul(x, side, block_size):

FILE: paddleformers/quantization/qat_utils.py
  function quantize (line 47) | def quantize(
  function dequantize (line 120) | def dequantize(
  function int8_forward (line 138) | def int8_forward(
  function int8_backward (line 169) | def int8_backward(ctx, x, grad_output, quant_weight, weight_scale, quant...
  function fp8_forward (line 197) | def fp8_forward(
  function fp8_backward (line 236) | def fp8_backward(ctx, x, grad_output, quant_weight, weight_scale, quant_...
  class QATFunc (line 358) | class QATFunc(PyLayer):
    method forward (line 360) | def forward(
    method backward (line 420) | def backward(ctx, grad_output):

FILE: paddleformers/quantization/qlora.py
  function qlora_weight_quantize (line 20) | def qlora_weight_quantize(
  function qlora_weight_dequantize (line 56) | def qlora_weight_dequantize(
  function qlora_weight_quantize_dequantize (line 71) | def qlora_weight_quantize_dequantize(
  function qlora_weight_linear (line 98) | def qlora_weight_linear(

FILE: paddleformers/quantization/quantization_config.py
  class QuantizationConfig (line 29) | class QuantizationConfig:
    method __init__ (line 48) | def __init__(
    method fp8_format (line 168) | def fp8_format(self):
    method is_weight_quantize (line 171) | def is_weight_quantize(self):
    method is_support_merge_tensor_parallel (line 189) | def is_support_merge_tensor_parallel(self):
    method from_dict (line 196) | def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs):
    method to_json_file (line 215) | def to_json_file(self, json_file_path):
    method to_dict (line 222) | def to_dict(self):
    method __repr__ (line 225) | def __repr__(self):
    method to_json_string (line 228) | def to_json_string(self, use_diff=True):
    method to_diff_dict (line 235) | def to_diff_dict(self):

FILE: paddleformers/quantization/quantization_linear.py
  function quant_weight_forward (line 52) | def quant_weight_forward(
  function dequant_weight (line 90) | def dequant_weight(
  class QuantizationLinearFunc (line 123) | class QuantizationLinearFunc(PyLayer):
    method forward (line 125) | def forward(
    method backward (line 161) | def backward(ctx, grad_output):
  function quant_weight_linear (line 203) | def quant_weight_linear(
  function get_activation_scale_group (line 246) | def get_activation_scale_group(is_row=False):
  class QuantizationLinear (line 262) | class QuantizationLinear(nn.Layer):
    method __init__ (line 265) | def __init__(
    method forward (line 387) | def forward(self, x):
  class FleetQuantizationLinear (line 409) | class FleetQuantizationLinear(QuantizationLinear):
    method __init__ (line 410) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
    method forward (line 414) | def forward(self, input: paddle.Tensor):
  class ColumnParallelQuantizationLinear (line 422) | class ColumnParallelQuantizationLinear(nn.Layer):
    method __init__ (line 432) | def __init__(
    method forward (line 600) | def forward(self, x):
    method sharded_state_dict (line 638) | def sharded_state_dict(
  class FleetColumnParallelQuantizationLinear (line 646) | class FleetColumnParallelQuantizationLinear(ColumnParallelQuantizationLi...
    method __init__ (line 647) | def __init__(self, in_features, output_size_per_partition, skip_bias_a...
    method forward (line 651) | def forward(self, input: paddle.Tensor):
  class RowParallelQuantizationLinear (line 659) | class RowParallelQuantizationLinear(nn.Layer):
    method __init__ (line 669) | def __init__(
    method forward (line 838) | def forward(self, x):
    method sharded_state_dict (line 896) | def sharded_state_dict(
  class FleetRowParallelQuantizationLinear (line 904) | class FleetRowParallelQuantizationLinear(RowParallelQuantizationLinear):
    method __init__ (line 905) | def __init__(self, input_size_per_partition, out_features, skip_bias_a...
    method forward (line 909) | def forward(self, input: paddle.Tensor):

FILE: paddleformers/quantization/quantization_utils.py
  function get_tensor_model_parallel_group (line 65) | def get_tensor_model_parallel_group():
  function get_tensor_model_parallel_world_size (line 68) | def get_tensor_model_parallel_world_size():
  class PaddleFleetPipelineLayer (line 71) | class PaddleFleetPipelineLayer:
  class FleetColumnParallelLinear (line 74) | class FleetColumnParallelLinear:
  class FleetRowParallelLinear (line 77) | class FleetRowParallelLinear:
  class FleetColumnParallelQuantizationLinear (line 80) | class FleetColumnParallelQuantizationLinear:
  class FleetQuantizationLinear (line 83) | class FleetQuantizationLinear:
  class FleetRowParallelQuantizationLinear (line 86) | class FleetRowParallelQuantizationLinear:
  function parse_weight_quantize_algo (line 102) | def parse_weight_quantize_algo(quantization_config, name):
  function replace_with_quantization_linear (line 117) | def replace_with_quantization_linear(model, quantization_config, llm_int...
  function convert_to_weight_quantize_state_dict (line 263) | def convert_to_weight_quantize_state_dict(state_dict, name, quantization...
  function convert_to_qlora_state_dict (line 300) | def convert_to_qlora_state_dict(state_dict, name, quantization_config, d...
  function convert_to_quantize_state_dict (line 339) | def convert_to_quantize_state_dict(state_dict, quantization_linear_list,...
  function convert_to_weight_quantize_dequantize_state_dict (line 364) | def convert_to_weight_quantize_dequantize_state_dict(state_dict, name, q...
  function convert_to_qlora_dequantize_state_dict (line 402) | def convert_to_qlora_dequantize_state_dict(state_dict, name, quantizatio...
  function convert_to_quantize_dequantize_state_dict (line 461) | def convert_to_quantize_dequantize_state_dict(state_dict, quantization_l...
  function update_loaded_state_dict_keys (line 488) | def update_loaded_state_dict_keys(state_dict, quantization_linear_list, ...

FILE: paddleformers/quantization/unified_checkpoint_quantization.py
  function dequant_unified_optimizer (line 37) | def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict, ...
  function quant_unified_optimizer (line 152) | def quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stag...

FILE: paddleformers/trainer/argparser.py
  function strtobool (line 54) | def strtobool(v):
  class PdArgumentParser (line 67) | class PdArgumentParser(ArgumentParser):
    method __init__ (line 78) | def __init__(self, dataclass_types: Union[DataClassType, Iterable[Data...
    method _parse_dataclass_field (line 97) | def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses....
    method _add_dataclass_arguments (line 177) | def _add_dataclass_arguments(self, dtype: DataClassType):
    method parse_args_into_dataclasses (line 197) | def parse_args_into_dataclasses(
    method common_parse (line 239) | def common_parse(self, args, return_remaining_strings) -> Tuple[DataCl...
    method read_json (line 260) | def read_json(self, json_file: str) -> list:
    method read_yaml (line 277) | def read_yaml(self, yaml_file: str) -> list:
    method parse_json_file (line 296) | def parse_json_file(self, json_file: str, return_remaining_strings=Fal...
    method parse_json_file_and_cmd_lines (line 304) | def parse_json_file_and_cmd_lines(self, return_remaining_strings=False...
    method parse_yaml_file_and_cmd_lines (line 324) | def parse_yaml_file_and_cmd_lines(self, return_remaining_strings=False...
    method read_python (line 344) | def read_python(self, python_file: str) -> list:
    method parse_python_file_and_cmd_lines (line 382) | def parse_python_file_and_cmd_lines(self, return_remaining_strings=Fal...
    method parse_dict (line 402) | def parse_dict(self, args: dict) -> Tuple[DataClass, ...]:

FILE: paddleformers/trainer/integrations.py
  function is_visualdl_available (line 31) | def is_visualdl_available():
  function is_tensorboardX_available (line 35) | def is_tensorboardX_available():
  function is_wandb_available (line 39) | def is_wandb_available():
  function is_swanlab_available (line 45) | def is_swanlab_available():
  function is_ray_available (line 49) | def is_ray_available():
  function get_available_reporting_integrations (line 53) | def get_available_reporting_integrations():
  function rewrite_logs (line 67) | def rewrite_logs(d):
  class VisualDLCallback (line 83) | class VisualDLCallback(TrainerCallback):
    method __init__ (line 91) | def __init__(self, vdl_writer=None):
    method _init_summary_writer (line 106) | def _init_summary_writer(self, args, log_dir=None):
    method on_train_begin (line 111) | def on_train_begin(self, args, state, control, **kwargs):
    method on_log (line 140) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_train_end (line 161) | def on_train_end(self, args, state, control, **kwargs):
  class TensorBoardCallback (line 167) | class TensorBoardCallback(TrainerCallback):
    method __init__ (line 176) | def __init__(self, tb_writer=None):
    method _init_summary_writer (line 192) | def _init_summary_writer(self, args, log_dir=None):
    method on_train_begin (line 197) | def on_train_begin(self, args, state, control, **kwargs):
    method on_log (line 217) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_train_end (line 238) | def on_train_end(self, args, state, control, **kwargs):
  class WandbCallback (line 244) | class WandbCallback(TrainerCallback):
    method __init__ (line 249) | def __init__(self):
    method setup (line 261) | def setup(self, args, state, model, **kwargs):
    method on_train_begin (line 335) | def on_train_begin(self, args, state, control, model=None, **kwargs):
    method on_train_end (line 341) | def on_train_end(self, args, state, control, model=None, tokenizer=Non...
    method on_log (line 377) | def on_log(self, args, state, control, model=None, logs=None, **kwargs):
    method on_save (line 386) | def on_save(self, args, state, control, **kwargs):
  class SwanLabCallback (line 406) | class SwanLabCallback(TrainerCallback):
    method __init__ (line 411) | def __init__(self):
    method setup (line 420) | def setup(self, args, state, model, **kwargs):
    method on_train_begin (line 491) | def on_train_begin(self, args, state, control, model=None, **kwargs):
    method on_train_end (line 495) | def on_train_end(self, args, state, control, model=None, processing_cl...
    method on_log (line 502) | def on_log(self, args, state, control, model=None, logs=None, **kwargs):
    method on_save (line 521) | def on_save(self, args, state, control, **kwargs):
    method on_predict (line 528) | def on_predict(self, args, state, control, metrics, **kwargs):
  class AutoNLPCallback (line 536) | class AutoNLPCallback(TrainerCallback):
    method __init__ (line 541) | def __init__(self):
    method on_evaluate (line 550) | def on_evaluate(self, args, state, control, **kwargs):
  function get_reporting_integration_callbacks (line 568) | def get_reporting_integration_callbacks(report_to):

FILE: paddleformers/trainer/plugins/npu_plugin.py
  function npu_accelerate_plugin (line 25) | def npu_accelerate_plugin(optimizer):
  function _optimizer_step_with_flatten_param_grads (line 36) | def _optimizer_step_with_flatten_param_grads(optimizer):
  function _flatten_param_grads (line 61) | def _flatten_param_grads(optimizer, params_grads):

FILE: paddleformers/trainer/plugins/timer.py
  class _Timer (line 24) | class _Timer:
    method __init__ (line 27) | def __init__(self, name):
    method start (line 33) | def start(self):
    method stop (line 41) | def stop(self):
    method reset (line 49) | def reset(self):
    method elapsed (line 54) | def elapsed(self, reset=True):
  class RuntimeTimer (line 75) | class RuntimeTimer:
    method __init__ (line 78) | def __init__(self, name):
    method start (line 81) | def start(self, name):
    method stop (line 86) | def stop(self):
    method log (line 90) | def log(self):
  class Timers (line 101) | class Timers:
    method __init__ (line 104) | def __init__(self):
    method __call__ (line 107) | def __call__(self, name, use_event=False):
    method write (line 117) | def write(self, names, writer, iteration, normalizer=1.0, reset=True):
    method log (line 124) | def log(self, names, normalizer=1.0, reset=True):
    method info (line 145) | def info(self, names, normalizer=1.0, reset=False):
  function get_timers (line 158) | def get_timers():
  function set_timers (line 163) | def set_timers():
  function disable_timers (line 169) | def disable_timers():

FILE: paddleformers/trainer/trainer.py
  class Trainer (line 269) | class Trainer:
    method __init__ (line 328) | def __init__(
    method _wrap_amp_model (line 610) | def _wrap_amp_model(self, args, model):
    method add_callback (line 662) | def add_callback(self, callback):
    method pop_callback (line 673) | def pop_callback(self, callback):
    method remove_callback (line 686) | def remove_callback(self, callback):
    method _load_from_peft_checkpoint (line 696) | def _load_from_peft_checkpoint(self, resume_from_checkpoint=None):
    method _load_from_checkpoint (line 734) | def _load_from_checkpoint(self, resume_from_checkpoint=None):
    method _wrap_model_and_load_sharded_checkpoint (line 910) | def _wrap_model_and_load_sharded_checkpoint(self, resume_from_checkpoi...
    method _get_zcc_implementation_classes (line 926) | def _get_zcc_implementation_classes(self):
    method _create_zcc_manager_instance (line 932) | def _create_zcc_manager_instance(self, unwrapped_model, zcc_worker_cla...
    method _register_pipeline_hooks (line 952) | def _register_pipeline_hooks(self, unwrapped_model):
    method _setup_zcc_callback (line 966) | def _setup_zcc_callback(self, zcc_callback_class):
    method _handle_checkpoint_resume (line 971) | def _handle_checkpoint_resume(self, resume_from_checkpoint):
    method _get_ema_state_path (line 989) | def _get_ema_state_path(self, checkpoint_path):
    method _should_load_ema_state (line 997) | def _should_load_ema_state(self, checkpoint_path, ema_state_path):
    method create_zcc_manager (line 1011) | def create_zcc_manager(self, unwrapped_model, resume_from_checkpoint=N...
    method add_non_zcc_ema_callback (line 1038) | def add_non_zcc_ema_callback(self, resume_from_checkpoint, ema_state_a...
    method _save_flex_model_state (line 1052) | def _save_flex_model_state(self, output_dir):
    method _save_flex_optimizer_state (line 1066) | def _save_flex_optimizer_state(self, output_dir):
    method _load_flex_checkpoint (line 1095) | def _load_flex_checkpoint(self, resume_from_checkpoint):
    method prepare_resume_from_checkpoint (line 1329) | def prepare_resume_from_checkpoint(self, args, resume_from_checkpoint):
    method cal_epoch_step_samples (line 1363) | def cal_epoch_step_samples(self, args, train_dataloader, total_train_b...
    method _wrap_optimizer (line 1412) | def _wrap_optimizer(self, model):
    method train (line 1433) | def train(
    method log_trainable_numel (line 1588) | def log_trainable_numel(self, model):
    method _split_batches_for_accumulation (line 1616) | def _split_batches_for_accumulation(self, inputs):
    method optimizer_step (line 1752) | def optimizer_step(self, args, model, parameters_list=None):
    method _get_meshes_for_loader (line 1802) | def _get_meshes_for_loader(self):
    method _get_inputs_list (line 1805) | def _get_inputs_list(self, inputs):
    method _inner_training_loop (line 1836) | def _inner_training_loop(
    method _load_best_model_from_peft_checkpoint (line 2383) | def _load_best_model_from_peft_checkpoint(self):
    method _get_train_sampler (line 2418) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
    method _set_state_dict_in_model (line 2444) | def _set_state_dict_in_model(self, state_dict):
    method _print_timer (line 2448) | def _print_timer(self):
    method _check_loss_valid (line 2472) | def _check_loss_valid(self, loss):
    method _get_item_from_loss (line 2480) | def _get_item_from_loss(self, loss):
    method _maybe_log_save_evaluate (line 2490) | def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_...
    method log_trained_tokens (line 2647) | def log_trained_tokens(self):
    method _get_learning_rate (line 2667) | def _get_learning_rate(self):
    method get_train_dataloader (line 2670) | def get_train_dataloader(self, dense_tensor_idx=None):
    method _get_eval_sampler (line 2747) | def _get_eval_sampler(self, eval_dataset: Dataset):
    method get_eval_dataloader (line 2783) | def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) ...
    method get_test_dataloader (line 2847) | def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
    method create_optimizer_and_scheduler (line 2909) | def create_optimizer_and_scheduler(self, num_training_steps: int):
    method create_optimizer (line 2920) | def create_optimizer(self, lr_scheduler=None):
    method _apply_to_optimizer (line 2965) | def _apply_to_optimizer(self, action):
    method _offload_optimizer (line 2986) | def _offload_optimizer(self):
    method _reload_optimizer (line 2994) | def _reload_optimizer(self):
    method _load_rng_state (line 3000) | def _load_rng_state(self, checkpoint):
    method get_optimizer_cls_and_kwargs (line 3060) | def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any...
    method create_scheduler (line 3096) | def create_scheduler(self, num_training_steps: int):
    method num_examples (line 3125) | def num_examples(self, dataloader: DataLoader) -> int:
    method _decorate_exclude_layers (line 3139) | def _decorate_exclude_layers(self, model: nn.Layer):
    method _wrap_distributed_optimizer (line 3150) | def _wrap_distributed_optimizer(self, optimizer):
    method _wrap_model (line 3205) | def _wrap_model(self, model, training=True):
    method _prepare_input (line 3478) | def _prepare_input(self, data: Union[paddle.Tensor, Any]) -> Union[pad...
    method _prepare_inputs (line 3497) | def _prepare_inputs(self, inputs: Dict[str, Union[paddle.Tensor, Any]]...
    method autocast_smart_context_manager (line 3508) | def autocast_smart_context_manager(self):
    method compute_loss (line 3538) | def compute_loss(self, model, inputs, return_outputs=False):
    method _enable_delay_scale_loss (line 3593) | def _enable_delay_scale_loss(self):
    method training_step (line 3604) | def training_step(
    method training_pipeline_step (line 3662) | def training_pipeline_step(self, model: nn.Layer, inputs: Dict[str, Un...
    method save_model (line 3718) | def save_model(
    method copy_custom_files (line 3771) | def copy_custom_files(self, output_dir):
    method _filter_moe_no_sync_optimizer_params (line 3795) | def _filter_moe_no_sync_optimizer_params(self):
    method _ordered_save (line 3815) | def _ordered_save(self, state_dict, save_path, signal_path=None):
    method _save_checkpoint (line 3839) | def _save_checkpoint(self, model, metrics=None):
    method set_optimizer_grouped_parameters (line 4080) | def set_optimizer_grouped_parameters(self, optimizer_grouped_parameter...
    method disable_autocast_context_manager (line 4088) | def disable_autocast_context_manager(self):
    method _sorted_checkpoints (line 4097) | def _sorted_checkpoints(
    method _rotate_checkpoints (line 4121) | def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None:
    method _rotate_hf_checkpoints (line 4147) | def _rotate_hf_checkpoints(self, use_mtime=False, output_dir=None) -> ...
    method _save (line 4173) | def _save(
    method _load_scheduler (line 4380) | def _load_scheduler(self, checkpoint):
    method _load_optimizer_and_scheduler (line 4398) | def _load_optimizer_and_scheduler(self, checkpoint):
    method log (line 4481) | def log(self, logs: Dict[str, float], **kwargs) -> None:
    method evaluate (line 4517) | def evaluate(
    method evaluation_loop (line 4582) | def evaluation_loop(
    method predict (line 4800) | def predict(
    method prediction_pipeline_step (line 4857) | def prediction_pipeline_step(
    method prediction_step (line 4925) | def prediction_step(
    method is_local_process_zero (line 5012) | def is_local_process_zero(self) -> bool:
    method is_world_process_zero (line 5019) | def is_world_process_zero(self) -> bool:
    method _nested_gather (line 5026) | def _nested_gather(self, tensors):
    method _pad_across_processes (line 5046) | def _pad_across_processes(self, tensor, pad_index=-100):
    method _set_signature_columns_if_needed (line 5080) | def _set_signature_columns_if_needed(self):
    method _remove_unused_columns (line 5088) | def _remove_unused_columns(self, dataset: "datasets.Dataset", descript...
    method _get_collator_with_removed_columns (line 5118) | def _get_collator_with_removed_columns(
    method _is_iterable_dataset (line 5136) | def _is_iterable_dataset(self, dataset):
    method _is_iterable_dataset_distributed (line 5139) | def _is_iterable_dataset_distributed(self, dataset):
    method print_config (line 5148) | def print_config(self, args=None, key=""):
    method is_unified_checkpoint (line 5174) | def is_unified_checkpoint(self, resume_from_checkpoint, safe_serializa...

FILE: paddleformers/trainer/trainer_callback.py
  class TrainerState (line 75) | class TrainerState:
    method __post_init__ (line 128) | def __post_init__(self):
    method save_to_json (line 132) | def save_to_json(self, json_path: str):
    method load_from_json (line 139) | def load_from_json(cls, json_path: str):
  class TrainerControl (line 147) | class TrainerControl:
    method _new_training (line 182) | def _new_training(self):
    method _new_epoch (line 186) | def _new_epoch(self):
    method _new_step (line 190) | def _new_step(self):
  class TrainerCallback (line 198) | class TrainerCallback:
    method on_init_end (line 248) | def on_init_end(self, args: TrainingArguments, state: TrainerState, co...
    method on_train_begin (line 254) | def on_train_begin(self, args: TrainingArguments, state: TrainerState,...
    method on_train_end (line 260) | def on_train_end(self, args: TrainingArguments, state: TrainerState, c...
    method on_epoch_begin (line 266) | def on_epoch_begin(self, args: TrainingArguments, state: TrainerState,...
    method on_epoch_end (line 272) | def on_epoch_end(self, args: TrainingArguments, state: TrainerState, c...
    method on_step_begin (line 278) | def on_step_begin(self, args: TrainingArguments, state: TrainerState, ...
    method on_load_data_end (line 285) | def on_load_data_end(self, args: TrainingArguments, state: TrainerStat...
    method on_optimizer_begin (line 288) | def on_optimizer_begin(self, args: TrainingArguments, state: TrainerSt...
    method on_optimizer_end (line 291) | def on_optimizer_end(self, args: TrainingArguments, state: TrainerStat...
    method on_substep_end (line 294) | def on_substep_end(self, args: TrainingArguments, state: TrainerState,...
    method on_step_end (line 300) | def on_step_end(self, args: TrainingArguments, state: TrainerState, co...
    method on_evaluate (line 307) | def on_evaluate(self, args: TrainingArguments, state: TrainerState, co...
    method on_save (line 313) | def on_save(self, args: TrainingArguments, state: TrainerState, contro...
    method on_log (line 319) | def on_log(self, args: TrainingArguments, state: TrainerState, control...
    method on_prediction_step (line 325) | def on_prediction_step(self, args: TrainingArguments, state: TrainerSt...
    method on_save_hf (line 331) | def on_save_hf(self, args: TrainingArguments, state: TrainerState, con...
  class CallbackHandler (line 338) | class CallbackHandler(TrainerCallback):
    method __init__ (line 341) | def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler):
    method add_callback (line 360) | def add_callback(self, callback):
    method pop_callback (line 371) | def pop_callback(self, callback):
    method remove_callback (line 383) | def remove_callback(self, callback):
    method callback_list (line 393) | def callback_list(self):
    method on_init_end (line 396) | def on_init_end(self, args: TrainingArguments, state: TrainerState, co...
    method on_train_begin (line 399) | def on_train_begin(self, args: TrainingArguments, state: TrainerState,...
    method on_train_end (line 403) | def on_train_end(self, args: TrainingArguments, state: TrainerState, c...
    method on_epoch_begin (line 406) | def on_epoch_begin(self, args: TrainingArguments, state: TrainerState,...
    method on_epoch_end (line 410) | def on_epoch_end(self, args: TrainingArguments, state: TrainerState, c...
    method on_step_begin (line 413) | def on_step_begin(self, args: TrainingArguments, state: TrainerState, ...
    method on_load_data_end (line 420) | def on_load_data_end(self, args: TrainingArguments, state: TrainerStat...
    method on_optimizer_begin (line 423) | def on_optimizer_begin(self, args: TrainingArguments, state: TrainerSt...
    method on_optimizer_end (line 426) | def on_optimizer_end(self, args: TrainingArguments, state: TrainerStat...
    method on_substep_end (line 429) | def on_substep_end(self, args: TrainingArguments, state: TrainerState,...
    method on_step_end (line 432) | def on_step_end(self, args: TrainingArguments, state: TrainerState, co...
    method on_evaluate (line 435) | def on_evaluate(self, args: TrainingArguments, state: TrainerState, co...
    method on_save (line 439) | def on_save(self, args: TrainingArguments, state: TrainerState, contro...
    method on_save_hf (line 443) | def on_save_hf(self, args: TrainingArguments, state: TrainerState, con...
    method on_log (line 447) | def on_log(self, args: TrainingArguments, state: TrainerState, control...
    method on_prediction_step (line 451) | def on_prediction_step(self, args: TrainingArguments, state: TrainerSt...
    method call_event (line 454) | def call_event(self, event, args, state, control, **kwargs):
  class DefaultFlowCallback (line 474) | class DefaultFlowCallback(TrainerCallback):
    method on_step_end (line 479) | def on_step_end(self, args: TrainingArguments, state: TrainerState, co...
    method on_epoch_end (line 522) | def on_epoch_end(self, args: TrainingArguments, state: TrainerState, c...
  class ProgressCallback (line 538) | class ProgressCallback(TrainerCallback):
    method __init__ (line 543) | def __init__(self):
    method on_train_begin (line 547) | def on_train_begin(self, args, state, control, **kwargs):
    method on_step_end (line 552) | def on_step_end(self, args, state, control, **kwargs):
    method on_prediction_step (line 557) | def on_prediction_step(self, args, state, control, eval_dataloader=Non...
    method on_evaluate (line 565) | def on_evaluate(self, args, state, control, **kwargs):
    method on_log (line 571) | def on_log(self, args, state, control, logs=None, **kwargs):
    method on_train_end (line 580) | def on_train_end(self, args, state, control, **kwargs):
  class PrinterCallback (line 589) | class PrinterCallback(TrainerCallback):
    method on_log (line 594) | def on_log(self, args, state, control, logs=None, **kwargs):
  class EarlyStoppingCallback (line 606) | class EarlyStoppingCallback(TrainerCallback):
    method __init__ (line 622) | def __init__(self, early_stopping_patience: int = 1, early_stopping_th...
    method check_metric_value (line 628) | def check_metric_value(self, args, state, control, metric_value):
    method on_train_begin (line 639) | def on_train_begin(self, args, state, control, **kwargs):
    method on_evaluate (line 648) | def on_evaluate(self, args, state, control, metrics, **kwargs):
  class StepFlexToken (line 665) | class StepFlexToken(TrainerCallback):
    method on_step_begin (line 666) | def on_step_begin(
  function enable_in_dict_config (line 681) | def enable_in_dict_config(config, key):
  class FP8QuantWeightCallback (line 689) | class FP8QuantWeightCallback(TrainerCallback):
    method on_step_begin (line 694) | def on_step_begin(self, args, state, control, **kwargs):
    method on_optimizer_begin (line 733) | def on_optimizer_begin(self, args, state, control, **kwargs):
  class MoECorrectionBiasAdjustCallback (line 751) | class MoECorrectionBiasAdjustCallback(TrainerCallback):
    method __init__ (line 756) | def __init__(self, lr=0.001, use_mp=False):
    method on_optimizer_end (line 761) | def on_optimizer_end(self, args, state, control, **kwargs):
  class MoeExpertsGradScaleCallback (line 812) | class MoeExpertsGradScaleCallback(TrainerCallback):
    method __init__ (line 817) | def __init__(self, args):
    method on_optimizer_begin (line 832) | def on_optimizer_begin(self, args, state, control, **kwargs):
  class MoEGateSpGradSyncCallBack (line 837) | class MoEGateSpGradSyncCallBack(TrainerCallback):
    method __init__ (line 844) | def __init__(self):
    method on_optimizer_begin (line 847) | def on_optimizer_begin(self, args, state, control, **kwargs):
  class SPGradSyncCallback (line 865) | class SPGradSyncCallback(TrainerCallback):
    method __init__ (line 872) | def __init__(self, model):
    method on_optimizer_begin (line 885) | def on_optimizer_begin(self, args, state, control, **kwargs):
  class EMAStateAssemblerCallback (line 895) | class EMAStateAssemblerCallback(TrainerCallback):
    method __init__ (line 896) | def __init__(self, ema_state_assembler):
    method on_step_end (line 899) | def on_step_end(self, args, state, control, **kwargs):
  class InterleaveGateUpCallback (line 906) | class InterleaveGateUpCallback(TrainerCallback):
    method __init__ (line 907) | def __init__(self, model, resume_from_checkpoint=None, output_dir=None):
    method interleave_gate_up_proj (line 912) | def interleave_gate_up_proj(self, w):
    method on_train_begin (line 920) | def on_train_begin(self, args, state, control, **kwargs):

FILE: paddleformers/trainer/trainer_utils.py
  function mock_offload_optimizer (line 99) | def mock_offload_optimizer():
  function log_trainer_start (line 112) | def log_trainer_start():
  class Topology (line 122) | class Topology:
    method __init__ (line 123) | def __init__(
    method __repr__ (line 183) | def __repr__(self):
  function _get_distributed_seeds (line 187) | def _get_distributed_seeds(seed: int = 1234, topo: Topology = None):
  function set_seed (line 298) | def set_seed(seed: int = 1234, topo=None):
  function set_random_seed (line 318) | def set_random_seed(
  function _switch_mode (line 360) | def _switch_mode(mode="dynamic"):
  function _exec_mode_guard (line 369) | def _exec_mode_guard(mode="dynamic"):
  class ExplicitEnum (line 378) | class ExplicitEnum(Enum):
    method _missing_ (line 384) | def _missing_(cls, value):
  class EvalPrediction (line 390) | class EvalPrediction(NamedTuple):
  class EvalLoopOutput (line 403) | class EvalLoopOutput(NamedTuple):
  class PredictionOutput (line 410) | class PredictionOutput(NamedTuple):
  class TrainOutput (line 416) | class TrainOutput(NamedTuple):
  function _check_checkpoint_files (line 422) | def _check_checkpoint_files(
  function get_last_checkpoint (line 440) | def get_last_checkpoint(folder, signal_folder=None, uc_async_save=False):
  class IntervalStrategy (line 480) | class IntervalStrategy(ExplicitEnum):
  class EvaluationStrategy (line 486) | class EvaluationStrategy(ExplicitEnum):
  class OptimizerNames (line 492) | class OptimizerNames(ExplicitEnum):
  class ShardingOption (line 503) | class ShardingOption(ExplicitEnum):
  function is_main_process (line 519) | def is_main_process(local_rank):
  function total_processes_number (line 528) | def total_processes_number(local_rank):
  function speed_metrics (line 539) | def speed_metrics(split, start_time, num_samples=None, num_steps=None, s...
  class SchedulerType (line 571) | class SchedulerType(ExplicitEnum):
  function get_constant_schedule (line 579) | def get_constant_schedule(learning_rate: float, last_epoch: int = -1):
  function get_constant_schedule_with_warmup (line 593) | def get_constant_schedule_with_warmup(learning_rate: float, num_warmup_s...
  function get_linear_schedule_with_warmup (line 616) | def get_linear_schedule_with_warmup(learning_rate: float, num_warmup_ste...
  function get_cosine_schedule_with_warmup (line 643) | def get_cosine_schedule_with_warmup(
  function get_polynomial_decay_schedule_with_warmup (line 681) | def get_polynomial_decay_schedule_with_warmup(
  function get_scheduler (line 741) | def get_scheduler(
  function _secs2timedelta (line 814) | def _secs2timedelta(secs):
  function metrics_format (line 823) | def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]:
  function log_metrics (line 847) | def log_metrics(self, split, metrics):
  function save_metrics (line 868) | def save_metrics(self, split, metrics, combined=True):
  function save_state (line 902) | def save_state(self):
  function has_length (line 914) | def has_length(dataset):
  class TrainerMemoryTracker (line 925) | class TrainerMemoryTracker:
    method __init__ (line 957) | def __init__(self, skip_memory_metrics=False):
    method derive_stage (line 984) | def derive_stage(self):
    method cpu_mem_used (line 994) | def cpu_mem_used(self):
    method peak_monitor_func (line 998) | def peak_monitor_func(self):
    method start (line 1010) | def start(self):
    method stop (line 1040) | def stop(self, stage):
    method update_metrics (line 1084) | def update_metrics(self, stage, metrics):
    method stop_and_update_metrics (line 1129) | def stop_and_update_metrics(self, metrics=None):
  class IterableDatasetShard (line 1142) | class IterableDatasetShard(IterableDataset):
    method __init__ (line 1169) | def __init__(
    method set_epoch (line 1187) | def set_epoch(self, epoch):
    method __iter__ (line 1192) | def __iter__(self):
    method __len__ (line 1227) | def __len__(self):
  class LastBatchPaddingSampler (line 1235) | class LastBatchPaddingSampler(paddle.io.DistributedBatchSampler):
    method __iter__ (line 1238) | def __iter__(self):
  function find_batch_size (line 1295) | def find_batch_size(tensors):
  class RemoveColumnsCollator (line 1315) | class RemoveColumnsCollator:
    method __init__ (line 1318) | def __init__(
    method _remove_columns (line 1333) | def _remove_columns(self, feature: dict) -> dict:
    method __call__ (line 1349) | def __call__(self, features: List[dict]):
  function set_hyrbid_parallel_seed (line 1354) | def set_hyrbid_parallel_seed(basic_seed, dataset_rank, tp_rank, pp_rank=0):
  function should_skip_data (line 1373) | def should_skip_data(global_step, skip_data_intervals):
  function split_parallel_config (line 1390) | def split_parallel_config(parallel_config):
  function download_recovery_ckpt_from_pdc (line 1398) | def download_recovery_ckpt_from_pdc(recovery_checkpoint_path, timeout):
  function _insert_sync (line 1438) | def _insert_sync(self, sync_var, src, mp_group, sync_mode):
  function init_optimizer (line 1473) | def init_optimizer(optimizer, model_sharded_state_dict, state_dict_metad...
  function parse_nccl_config_file (line 1549) | def parse_nccl_config_file(config_dir):
  function init_nccl_config (line 1606) | def init_nccl_config(nccl_comm_group_config, strategy):
  class HFFormatFullParamSaver (line 1638) | class HFFormatFullParamSaver:
    method __init__ (line 1639) | def __init__(
    method get_full_param_iter (line 1660) | def get_full_param_iter(self):
    method determin_saver_based_group (line 1680) | def determin_saver_based_group(self):
    method save_checkpoint (line 1692) | def save_checkpoint(self, path, max_shard_size="16GB"):
  function recover_params_from_master_weight (line 1715) | def recover_params_from_master_weight(ema_state_dict, model, optimizer, ...
  class EMAStateAssembler (line 1765) | class EMAStateAssembler:
    method __init__ (line 1766) | def __init__(
    method run (line 1831) | def run(self):
    method _update_expected_next_save_ckpt_step (line 1903) | def _update_expected_next_save_ckpt_step(self):
    method _set_latest_processed_checkpoint_step (line 1909) | def _set_latest_processed_checkpoint_step(self, start_step):
    method _find_checkpoint (line 1914) | def _find_checkpoint(self, mode: str = "next") -> Tuple[Optional[int],...
    method _is_already_handled (line 1940) | def _is_already_handled(self, checkpoint_dir: Path) -> bool:
    method _check_all_ranks_saved (line 1944) | def _check_all_ranks_saved(self, checkpoint_dir: Path) -> bool:
    method _mark_as_handled (line 1955) | def _mark_as_handled(self, checkpoint_dir: Path, step: int):
    method _handle_checkpoint_with_ema (line 1969) | def _handle_checkpoint_with_ema(self, step: int, checkpoint_dir: Path):
    method _handle_naive_checkpoint (line 1991) | def _handle_naive_checkpoint(self, step: int, checkpoint_dir: Path):
    method _get_ema_state_path (line 2004) | def _get_ema_state_path(self, checkpoint_dir: Path) -> Path:
    method _load_ema_state_dict (line 2012) | def _load_ema_state_dict(self, ema_state_path: Path):
    method _build_ema_sharded_state_dict (line 2020) | def _build_ema_sharded_state_dict(self, ema_state_dict):
    method _save_full_ema_states (line 2086) | def _save_full_ema_states(self, step, ema_sharded_state_dict):
  function select_flex_ckpt_comm_method (line 2104) | def select_flex_ckpt_comm_method():

FILE: paddleformers/trainer/training_args.py
  function get_tensor_model_parallel_group (line 54) | def get_tensor_model_parallel_group(*args, **kwargs):
  function initialize_fleet (line 57) | def initialize_fleet(*args, **kwargs):
  function default_logdir (line 67) | def default_logdir() -> str:
  class TrainingArguments (line 79) | class TrainingArguments:
    method __post_init__ (line 1576) | def __post_init__(self):
    method _post_init_parallel_degree (line 2583) | def _post_init_parallel_degree(self):
    method _post_init_save_checkpoint_format (line 2713) | def _post_init_save_checkpoint_format(self):
    method _post_init_load_checkpoint_format (line 2725) | def _post_init_load_checkpoint_format(self):
    method add_moe_comm_group (line 2737) | def add_moe_comm_group(self):
    method __str__ (line 2746) | def __str__(self):
    method train_batch_size (line 2756) | def train_batch_size(self) -> int:
    method eval_batch_size (line 2764) | def eval_batch_size(self) -> int:
    method current_device (line 2772) | def current_device(self) -> "paddle.device":
    method world_size (line 2779) | def world_size(self):
    method data_parallel_rank (line 2788) | def data_parallel_rank(self):
    method cp_sharding_degree (line 2802) | def cp_sharding_degree(self):
    method cp_sharding_rank (line 2823) | def cp_sharding_rank(self):
    method dataset_rank (line 2836) | def dataset_rank(self):
    method dataset_world_size (line 2851) | def dataset_world_size(self):
    method sharding_parallel_rank (line 2868) | def sharding_parallel_rank(self):
    method tensor_parallel_rank (line 2877) | def tensor_parallel_rank(self):
    method pipeline_parallel_rank (line 2889) | def pipeline_parallel_rank(self):
    method expert_parallel_rank (line 2901) | def expert_parallel_rank(self):
    method moe_sharding_parallel_rank (line 2912) | def moe_sharding_parallel_rank(self):
    method context_parallel_rank (line 2923) | def context_parallel_rank(self):
    method _format_name (line 2933) | def _format_name(self, prefix, rank, degree):
    method optimizer_name_suffix (line 2938) | def optimizer_name_suffix(self):
    method weight_name_suffix (line 2956) | def weight_name_suffix(self):
    method sharded_name_suffix (line 2976) | def sharded_name_suffix(self, shard_id=None, pp_id=None, moe_id=None, ...
    method process_index (line 3007) | def process_index(self):
    method logical_process_index (line 3016) | def logical_process_index(self):
    method local_process_index (line 3038) | def local_process_index(self):
    method should_log (line 3047) | def should_log(self):
    method should_save (line 3059) | def should_save(self):
    method should_save_model_state (line 3077) | def should_save_model_state(self):
    method _no_sync_in_gradient_accumulation (line 3105) | def _no_sync_in_gradient_accumulation(self):
    method should_save_sharding_stage1_model (line 3112) | def should_save_sharding_stage1_model(self):
    method should_load_sharding_stage1_model (line 3122) | def should_load_sharding_stage1_model(self):
    method should_load_dataset (line 3128) | def should_load_dataset(self):
    method get_auto_dist_flag (line 3137) | def get_auto_dist_flag(self):
    method main_process_first (line 3152) | def main_process_first(self, local=True, desc="work"):
    method get_warmup_steps (line 3194) | def get_warmup_steps(self, num_training_steps: int):
    method to_dict (line 3203) | def to_dict(self):
    method to_json_string (line 3218) | def to_json_string(self):
    method to_sanitized_dict (line 3224) | def to_sanitized_dict(self) -> Dict[str, Any]:
    method print_config (line 3236) | def print_config(self, args=None, key=""):
    method should_save_model_with_tensor_fusion (line 3260) | def should_save_model_with_tensor_fusion(self):

FILE: paddleformers/trainer/unified_checkpoint/async_handler.py
  class AsyncCheckpointHandler (line 41) | class AsyncCheckpointHandler:
    method __init__ (line 42) | def __init__(self, args):
    method _file_save_async_or_sync (line 73) | def _file_save_async_or_sync(
    method _save_file_async_in_process (line 193) | def _save_file_async_in_process(
    method _reset_and_update (line 232) | def _reset_and_update(self, shared_array, new_value):
    method unlink_shared_memory (line 240) | def unlink_shared_memory(self):

FILE: paddleformers/trainer/unified_checkpoint/check_completion.py
  function check_unified_checkpoint (line 42) | def check_unified_checkpoint(args, model, resume_from_checkpoint, safe_s...
  function check_unified_optimizer (line 106) | def check_unified_optimizer(args, model, optimizer, resume_from_checkpoi...

FILE: paddleformers/trainer/unified_checkpoint/load_dynamic.py
  function create_send_table (line 56) | def create_send_table(file_keyname_mappings, file_machine_mappings):
  function create_dispatch_table (line 75) | def create_dispatch_table(args, model, file_keyname_mappings, file_machi...
  function create_optimizer_dispatch_table (line 113) | def create_optimizer_dispatch_table(
  function get_file_mappings (line 172) | def get_file_mappings(index, resume_from_checkpoint):
  function distributed_send_recv (line 200) | def distributed_send_recv(
  function load_unified_checkpoint_dynamically (line 258) | def load_unified_checkpoint_dynamically(args, model, resume_from_checkpo...
  function load_unified_optimizer_dynamically (line 317) | def load_unified_optimizer_dynamically(args, model, optimizer, resume_fr...

FILE: paddleformers/trainer/unified_checkpoint/load_local.py
  function load_unified_checkpoint_locally (line 53) | def load_unified_checkpoint_locally(
  function load_unified_optimizer_locally (line 158) | def load_unified_optimizer_locally(args, model, optimizer, resume_from_c...

FILE: paddleformers/trainer/unified_checkpoint/load_save_single_card.py
  function save_file_sync (line 62) | def save_file_sync(state_dict, path, save_to_hf=False):
  function save_single_card_checkpoint (line 67) | def save_single_card_checkpoint(model_to_save, output_dir, save_to_hf=Fa...
  function save_single_card_optimizer (line 105) | def save_single_card_optimizer(model, optimizer, output_dir):
  function load_single_card_checkpoint (line 170) | def load_single_card_checkpoint(model, resume_from_checkpoint: str, conv...
  function load_single_card_optimizer (line 204) | def load_single_card_optimizer(model, optimizer, resume_from_checkpoint:...

FILE: paddleformers/trainer/unified_checkpoint/sharding_split_param_utils.py
  function merge_splited_param (line 49) | def merge_splited_param(
  function gather_splited_param_for_optimizer (line 117) | def gather_splited_param_for_optimizer(optimizer, ckpt_quant_stage="O0"):
  function get_params_info (line 182) | def get_params_info(comm_buffer_list):
  function reshape_params (line 202) | def reshape_params(state_dict, struct2static_name_mappings, param_shape_...
  function load_unified_optimizer_split_param (line 225) | def load_unified_optimizer_split_param(args, model, optimizer, resume_fr...
  function load_non_merge_optimizer_with_split_param (line 364) | def load_non_merge_optimizer_with_split_param(args, model, optimizer, re...

FILE: paddleformers/trainer/unified_checkpoint/shared_memory_utils.py
  class TensorMeta (line 26) | class TensorMeta:
  function _write_shared_memory (line 49) | def _write_shared_memory(value: paddle.Tensor, meta: TensorMeta, buffer):
  function _traverse_copy_to_shm (line 63) | def _traverse_copy_to_shm(value, meta, buffer):
  function _read_ndarray_from_buf (line 86) | def _read_ndarray_from_buf(value, shm_tensor_buffer):
  function _read_state_dict_from_shm (line 105) | def _read_state_dict_from_shm(meta_dict, tensor_shm):
  function _traverse_state_dict (line 113) | def _traverse_state_dict(value, visitor):
  function create_meta_dict (line 131) | def create_meta_dict(state_dict):

FILE: paddleformers/trainer/unified_checkpoint/unified_checkpoint.py
  class UnifiedCheckpointHandler (line 92) | class UnifiedCheckpointHandler:
    method __init__ (line 93) | def __init__(self, args):
    method save_unified_checkpoint (line 97) | def save_unified_checkpoint(self, model, optimizer, output_dir, signal...
    method load_unified_checkpoint (line 182) | def load_unified_checkpoint(self, model, resume_from_checkpoint: str, ...
    method save_non_merge_optimizer (line 208) | def save_non_merge_optimizer(
    method load_non_merge_optimizer (line 326) | def load_non_merge_optimizer(self, model, optimizer, resume_from_check...
    method save_unified_optimizer (line 422) | def save_unified_optimizer(self, model, optimizer, output_dir, signal_...
    method load_unified_optimizer (line 523) | def load_unified_optimizer(self, model, optimizer, resume_from_checkpo...
    method unlink_shared_memory (line 593) | def unlink_shared_memory(self):
  function unified_checkpoint_into_shards (line 597) | def unified_checkpoint_into_shards(
  function unified_optimizer_into_shards (line 680) | def unified_optimizer_into_shards(

FILE: paddleformers/trainer/unified_checkpoint/utils.py
  class UnifiedCheckpointOption (line 70) | class UnifiedCheckpointOption(ExplicitEnum):
  function unwrap_optimizer (line 87) | def unwrap_optimizer(optimizer):
  function is_need_master_weight (line 96) | def is_need_master_weight(optimizer, is_fp16_or_bp16):
  function update_master_weight_status (line 104) | def update_master_weight_status(args, optimizer, has_master_weight, safe...
  function reduce_master_weights_status (line 142) | def reduce_master_weights_status(has_master_weights=False):
  function select_model_weight_index (line 163) | def select_model_weight_index(model, resume_from_checkpoint, safe_serial...
  function mapping_optimizer_tp_actions (line 189) | def mapping_optimizer_tp_actions(tp_actions, optimizer_loaded_keys):
  function get_expected_state_dict (line 208) | def get_expected_state_dict(model_to_save, **kwargs):
  function get_expected_keys (line 224) | def get_expected_keys(args, sharded_metadata, model, optimizer, is_maste...
  function get_optimizer_shard_files (line 264) | def get_optimizer_shard_files(optimizer_path, index_filename):
  function generate_base_static_name (line 301) | def generate_base_static_name(vname):
  function merge_large_tensor_parallel (line 318) | def merge_large_tensor_parallel(tensor, tp_group, tp_action, dst_rank, i...
  function merge_tensor_parallel_with_shard (line 357) | def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_...
  function merge_tensor_parallel_for_optimizer (line 412) | def merge_tensor_parallel_for_optimizer(state_dict, model_state_dict, tp...
  function filter_params (line 464) | def filter_params(model_to_save, state_dict, args, is_optimizer=False):
  function get_sharded_file_name (line 569) | def get_sharded_file_name(args, file_name, is_optimizer=False):
  function get_sharded_index (line 607) | def get_sharded_index(
  function gather_sharded_object (line 630) | def gather_sharded_object(
  function rename_shard_file (line 692) | def rename_shard_file(args, shard_file, file_name):
  function is_sharding_split_param_mode (line 733) | def is_sharding_split_param_mode(args):
  function save_model_config (line 737) | def save_model_config(model_to_save, save_directory, save_to_hf=False):
  function filter_sync_parameters (line 771) | def filter_sync_parameters(

FILE: paddleformers/trainer/utils/async_save.py
  function _save_optimizer (line 26) | def _save_optimizer(obj, name_mapping, path, saved_signal_path, protocol):
  class AsyncSaver (line 47) | class AsyncSaver:
    method __init__ (line 48) | def __init__(self):
    method run (line 57) | def run(self, optimizer_state_dict, path, saved_signal_path, protocol=4):
    method _wait_for_previous_result (line 71) | def _wait_for_previous_result(self):
    method _reset_state (line 100) | def _reset_state(self, path, saved_signal_path, protocol):
    method _process_optimizer_state_dict (line 107) | def _process_optimizer_state_dict(self, optimizer_state_dict):
    method shutdown (line 121) | def shutdown(self):
    method __del__ (line 125) | def __del__(self):

FILE: paddleformers/trainer/utils/ckpt_converter.py
  class CheckpointConverter (line 43) | class CheckpointConverter:
    method __init__ (line 44) | def __init__(
    method load_from_hybrid_parallel_checkpoint (line 116) | def load_from_hybrid_parallel_checkpoint(self):
    method gen_metadata_and_prepare_source_state_dict (line 183) | def gen_metadata_and_prepare_source_state_dict(self):
    method rename_local_view_state_dict (line 518) | def rename_local_view_state_dict(self, state_dict, file_name):
    method load_state_dict_and_rename (line 558) | def load_state_dict_and_rename(self):
    method get_sharded_tensor_infos (line 825) | def get_sharded_tensor_infos(self, file, state_dict, cur_rank_sharded_...
    method gen_metadata_for_tp_sharded_tensor (line 847) | def gen_metadata_for_tp_sharded_tensor(self):
    method rename_using_model_meta (line 897) | def rename_using_model_meta(self, file_name):
    method rename_auto_parallel_state_dict (line 926) | def rename_auto_parallel_state_dict(self):
    method rename_using_parameter_to_structured_name_mapping (line 935) | def rename_using_parameter_to_structured_name_mapping(self, state_dict...
    method rename_using_optimizer_state_order (line 969) | def rename_using_optimizer_state_order(self, model_state_keys, optimiz...
    method partition_parameters (line 1018) | def partition_parameters(self, model_state_shapes, is_sort, shard_num):
    method get_is_model_meta_exists_flag (line 1042) | def get_is_model_meta_exists_flag(self):
    method get_is_model_state_stored_flag (line 1048) | def get_is_model_state_stored_flag(self):
    method flatten_state_dict (line 1064) | def flatten_state_dict(self, state_dict):
    method gather_global_object (line 1073) | def gather_global_object(self, cur_rank_object):
    method get_local_checkpoint_file_names (line 1094) | def get_local_checkpoint_file_names(self):
    method get_distribution_rank_from_file_name (line 1110) | def get_distribution_rank_from_file_name(self, file_name):
    method initial_distributed_configuration (line 1128) | def initial_distributed_configuration(self):
    method infer_sharding_stage1_v (line 1145) | def infer_sharding_stage1_v(self):
    method infer_is_sharding_stage3 (line 1160) | def infer_is_sharding_stage3(self):
    method get_model_state_file_from (line 1188) | def get_model_state_file_from(self, optimizer_state_file_name):
    method optimizer_key_to_model_state_key (line 1196) | def optimizer_key_to_model_state_key(self, optimizer_key):
    method print_checkpoint_file_info (line 1205) | def print_checkpoint_file_info(self, flags):

FILE: paddleformers/trainer/utils/doc.py
  function add_start_docstrings (line 19) | def add_start_docstrings(*docstr):
  function add_start_docstrings_to_model_forward (line 27) | def add_start_docstrings_to_model_forward(*docstr):
  function add_end_docstrings (line 49) | def add_end_docstrings(*docstr):

FILE: paddleformers/trainer/utils/helper.py
  function distributed_concat (line 46) | def distributed_concat(tensor: Any, num_total_examples: Optional[int] = ...
  function paddle_pad_and_concatenate (line 63) | def paddle_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
  function numpy_pad_and_concatenate (line 83) | def numpy_pad_and_concatenate(array1, array2, padding_index=-100):
  function nested_concat (line 98) | def nested_concat(tensors, new_tensors, padding_index=-100):
  function nested_detach (line 116) | def nested_detach(tensors):
  function nested_numpify (line 123) | def nested_numpify(tensors):
  function nested_truncate (line 133) | def nested_truncate(tensors, limit):
  function distributed_isfile (line 140) | def distributed_isfile(filename):
  function distributed_file (line 155) | def distributed_file(filename):
  function broadcast_dp_optimizer (line 188) | def broadcast_dp_optimizer(state_dict):
  function broadcast_moe_optimizer (line 233) | def broadcast_moe_optimizer(state_dict, model_state_dict=None, broadcast...
  function broadcast_dataset_rank0_model (line 315) | def broadcast_dataset_rank0_model(model):

FILE: paddleformers/trainer/utils/offload_optimizer.py
  function offload (line 25) | def offload(tensor):
  function reload (line 37) | def reload(tensor):
  function hack_offload_optimizer (line 42) | def hack_offload_optimizer(mode=None):
  function hack_offload_optimizer_eb5 (line 96) | def hack_offload_optimizer_eb5():

FILE: paddleformers/trainer/utils/reshard/common.py
  function is_sharding_opt (line 34) | def is_sharding_opt(optimizer):
  function get_sharding_strategy (line 51) | def get_sharding_strategy(optimizer):
  function convert_opt_name_to_tname (line 59) | def convert_opt_name_to_tname(tensor_names, opt_names):
  class NodeModelState (line 102) | class NodeModelState:
    method __init__ (line 103) | def __init__(self, group):
    method group (line 111) | def group(self):
    method _add_kv (line 114) | def _add_kv(self, d, k, v):
    method model_weights (line 119) | def model_weights(self):
    method add_weight (line 122) | def add_weight(self, k, v):
    method add_weights (line 125) | def add_weights(self, model_state_dict, rank=None):
    method set_weights (line 131) | def set_weights(self, model_state_dict):
    method set_opt_state (line 134) | def set_opt_state(self, opt_state_dict):
    method set_master_weights (line 137) | def set_master_weights(self, master_weights):
    method opt_state (line 141) | def opt_state(self):
    method add_opt (line 144) | def add_opt(self, k, v):
    method add_opts (line 147) | def add_opts(self, opts, rank=None):
    method master_weights (line 164) | def master_weights(self):
    method add_master_weight (line 167) | def add_master_weight(self, k, v):
    method add_master_weights (line 170) | def add_master_weights(self, master, rank=None):
    method lr_scheduler (line 177) | def lr_scheduler(self):
    method set_lr_scheduler (line 180) | def set_lr_scheduler(self, lr_scheduler):
    method map_names (line 184) | def map_names(self, map_func):
    method drop_rank (line 222) | def drop_rank(self):
    method collapse_key (line 244) | def collapse_key(self):
    method flatten_key (line 272) | def flatten_key(self):
    method pack_keys (line 293) | def pack_keys(self, structure_name_mapping=None):
    method unpack_keys (line 341) | def unpack_keys(self):
    method split_state (line 378) | def split_state(self, split_func):
    method even_distribute (line 403) | def even_distribute(self):
    method reshard (line 452) | def reshard(self, filter_func):
    method split_items (line 464) | def split_items(self, split_func):
    method merge_items (line 485) | def merge_items(self, merge_func):
    method merge_from (line 508) | def merge_from(self, other, rank=None):
    method get_opt_state_dict (line 517) | def get_opt_state_dict(self):
  function split_model_state (line 527) | def split_model_state(model_state, group_getter):
  function merge_model_state (line 537) | def merge_model_state(model_state_map):
  function split_opt_state (line 544) | def split_opt_state(opt_state, group_getter):
  function merge_opt_state (line 565) | def merge_opt_state(opt_state_map):
  function split_structure_name_mapping (line 579) | def split_structure_name_mapping(structure_name_mapping, group_getter):
  function all_gather_simple_object (line 589) | def all_gather_simple_object(obj, group):
  function all_gather_state_dict (line 597) | def all_gather_state_dict(state_dict, filter_func, group):
  function _all_gather_state_dict (line 646) | def _all_gather_state_dict(state_dict, filter_func, group):
  function get_moe_sharding_group (line 658) | def get_moe_sharding_group(hcg=None):
  function get_param_sharding_group (line 667) | def get_param_sharding_group(param, hcg=None):

FILE: paddleformers/trainer/utils/reshard/pp_reshard.py
  function regitser_extract_layer_name_func (line 23) | def regitser_extract_layer_name_func(func):
  function get_extract_layer_name_func (line 28) | def get_extract_layer_name_func():
  function register_index_layer_func (line 37) | def register_index_layer_func(func):
  function get_index_layer_func (line 42) | def get_index_layer_func():
  function register_sname_to_tname_func (line 51) | def register_sname_to_tname_func(func):
  function has_register_sname_to_tname_func (line 56) | def has_register_sname_to_tname_func():
  function get_sname_to_tname_func (line 61) | def get_sname_to_tname_func():
  class LayerNameScope (line 67) | class LayerNameScope:
    method __init__ (line 74) | def __init__(self, prefix, template):
    method get_layer_prefix (line 83) | def get_layer_prefix(cls, old_layer_name):
    method register_layer_prefix (line 90) | def register_layer_prefix(cls, prefix):
    method get_next_scope (line 95) | def get_next_scope(self, layer_id, old_layer_name):
    method get_layer_name (line 103) | def get_layer_name(self):
    method get_sub_scope (line 111) | def get_sub_scope(self, sub_layer_name):
  function register_layername_prefix (line 123) | def register_layername_prefix(layer_name):
  function extract_param_names_groupby_layer (line 127) | def extract_param_names_groupby_layer(
  function build_pipeline_context (line 150) | def build_pipeline_context(meta, pp_model):
  class LayerReNamingManager (line 161) | class LayerReNamingManager:
    method __init__ (line 162) | def __init__(self):
    method get_new_layer_name (line 165) | def get_new_layer_name(self, layer_id: str, old_name: str):
    method get_new_param_name (line 169) | def get_new_param_name(self, layer_id, old_name: str):
  class PipeLinelayer (line 176) | class PipeLinelayer:
    method __init__ (line 177) | def __init__(self, layer_name, param_names):
    method params (line 193) | def params(self):
    method name (line 197) | def name(self):
  class PipeLineSegment (line 201) | class PipeLineSegment:
    method __init__ (line 202) | def __init__(self, start_index, end_index):
    method add_layer (line 208) | def add_layer(self, layer_name, param_names):
    method layers (line 215) | def layers(self):
  class PipeLineStage (line 220) | class PipeLineStage:
    method __init__ (line 221) | def __init__(self):
    method add_segment (line 229) | def add_segment(self, start_index, end_index):
    method add_layer (line 235) | def add_layer(self, layer_index, layer_name, param_names):
    method build_name_mapping (line 240) | def build_name_mapping(self, sname_to_tname=None):
    method map_name (line 253) | def map_name(self, param_name, t_name):
    method print_name_mapping (line 261) | def print_name_mapping(self):
  class PipeLineSegmentContext (line 267) | class PipeLineSegmentContext:
    method __init__ (line 268) | def __init__(
    method _index_layers (line 309) | def _index_layers(self):
    method _segment (line 315) | def _segment(self):
    method map_name (line 324) | def map_name(self, param_name, t_name):
    method map_name_to_stage (line 332) | def map_name_to_stage(self, name):
    method print_name_mapping (line 339) | def print_name_mapping(self):
  function reshard (line 345) | def reshard(node_model_state, reshard_context, hcg):

FILE: paddleformers/trainer/utils/reshard/sharding_v1.py
  function shard (line 23) | def shard(node_model_state, model, optimizer):
  function restore (line 46) | def restore(node_model_state, model, optimizer):

FILE: paddleformers/trainer/utils/reshard/sharding_v2.py
  function shard (line 32) | def shard(node_model_state, model, optimizer):
  function restore (line 89) | def restore(node_model_state, model, optimizer):
  function merge_tensors (line 109) | def merge_tensors(k, tensor_list, shape):
  function pad_tensor (line 124) | def pad_tensor(k, tensor, padded_size):
  function slice_tensor (line 135) | def slice_tensor(tensor, begin, end):
  function collect_split_info (line 139) | def collect_split_info(optimizer, model, only_return_lengths=False):
  function is_matched_optimizer_state_dict (line 175) | def is_matched_optimizer_state_dict(opt_state_dict, optimizer, model, hc...
  function is_bata (line 230) | def is_bata(name):

FILE: paddleformers/trainer/utils/sharding_io.py
  function to_device (line 54) | def to_device(tensor, place=None):
  function filter_sharded_params (line 70) | def filter_sharded_params(state_dict, optimizer, sharding_group, include...
  function exclude_parameters_in_state_dict (line 106) | def exclude_parameters_in_state_dict(
  class ParameterNameRemapper (line 135) | class ParameterNameRemapper:
    method __init__ (line 136) | def __init__(self, old_mapping, new_mapping, checkpoint):
    method _map_tensor (line 150) | def _map_tensor(self, tensor, old_p_name=None):
    method remap_model_state (line 176) | def remap_model_state(self, model_state):
    method remap_optimizer_state (line 182) | def remap_optimizer_state(self, opt_state):
  class GroupGetter (line 214) | class GroupGetter:
    method __init__ (line 215) | def __init__(self, model, hcg=None):
    method _get_parameter_name (line 230) | def _get_parameter_name(self, name):
    method get_group (line 253) | def get_group(self, name):
    method get_group_by_id (line 260) | def get_group_by_id(self, gid):
    method get_group_ids (line 263) | def get_group_ids(self):
  class ShardingIO (line 267) | class ShardingIO:
    method __init__ (line 268) | def __init__(self, args, model, optimizer=None, hcg=None, remap_parame...
    method _get_remapper (line 282) | def _get_remapper(self, checkpoint):
    method _remap_parameter_name (line 297) | def _remap_parameter_name(self, checkpoint, state_dict, is_opt):
    method set_optimizer (line 306) | def set_optimizer(self, optimizer):
    method load_state_dict_from_checkpoint_with_reshard (line 309) | def load_state_dict_from_checkpoint_with_reshard(
    method _load_one_state_dict_from_checkpoint (line 397) | def _load_one_state_dict_from_checkpoint(self, resume_from_checkpoint,...
    method _load_optimizer_state_of_one_shard (line 415) | def _load_optimizer_state_of_one_shard(self, checkpoint, base_opt_name...
    method _modify_ckpt_for_compatibility (line 433) | def _modify_ckpt_for_compatibility(self, ckpt):
    method _need_reshard (line 444) | def _need_reshard(self, checkpoint):
    method _need_reshard_pp (line 483) | def _need_reshard_pp(self, checkpoint):
    method load_optimizer_state_with_reshard (line 492) | def load_optimizer_state_with_reshard(self, checkpoint, base_opt_name,...
    method manipulate_state_dict_and_config (line 608) | def manipulate_state_dict_and_config(self, model_to_save, merge_tensor...
    method gather_distributed_model_meta (line 663) | def gather_distributed_model_meta(self):
    method _check_distributed_strategy (line 680) | def _check_distributed_strategy(self, parallel_config):
    method check_same_strategy (line 688) | def check_same_strategy(self, resume_from_checkpoint=None):
    method _get_distributed_strategy (line 703) | def _get_distributed_strategy(self):
    method _recover_params_from_master_weights (line 729) | def _recover_params_from_master_weights(self, state_dict, opt_state_di...
    method _all_gather_simple_object (line 783) | def _all_gather_simple_object(self, obj, group=None):
    method _load_model_meta_impl (line 792) | def _load_model_meta_impl(self, dir):
    method _load_model_meta (line 801) | def _load_model_meta(self, dir):
    method _sharding_meta_suffix (line 819) | def _sharding_meta_suffix(self, tp_rank=None, pp_rank=None):
    method _load_distributed_strategy (line 831) | def _load_distributed_strategy(self, dir):
    method _load_sharding_meta (line 839) | def _load_sharding_meta(self, dir, pp_rank=None):
    method _map_optimizer_state_to_param (line 857) | def _map_optimizer_state_to_param(self, optimizer_state_names):
    method _gather_sharding_metas (line 873) | def _gather_sharding_metas(self):

FILE: paddleformers/trainer/utils/zero_cost_checkpoint.py
  function md5 (line 98) | def md5(tensor):
  class ZCCTaskType (line 105) | class ZCCTaskType(Enum):
  class ZCCWorkerStatus (line 117) | class ZCCWorkerStatus(Enum):
  function showmem (line 124) | def showmem(msg):
  function sharded_state_dict_compatibility (line 134) | def sharded_state_dict_compatibility(func, *, return_sharded_state_dict=...
  function get_fused_param_mappings (line 192) | def get_fused_param_mappings(optimizer, manipulated_state_dict):
  class ZeroCostCheckpointEMAProcessor (line 219) | class ZeroCostCheckpointEMAProcessor:
    method __init__ (line 225) | def __init__(self, optimizer_fusion_storage_helper, param_fusion_stora...
    method status (line 236) | def status(self):
    method build_ema_buffer (line 244) | def build_ema_buffer(self):
    method ema_reset (line 266) | def ema_reset(self):
    method ema_accumulate (line 271) | def ema_accumulate(self, global_step, loss, zcc_ema_loss_threshold):
    method ema_state_dict (line 299) | def ema_state_dict(self):
    method load_ema_state_dict (line 327) | def load_ema_state_dict(self, state_dict):
  class ParamFusionStorageHelper (line 348) | class ParamFusionStorageHelper:
    method __init__ (line 349) | def __init__(
    method reset_meta (line 366) | def reset_meta(
    method init_buffer (line 391) | def init_buffer(self, meta):
    method sync_partial_param (line 400) | def sync_partial_param(self, numel_to_sync):
    method wait_all (line 441) | def wait_all(self):
    method state_dict (line 454) | def state_dict(self):
    method restore_tensor_from_meta (line 461) | def restore_tensor_from_meta(self, tensor_meta):
  class ZeroCostCheckpointCallback (line 473) | class ZeroCostCheckpointCallback(TrainerCallback):
    method __init__ (line 488) | def __init__(self, args, zcc_manager, timer, sharding_io):
    method on_substep_end (line 499) | def on_substep_end(self, args, state, control, **kwargs):
    method on_optimizer_begin (line 502) | def on_optimizer_begin(self, args, state, control, **kwargs):
    method on_step_end (line 509) | def on_step_end(self, args, state, control, model, lr_scheduler, optim...
    method get_rng_states (line 525) | def get_rng_states(self, args):
    method _get_save_infos_based_on_steps (line 541) | def _get_save_infos_based_on_steps(self, state, args, checkpoint_folder):
    method _pack_dynamic_objects (line 550) | def _pack_dynamic_objects(self):
    method _pack_static_objects (line 559) | def _pack_static_objects(self, args):
    method maybe_update_zcc_worker (line 568) | def maybe_update_zcc_worker(self, args, model, optimizer, global_step):
    method _cache_meta_for_sharded_save (line 591) | def _cache_meta_for_sharded_save(self, model, unused):
  class ZeroCostCheckpointManager (line 611) | class ZeroCostCheckpointManager:
    method __init__ (line 612) | def __init__(
    method set_ema_state_dict (line 674) | def set_ema_state_dict(self, path):
    method update_zcc_workers (line 681) | def update_zcc_workers(self, new_version, dynamic_objecs, static_objec...
    method get_idle_worker_for_saving (line 707) | def get_idle_worker_for_saving(self, save_infos_and_non_cached_objects...
    method sync_offload_status (line 735) | def sync_offload_status(self):
    method report_error_worker (line 755) | def report_error_worker(self):
    method zcc_pipeline_hook (line 761) | def zcc_pipeline_hook(self, hook_id):
    method finalize (line 772) | def finalize(self):
    method terminate_workers (line 783) | def terminate_workers(self):
  function worker_loop (line 791) | def worker_loop(worker):
  class ZeroCostCheckpointWorker (line 795) | class ZeroCostCheckpointWorker:
    method __init__ (line 796) | def __init__(
    method process_update_task (line 857) | def process_update_task(self, updates):
    method process_prepare_task (line 877) | def process_prepare_task(self, prepares):
    method process_offload_task (line 886) | def process_offload_task(self, dump, global_step):
    method process_dump_task (line 938) | def process_dump_task(self):
    method _filter_moe_no_sync_optimizer_params (line 963) | def _filter_moe_no_sync_optimizer_params(self, model_meta, optimzier_s...
    method _dump_static_objects (line 992) | def _dump_static_objects(self, output_dir):
    method _dump_states (line 1013) | def _dump_states(self, output_dir):
    method _dump_args_and_state (line 1038) | def _dump_args_and_state(self, output_dir):
    method process_dump_task_impl (line 1054) | def process_dump_task_impl(self, output_dir, saved_signal_type="tmp"):
    method run (line 1087) | def run(self):
    method build_fusion_storage_helper (line 1142) | def build_fusion_storage_helper(self, optimizer_states_meta, model_sta...
    method manage_offload_chunk (line 1169) | def manage_offload_chunk(self):
  class EMABuffer (line 1180) | class EMABuffer(ABC):
    method __init__ (line 1181) | def __init__(self, resume_from_checkpoint, args, offload=True):
    method _load (line 1189) | def _load(self, resume_from_checkpoint):
    method get_ema_state_dict (line 1207) | def get_ema_state_dict(self):
    method save (line 1212) | def save(self, global_step):
    method ema_accumulate (line 1222) | def ema_accumulate(self, global_step, loss, ema_loss_threshold):
    method _ema_impl (line 1235) | def _ema_impl(self, state_dict, ema_state_dict):
    method _get_master_weight (line 1252) | def _get_master_weight(self):
    method _get_model_state (line 1256) | def _get_model_state(self):
    method _check_consistent_dist_strategy (line 1260) | def _check_consistent_dist_strategy(self, resume_from_checkpoint):
  class EMABufferShardingIOBased (line 1264) | class EMABufferShardingIOBased(EMABuffer):
    method __init__ (line 1265) | def __init__(self, resume_from_checkpoint, args, sharding_io, offload=...
    method _ema_path (line 1270) | def _ema_path(self, base_path):
    method _get_model_state (line 1275) | def _get_model_state(self):
    method _get_master_weight (line 1281) | def _get_master_weight(self):
    method _check_consistent_dist_strategy (line 1284) | def _check_consistent_dist_strategy(self, resume_from_checkpoint):
  class EMABufferFcBased (line 1288) | class EMABufferFcBased(EMABuffer):
    method __init__ (line 1289) | def __init__(self, resume_from_checkpoint, args, offload=True, hcg=Non...
    method _get_model_meta (line 1301) | def _get_model_meta(self):
    method _ema_path (line 1304) | def _ema_path(self, base_path):
    method _check_consistent_dist_strategy (line 1307) | def _check_consistent_dist_strategy(self, resume_from_checkpoint):
    method _get_model_state (line 1310) | def _get_model_state(self):
    method _get_master_weight (line 1314) | def _get_master_weight(self):
    method save (line 1318) | def save(self, global_step):
  class NonZCCEMACallback (line 1330) | class NonZCCEMACallback(TrainerCallback):
    method __init__ (line 1331) | def __init__(self, ema_buffer: EMABuffer, ema_state_assembler=None):
    method create_nonzcc_callback (line 1336) | def create_nonzcc_callback(
    method on_step_end (line 1356) | def on_step_end(self, args, state, control, **kwargs):
  class DistInfoCollectorValidator (line 1384) | class DistInfoCollectorValidator:
    method __init__ (line 1385) | def __init__(self, args, hcg=None):
    method _load_model_meta_impl (line 1391) | def _load_model_meta_impl(self, dir):
    method _all_gather_simple_object (line 1400) | def _all_gather_simple_object(self, obj, group=None):
    method _sharding_meta_suffix (line 1409) | def _sharding_meta_suffix(self, tp_rank=None, pp_rank=None):
    method _gather_sharding_metas (line 1421) | def _gather_sharding_metas(self, model, optimizer):
    method _check_distributed_strategy (line 1465) | def _check_distributed_strategy(self, parallel_config):
    method _get_distributed_strategy (line 1473) | def _get_distributed_strategy(self):
    method gather_distributed_model_meta (line 1499) | def gather_distributed_model_meta(self, model, optimizer):
    method check_same_strategy (line 1516) | def check_same_strategy(self, resume_from_checkpoint=None):
  function saved_ckptmeta (line 1532) | def saved_ckptmeta(state_dict, ckpt_file_name, process_group=None, repli...
  class ZeroCostCheckpointCallbackFcBased (line 1645) | class ZeroCostCheckpointCallbackFcBased(ZeroCostCheckpointCallback):
    method __init__ (line 1646) | def __init__(self, args, zcc_manager, timer, unused_arg):
    method _manipulate_state_dict_and_config (line 1658) | def _manipulate_state_dict_and_config(self, model_to_save, optimizer):
    method _cache_meta_for_sharded_save (line 1692) | def _cache_meta_for_sharded_save(self, model, optimizer):
    method _gen_unified_name (line 1757) | def _gen_unified_name(self, optimizer, model_sharded_state_dict):
    method _pack_dynamic_objects (line 1840) | def _pack_dynamic_objects(self):
    method maybe_update_zcc_worker (line 1861) | def maybe_update_zcc_worker(self, args, model, optimizer, global_step):
  class ZeroCostCheckpointWorkerFcBased (line 1884) | class ZeroCostCheckpointWorkerFcBased(ZeroCostCheckpointWorker):
    method process_update_task (line 1885) | def process_update_task(self, updates):
    method _replace_pname_with_unified (line 1914) | def _replace_pname_with_unified(self, state_dict):
    method _filter_state_dict (line 1923) | def _filter_state_dict(state_dict, filter_map):
    method _slice_padded_tensor (line 1938) | def _slice_padded_tensor(static_dict, param_slice_info):
    method _save_model_state (line 1954) | def _save_model_state(self, output_dir):
    method _save_opt_state (line 1971) | def _save_opt_state(self, output_dir):
    method _save_ema_state (line 2012) | def _save_ema_state(self, output_dir):
    method _dump_states (line 2024) | def _dump_states(self, output_dir):

FILE: paddleformers/transformers/activations.py
  class PaddleGELUTanh (line 24) | class PaddleGELUTanh(nn.Layer):
    method forward (line 33) | def forward(self, input: paddle.Tensor) -> paddle.Tensor:
  class NewGELUActivation (line 37) | class NewGELUActivation(nn.Layer):
    method forward (line 43) | def forward(self, input: Tensor) -> Tensor:
  class GELUActivation (line 49) | class GELUActivation(nn.Layer):
    method __init__ (line 57) | def __init__(self, use_gelu_python: bool = False):
    method _gelu_python (line 64) | def _gelu_python(self, input: Tensor) -> Tensor:
    method forward (line 67) | def forward(self, input: Tensor) -> Tensor:
  class FastGELUActivation (line 71) | class FastGELUActivation(nn.Layer):
    method forward (line 76) | def forward(self, input: Tensor) -> Tensor:
  class QuickGELUActivation (line 80) | class QuickGELUActivation(nn.Layer):
    method forward (line 85) | def forward(self, input: Tensor) -> Tensor:
  class ClippedGELUActivation (line 89) | class ClippedGELUActivation(nn.Layer):
    method __init__ (line 102) | def __init__(self, min: float, max: float):
    method forward (line 110) | def forward(self, x: Tensor) -> Tensor:
  class SiLUActivation (line 114) | class SiLUActivation(nn.Layer):
    method forward (line 123) | def forward(self, input: Tensor) -> Tensor:
  class MishActivation (line 127) | class MishActivation(nn.Layer):
    method forward (line 133) | def forward(self, input: Tensor) -> Tensor:
  class LinearActivation (line 137) | class LinearActivation(nn.Layer):
    method forward (line 142) | def forward(self, input: Tensor) -> Tensor:
  class ClassInstantier (line 146) | class ClassInstantier(OrderedDict):
    method __getitem__ (line 147) | def __getitem__(self, key):
  function get_activation (line 173) | def get_activation(activation_string):

FILE: paddleformers/transformers/aistudio_utils.py
  class UnauthorizedError (line 21) | class UnauthorizedError(Exception):
  class EntryNotFoundError (line 25) | class EntryNotFoundError(Exception):
  function _add_subfolder (line 29) | def _add_subfolder(weights_name: str, subfolder: Optional[str] = None) -...
  function aistudio_download (line 35) | def aistudio_download(

FILE: paddleformers/transformers/attention_utils.py
  class Registry (line 26) | class Registry(object):
    method __init__ (line 27) | def __init__(self):
    method register (line 30) | def register(self, name):
  function create_bigbird_rand_mask_idx (line 41) | def create_bigbird_rand_mask_idx(
  function create_bigbird_rand_mask_idx_list (line 90) | def create_bigbird_rand_mask_idx_list(
  function _convert_param_attr_to_list (line 111) | def _convert_param_attr_to_list(param_attr, n):
  class Linear3D (line 140) | class Linear3D(Layer):
    method __init__ (line 141) | def __init__(self, hidden_size, num_attention_heads, size_per_head, we...
    method forward (line 154) | def forward(self, input):
  class Attention (line 166) | class Attention(Layer):
    method __init__ (line 167) | def __init__(self, num_heads=1, block_size=1, window_size=3, num_globa...
    method forward (line 170) | def forward(
  class DefaultAttention (line 186) | class DefaultAttention(Attention):
    method forward (line 187) | def forward(
  class BigBirdSparseAttention (line 214) | class BigBirdSparseAttention(Attention):
    method __init__ (line 215) | def __init__(self, num_heads=1, block_size=1, window_size=3, num_globa...
    method _get_band_mask (line 227) | def _get_band_mask(self, blocked_query_mask, blocked_key_mask, batch_s...
    method _get_band_matrix (line 291) | def _get_band_matrix(self, blocked_matrix, B, T):
    method _get_rand_mask (line 350) | def _get_rand_mask(self, blocked_query_mask, blocked_key_mask, rand_ma...
    method _gather_random_key_value (line 374) | def _gather_random_key_value(self, blocked_matrix, rand_mask_idx, B, T):
    method _get_global_out (line 389) | def _get_global_out(self, query_matrix, key_matrix, value_matrix, key_...
    method _get_splited_matrix (line 404) | def _get_splited_matrix(self, matrix):
    method forward (line 408) | def forward(
  class MultiHeadAttention (line 519) | class MultiHeadAttention(Layer):
    method __init__ (line 524) | def __init__(
    method _prepare_qkv (line 560) | def _prepare_qkv(self, query, key, value, cache=None):
    method compute_kv (line 577) | def compute_kv(self, key, value):
    method gen_cache (line 582) | def gen_cache(self, key, value=None, type=Cache):
    method forward (line 595) | def forward(

FILE: paddleformers/transformers/audio_processing_utils.py
  class ExplicitEnum (line 34) | class ExplicitEnum(Enum):
    method _missing_ (line 40) | def _missing_(cls, value):
  class SequenceFeatureExtractor (line 46) | class SequenceFeatureExtractor(FeatureExtractionMixin):
    method __init__ (line 59) | def __init__(self, feature_size: int, sampling_rate: int, padding_valu...
    method pad (line 69) | def pad(
    method _pad (line 239) | def _pad(
    method _truncate (line 311) | def _truncate(
    method _get_padding_strategies (line 354) | def _get_padding_strategies(self, padding=False, max_length=None):
  function process_audio_info (line 387) | def process_audio_info(conversations: list[dict] | list[list[dict]], use...

FILE: paddleformers/transformers/audio_utils.py
  function hertz_to_mel (line 28) | def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk")...
  function mel_to_hertz (line 62) | def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk")...
  function _create_triangular_filter_bank (line 96) | def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: ...
  function mel_filter_bank (line 118) | def mel_filter_bank(
  function optimal_fft_length (line 196) | def optimal_fft_length(window_length: int) -> int:
  function window_function (line 209) | def window_function(
  function spectrogram (line 270) | def spectrogram(
  function power_to_db (line 458) | def power_to_db(
  function amplitude_to_db (line 509) | def amplitude_to_db(
  function get_mel_filter_banks (line 558) | def get_mel_filter_banks(
  function fram_wave (line 582) | def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size...
  function stft (line 638) | def stft(frames: np.array, windowing_function: np.array, fft_window_size...

FILE: paddleformers/transformers/auto/configuration.py
  function config_class_to_model_type (line 103) | def config_class_to_model_type(config):
  class _LazyConfigMapping (line 115) | class _LazyConfigMapping(OrderedDict):
    method __init__ (line 120) | def __init__(self, mapping):
    method __getitem__ (line 125) | def __getitem__(self, key):
    method keys (line 148) | def keys(self):
    method values (line 151) | def values(self):
    method items (line 154) | def items(self):
    method __iter__ (line 157) | def __iter__(self):
    method __contains__ (line 160) | def __contains__(self, item):
    method register (line 163) | def register(self, key, value, exist_ok=False):
  function get_configurations (line 175) | def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]:
  function model_type_to_module_name (line 208) | def model_type_to_module_name(key):
  class AutoConfig (line 219) | class AutoConfig(PretrainedConfig):
    method _get_config_class_from_config (line 235) | def _get_config_class_from_config(
    method from_file (line 270) | def from_file(cls, config_file: str, **kwargs) -> AutoConfig:
    method from_pretrained (line 286) | def from_pretrained(cls, pretrained_model_name_or_path: str, *model_ar...
    method register (line 377) | def register(model_type, config, exist_ok=False):

FILE: paddleformers/transformers/auto/factory.py
  function getattribute_from_module (line 22) | def getattribute_from_module(module, attr):
  class _LazyAutoMapping (line 42) | class _LazyAutoMapping(OrderedDict):
    method __init__ (line 51) | def __init__(self, config_mapping, model_mapping):
    method __len__ (line 59) | def __len__(self):
    method __getitem__ (line 63) | def __getitem__(self, key):
    method _load_attr_from_module (line 79) | def _load_attr_from_module(self, model_type, attr):
    method keys (line 90) | def keys(self):
    method get (line 98) | def get(self, key, default):
    method __bool__ (line 104) | def __bool__(self):
    method values (line 107) | def values(self):
    method items (line 115) | def items(self):
    method __iter__ (line 126) | def __iter__(self):
    method __contains__ (line 129) | def __contains__(self, item):
    method register (line 137) | def register(self, key, value, exist_ok=False):

FILE: paddleformers/transformers/auto/feature_extraction.py
  function safe_load_json_file (line 47) | def safe_load_json_file(json_file: str):
  function feature_extractor_class_from_name (line 58) | def feature_extractor_class_from_name(class_name: str):
  function get_feature_extractor_config (line 81) | def get_feature_extractor_config(
  class AutoFeatureExtractor (line 188) | class AutoFeatureExtractor:
    method __init__ (line 196) | def __init__(self):
    method from_pretrained (line 203) | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    method register (line 262) | def register(config_class, feature_extractor_class, exist_ok=False):

FILE: paddleformers/transformers/auto/image_processing.py
  function get_image_processor_class_from_name (line 70) | def get_image_processor_class_from_name(class_name: str):
  function get_image_processor_config (line 97) | def get_image_processor_config(
  function _bind_paddle_mixin_if_available (line 222) | def _bind_paddle_mixin_if_available(image_processor_class):
  class AutoImageProcessor (line 238) | class AutoImageProcessor(hf.AutoImageProcessor):
    method from_pretrained (line 256) | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwa...

FILE: paddleformers/transformers/auto/modeling.py
  function get_name_mapping (line 114) | def get_name_mapping(task="Model"):
  function get_task_name (line 133) | def get_task_name(model_class):
  class _BaseAutoModelClass (line 140) | class _BaseAutoModelClass:
    method __init__ (line 148) | def __init__(self, *args, **kwargs):
    method _get_model_class_from_config (line 156) | def _get_model_class_from_config(cls, pretrained_model_name_or_path, c...
    method from_config (line 249) | def from_config(cls, config, **kwargs):
    method _from_pretrained (line 254) | def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *m...
    method register (line 330) | def register(cls, config_class, model_class, exist_ok=False):
  class AutoBackbone (line 348) | class AutoBackbone(_BaseAutoModelClass):
    method from_pretrained (line 357) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModel (line 390) | class AutoModel(_BaseAutoModelClass):
    method from_pretrained (line 403) | def from_pretrained(cls, pretrained_model_name_or_path, task=None, *mo...
  class AutoModelForPretraining (line 461) | class AutoModelForPretraining(_BaseAutoModelClass):
    method from_pretrained (line 470) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForSequenceClassification (line 507) | class AutoModelForSequenceClassification(_BaseAutoModelClass):
    method from_pretrained (line 516) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForTokenClassification (line 553) | class AutoModelForTokenClassification(_BaseAutoModelClass):
    method from_pretrained (line 562) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForQuestionAnswering (line 599) | class AutoModelForQuestionAnswering(_BaseAutoModelClass):
    method from_pretrained (line 608) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForMultipleChoice (line 645) | class AutoModelForMultipleChoice(_BaseAutoModelClass):
    method from_pretrained (line 654) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForMaskedLM (line 691) | class AutoModelForMaskedLM(_BaseAutoModelClass):
    method from_pretrained (line 700) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForCausalLM (line 737) | class AutoModelForCausalLM(_BaseAutoModelClass):
    method from_pretrained (line 746) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForCausalLMPipe (line 783) | class AutoModelForCausalLMPipe(_BaseAutoModelClass):
    method from_pretrained (line 792) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoEncoder (line 796) | class AutoEncoder(_BaseAutoModelClass):
    method from_pretrained (line 805) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoDecoder (line 837) | class AutoDecoder(_BaseAutoModelClass):
    method from_pretrained (line 846) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoGenerator (line 878) | class AutoGenerator(_BaseAutoModelClass):
    method from_pretrained (line 887) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoDiscriminator (line 924) | class AutoDiscriminator(_BaseAutoModelClass):
    method from_pretrained (line 933) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForConditionalGeneration (line 970) | class AutoModelForConditionalGeneration(_BaseAutoModelClass):
    method from_pretrained (line 979) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...
  class AutoModelForConditionalGenerationPipe (line 1012) | class AutoModelForConditionalGenerationPipe(_BaseAutoModelClass):
    method from_pretrained (line 1021) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *...

FILE: paddleformers/transformers/auto/processing.py
  function processor_class_from_name (line 66) | def processor_class_from_name(class_name: str):
  class AutoProcessor (line 90) | class AutoProcessor:
    method from_pretrained (line 106) | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):

FILE: paddleformers/transformers/auto/tokenizer.py
  function get_paddleformers_tokenizer_config (line 53) | def get_paddleformers_tokenizer_config(
  function tokenizer_class_from_name (line 150) | def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
  function _bind_paddle_mixin_if_available (line 175) | def _bind_paddle_mixin_if_available(tokenizer_class):
  class AutoTokenizer (line 191) | class AutoTokenizer(hf.AutoTokenizer):
    method from_pretrained (line 209) | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwa...

FILE: paddleformers/transformers/auto/video_processing.py
  function video_processor_class_from_name (line 53) | def video_processor_class_from_name(class_name: str):
  function get_video_processor_config (line 77) | def get_video_processor_config(
  class AutoVideoProcessor (line 201) | class AutoVideoProcessor:
    method from_pretrained (line 217) | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwa...

FILE: paddleformers/transformers/auto_utils.py
  function get_mesh (line 19) | def get_mesh(pp_idx=None):
  function einsum (line 29) | def einsum(rule, a, b):

FILE: paddleformers/transformers/cache_utils.py
  class CacheLayerMixin (line 26) | class CacheLayerMixin(ABC):
    method __init__ (line 31) | def __init__(self):
    method __repr__ (line 36) | def __repr__(self):
    method lazy_initialization (line 40) | def lazy_initialization(self, key_states: paddle.Tensor):
    method update (line 44) | def update(
    method get_mask_sizes (line 50) | def get_mask_sizes(self, cache_position: paddle.Tensor) -> tuple[int, ...
    method get_seq_length (line 54) | def get_seq_length(self) -> int:
    method get_max_cache_shape (line 58) | def get_max_cache_shape(self) -> int:
    method offload (line 61) | def offload(self):
    method prefetch (line 67) | def prefetch(self):
    method reset (line 73) | def reset(self) -> None:
    method reorder_cache (line 82) | def reorder_cache(self, beam_idx: paddle.LongTensor) -> None:
  class DynamicLayer (line 89) | class DynamicLayer(CacheLayerMixin):
    method lazy_initialization (line 97) | def lazy_initialization(self, key_states: paddle.Tensor, value_states:...
    method update (line 108) | def update(
    method get_mask_sizes (line 133) | def get_mask_sizes(self, cache_position: paddle.Tensor) -> tuple[int, ...
    method get_seq_length (line 140) | def get_seq_length(self) -> int:
    method get_max_cache_shape (line 146) | def get_max_cache_shape(self) -> int:
    method crop (line 150) | def crop(self, max_length: int) -> None:
    method batch_repeat_interleave (line 164) | def batch_repeat_interleave(self, repeats: int) -> None:
    method batch_select_indices (line 170) | def batch_select_indices(self, indices: paddle.Tensor) -> None:
  class Cache (line 177) | class Cache:
    method __init__ (line 197) | def __init__(
    method __repr__ (line 221) | def __repr__(self):
    method prefetch (line 224) | def prefetch(self, layer_idx: int, only_non_sliding: bool = True):
    method offload (line 253) | def offload(self, layer_idx: int, only_non_sliding: bool = True):
    method update (line 262) | def update(
    method early_initialization (line 306) | def early_initialization(
    method get_seq_length (line 322) | def get_seq_length(self, layer_idx: int = 0) -> int:
    method get_mask_sizes (line 328) | def get_mask_sizes(self, cache_position: paddle.Tensor, layer_idx: int...
    method get_max_cache_shape (line 340) | def get_max_cache_shape(self, layer_idx: int = 0) -> int:
    method reset (line 348) | def reset(self):
    method reorder_cache (line 353) | def reorder_cache(self, beam_idx: paddle.LongTensor):
    method crop (line 358) | def crop(self, max_length: int):
    method batch_repeat_interleave (line 363) | def batch_repeat_interleave(self, repeats: int):
    method batch_select_indices (line 368) | def batch_select_indices(self, indices: paddle.Tensor):
    method max_batch_size (line 374) | def max_batch_size(self) -> int:
    method max_cache_len (line 382) | def max_cache_len(self) -> int:
    method is_compileable (line 388) | def is_compileable(self) -> bool:
    method is_initialized (line 396) | def is_initialized(self) -> bool:
    method is_sliding (line 401) | def is_sliding(self) -> list[bool]:
    method __len__ (line 405) | def __len__(self):
  class DynamicCache (line 414) | class DynamicCache(Cache):
    method __init__ (line 457) | def __init__(
    method __iter__ (line 518) | def __iter__(self):
  class DynamicSlidingWindowLayer (line 523) | class DynamicSlidingWindowLayer(DynamicLayer):
    method __init__ (line 531) | def __init__(self, sliding_window: int):
    method lazy_initialization (line 537) | def lazy_initialization(self, key_states: paddle.Tensor, value_states:...
    method update (line 541) | def update(
    method get_mask_sizes (line 574) | def get_mask_sizes(self, cache_position: paddle.Tensor) -> tuple[int, ...
    method get_seq_length (line 587) | def get_seq_length(self) -> int:
    method get_max_cache_shape (line 591) | def get_max_cache_shape(self) -> int:
    method crop (line 595) | def crop(self, max_length: int) -> None:

FILE: paddleformers/transformers/configuration_utils.py
  function custom_object_save (line 45) | def custom_object_save(obj, folder, config=None):
  function attribute_map (line 95) | def attribute_map(config: PretrainedConfig, kwargs: Dict[str, Any]) -> D...
  function convert_to_legacy_config (line 110) | def convert_to_legacy_config(attribute_map: Dict[str, str], config: Dict...
  function flatten_model_config (line 134) | def flatten_model_config(config: dict) -> dict:
  function is_standard_config (line 166) | def is_standard_config(config: Union[PretrainedConfig, Dict[str, Any]]) ...
  function resolve_hf_config_path (line 178) | def resolve_hf_config_path(repo_id: str, cache_dir: str, subfolder=None)...
  function set_expected_keys (line 204) | def set_expected_keys(config, llm_meta, kwargs):
  function llmmetaclass (line 213) | def llmmetaclass(cls):
  class LlmMetaConfig (line 229) | class LlmMetaConfig:
    method _get_defaults (line 519) | def _get_defaults(cls):
    method _get_init (line 538) | def _get_init(cls):
    method _get_all_meta (line 556) | def _get_all_meta(cls):
    method _get_unsavable_keys (line 574) | def _get_unsavable_keys(cls):
    method set_llm_config (line 590) | def set_llm_config(cls, config, args):
  class PretrainedConfig (line 598) | class PretrainedConfig:
    method __setattr__ (line 802) | def __setattr__(self, key, value):
    method __getattribute__ (line 808) | def __getattribute__(self, key):
    method __getitem__ (line 813) | def __getitem__(self, key):
    method __setitem__ (line 816) | def __setitem__(self, key, value):
    method __init__ (line 820) | def __init__(self, **kwargs):
    method _create_id_label_maps (line 951) | def _create_id_label_maps(self, num_labels: int):
    method _get_generation_defaults (line 956) | def _get_generation_defaults() -> Dict[str, Any]:
    method _has_non_default_generation_parameters (line 985) | def _has_non_default_generation_parameters(self) -> bool:
    method name_or_path (line 995) | def name_or_path(self) -> str:
    method name_or_path (line 999) | def name_or_path(self, value):
    method use_return_dict (line 1003) | def use_return_dict(self) -> bool:
    method num_labels (line 1010) | def num_labels(self) -> int:
    method num_labels (line 1017) | def num_labels(self, num_labels: int):
    method save_pretrained (line 1021) | def save_pretrained(self, save_directory: Union[str, os.PathLike], **k...
    method from_pretrained (line 1051) | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os....
    method get_config_dict (line 1120) | def get_config_dict(
    method _get_config_dict (line 1159) | def _get_config_dict(
    method from_dict (line 1217) | def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "Pretrain...
    method from_json_file (line 1279) | def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "Pretra...
    method _dict_from_json_file (line 1295) | def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
    method __eq__ (line 1300) | def __eq__(self, other):
    method __repr__ (line 1303) | def __repr__(self):
    method to_diff_dict (line 1306) | def to_diff_dict(self, saving_file=False) -> Dict[str, Any]:
    method register_unsavable_keys (line 1360) | def register_unsavable_keys(self, keys):
    method to_dict (line 1369) | def to_dict(self, saving_file=False) -> Dict[str, Any]:
    method to_json_string (line 1419) | def to_json_string(self, use_diff: bool = True, saving_file=False) -> ...
    method to_json_file (line 1438) | def to_json_file(self, json_file_path: Union[str, os.PathLike], use_di...
    method update (line 1458) | def update(self, config_dict: Dict[str, Any]):
    method update_from_string (line 1468) | def update_from_string(self, update_str: str):
    method _remove_keys_not_serialized (line 1506) | def _remove_keys_not_serialized(self, d: dict[str, Any], saving_file: ...
    method register_for_auto_class (line 1526) | def register_for_auto_class(cls, auto_class="AutoConfig"):
    method get (line 1551) | def get(self, key, default=None):
    method get_text_config (line 1563) | def get_text_config(self, decoder=None, encoder=None) -> "PretrainedCo...
  function get_configuration_file (line 1636) | def get_configuration_file(configuration_files: List[str]) -> str:
  function recursive_diff_dict (line 1676) | def recursive_diff_dict(dict_a, dict_b, config_obj=None):
  function layer_type_validation (line 1700) | def layer_type_validation(layer_types: List[str], num_hidden_layers: Opt...

FILE: paddleformers/transformers/context_parallel_utils.py
  function split_inputs_sequence_dim_load_balance (line 33) | def split_inputs_sequence_dim_load_balance(inputs, rank=None, degree=None):
  function auto_split_sequence_dim_load_balance (line 68) | def auto_split_sequence_dim_load_balance(inputs):

FILE: paddleformers/transformers/contrastive_loss.py
  class SimpleContrastiveLoss (line 21) | class SimpleContrastiveLoss(nn.Layer):
    method __init__ (line 22) | def __init__(self, embedding_temperature: float = 0.02):
    method forward (line 27) | def forward(self, q_reps, p_reps):
  class MatryoshkaContrastiveLoss (line 41) | class MatryoshkaContrastiveLoss(nn.Layer):
    method __init__ (line 42) | def __init__(self, embedding_temperature: float = 0.02, embedding_matr...
    method forward (line 51) | def forward(self, q_reps, p_reps):
  class SimpleInfclLoss (line 68) | class SimpleInfclLoss(nn.Layer):
    method __init__ (line 69) | def __init__(self, inf_cl_head_dim=64):
    method forward (line 79) | def forward(self, q_reps, p_reps):
  class MatryoshkaInfclLoss (line 103) | class MatryoshkaInfclLoss(nn.Layer):
    method __init__ (line 104) | def __init__(self, embedding_matryoshka_dims: Optional[List[int]] = No...
    method forward (line 120) | def forward(self, q_reps, p_reps):

FILE: paddleformers/transformers/conversion_utils.py
  function add_quant_mapping (line 62) | def add_quant_mapping(name_action_mappings, quantization_config):
  function tensor_summary (line 98) | def tensor_summary(tensor: Union[str, Tensor, PytorchTensor, tuple, list...
  function compare_model_weights (line 134) | def compare_model_weights(first_state_dict: Dict[str, ndarray], second_s...
  function state_dict_contains_prefix (line 153) | def state_dict_contains_prefix(state_dict: Dict[str, ndarray], prefix: s...
  function init_name_mappings (line 159) | def init_name_mappings(mappings: list[StateDictNameMapping]) -> list[Sta...
  class StateDictKeysChecker (line 177) | class StateDictKeysChecker:
    method __init__ (line 180) | def __init__(
    method change_base_downstream_mismatched_keys (line 200) | def change_base_downstream_mismatched_keys(self):
    method change_downstream_base_mismatched_keys (line 215) | def change_downstream_base_mismatched_keys(self):
    method change_diff_keys (line 232) | def change_diff_keys(self) -> List[str]:
    method get_unexpected_keys (line 263) | def get_unexpected_keys(self):
    method get_mismatched_keys (line 269) | def get_mismatched_keys(self):
    method get_diff_keys (line 275) | def get_diff_keys(self, return_all_diff: bool = False) -> List[str]:
  function naive_fuse_merge_tp (line 293) | def naive_fuse_merge_tp(weight_list, is_column=True, fuse_tensor_parts=2...
  function naive_fuse_split_tp (line 344) | def naive_fuse_split_tp(
  function normal_fuse_merge_tp (line 436) | def normal_fuse_merge_tp(weight_list, is_column=True):
  function normal_fuse_split_tp (line 467) | def normal_fuse_split_tp(weight, tensor_model_parallel_size, tensor_para...
  function tensor_parallel_qkv_to_naive_merged_qkv (line 564) | def tensor_parallel_qkv_to_naive_merged_qkv(weight, num_attention_heads):
  function naive_merged_qkv_to_tensor_parallel_qkv (line 580) | def naive_merged_qkv_to_tensor_parallel_qkv(weight, num_attention_heads):
  function splited_qkv_to_tensor_parallel_qkv (line 603) | def splited_qkv_to_tensor_parallel_qkv(weight_list, num_attention_heads):
  function fuse_param_func (line 617) | def fuse_param_func():
  function split_param_func (line 673) | def split_param_func():
  function split_or_fuse_func (line 725) | def split_or_fuse_func(is_fuse=True):
  function get_tensor_parallel_merge_func (line 729) | def get_tensor_parallel_merge_func(tensor_model_parallel_size, tensor_pa...
  function get_tensor_parallel_split_func (line 761) | def get_tensor_parallel_split_func(tensor_model_parallel_size, tensor_pa...
  function split_or_merge_func (line 801) | def split_or_merge_func(is_split, tensor_model_parallel_size, tensor_par...
  class StateDictNameMapping (line 808) | class StateDictNameMapping:
    method __post_init__ (line 819) | def __post_init__(self):
    method should_transpose (line 822) | def should_transpose(self) -> bool:
    method should_merge_last_two_dim (line 825) | def should_merge_last_two_dim(self) -> bool:
    method run (line 829) | def run(self, state_dict: dict[str, ndarray], name: str) -> ndarray:
    method matched (line 858) | def matched(self, text: str) -> bool:
  class TensorInfoSaver (line 874) | class TensorInfoSaver:
    method __init__ (line 875) | def __init__(self) -> None:
    method add (line 878) | def add(self, state_dict_key: str, key: str, values: Union[float, ndar...
    method summary (line 894) | def summary(self, output_path: Optional[str] = None):
    method summary_to_excel (line 906) | def summary_to_excel(self, file: str):
    method summary_to_terminal (line 921) | def summary_to_terminal(self):
    method clear (line 928) | def clear(self):
  class LogitHooker (line 933) | class LogitHooker:
    method __init__ (line 936) | def __init__(self, mappings: List[StateDictNameMapping], tensor_info_s...
    method _paddle_hooks (line 946) | def _paddle_hooks(self, layer: Layer, inputs: Tuple[Tensor], outputs: ...
    method _pytorch_hooks (line 960) | def _pytorch_hooks(
    method register_paddle_model_hooks (line 983) | def register_paddle_model_hooks(self, model: Layer):
    method register_pytorch_model_hooks (line 1018) | def register_pytorch_model_hooks(self, model: Module):
    method summary (line 1053) | def summary(self):
  class LogitComparer (line 1058) | class LogitComparer:
    method __init__ (line 1076) | def __init__(self, input_dir: str) -> None:
    method get_paddle_pytorch_model_classes (line 1079) | def get_paddle_pytorch_model_classes(self) -> Tuple[object, object]:
    method get_inputs (line 1089) | def get_inputs(self):
    method resolve_paddle_output_logits (line 1095) | def resolve_paddle_output_logits(self, paddle_outputs: Tuple[Tensor]):
    method resolve_pytorch_output_logits (line 1108) | def resolve_pytorch_output_logits(self, pytorch_outputs: Module):
    method get_model_state_dict (line 1117) | def get_model_state_dict(model: Union[Layer, Module], copy: bool = Fal...
    method compare_model_state_dicts (line 1134) | def compare_model_state_dicts(
    method compare_logits (line 1175) | def compare_logits(self) -> bool:
    method on_converted (line 1236) | def on_converted(self):
  class ConversionMixin (line 1265) | class ConversionMixin:
    method convert_transpose_selected_weights (line 1270) | def convert_transpose_selected_weights(state_dict: dict, transpose_wei...
    method get_tensor_parallel_convert_actions (line 1293) | def get_tensor_parallel_convert_actions(
    method convert_tensor_parallel (line 1314) | def convert_tensor_parallel(
    method merge_tensor_parallel (line 1350) | def merge_tensor_parallel(cls, state_dict, config) -> None:
    method _get_tensor_parallel_mappings (line 1397) | def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_sp...
    method _resolve_prefix_keys (line 1412) | def _resolve_prefix_keys(state_keys_base, state_keys_real, ignore_erro...
    method convert_fuse_and_split (line 1442) | def convert_fuse_and_split(cls, config: PretrainedConfig, state_dict, ...
    method get_fuse_or_split_param_convert_actions (line 1487) | def get_fuse_or_split_param_convert_actions(
    method _get_fuse_or_split_param_mappings (line 1523) | def _get_fuse_or_split_param_mappings(cls, config: PretrainedConfig, i...
    method _resolve_prefix_keys_for_fuse_and_split (line 1542) | def _resolve_prefix_keys_for_fuse_and_split(state_keys_base, state_key...

FILE: paddleformers/transformers/deepseek_v3/configuration.py
  class DeepseekV3Config (line 24) | class DeepseekV3Config(PretrainedConfig):
    method __init__ (line 136) | def __init__(

FILE: paddleformers/transformers/deepseek_v3/mfu_utils.py
  class DeepSeekProjection (line 18) | class DeepSeekProjection:
    method __init__ (line 19) | def __init__(self, model_config, train_options=None):
    method get_num_params (line 66) | def get_num_params(self, include_embedding: bool = True) -> tuple[int,...
    method get_num_flop_fwd (line 132) | def get_num_flop_fwd(self, batch_size: int) -> int:
    method get_num_flop_per_token (line 184) | def get_num_flop_per_token(self):
    method _get_num_flop_QK_fwd (line 190) | def _get_num_flop_QK_fwd(self, batch_size: int) -> int:
    method get_num_flop_bwd (line 198) | def get_num_flop_bwd(self, batch_size: int) -> int:

FILE: paddleformers/transformers/deepseek_v3/modeling.py
  function scaled_dot_product_attention (line 80) | def scaled_dot_product_attention(
  function yarn_get_mscale (line 134) | def yarn_get_mscale(scale, mscale=1):
  class DeepseekV3YarnRotaryEmbedding (line 140) | class DeepseekV3YarnRotaryEmbedding(nn.Layer):
    method __init__ (line 141) | def __init__(self, config: DeepseekV3Config, device=None):
    method compute_default_rope_parameters (line 158) | def compute_default_rope_parameters(
    method forward (line 183) | def forward(self, x, position_ids):
  function rotate_half (line 199) | def rotate_half(x):
  function apply_rotary_pos_emb (line 206) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion...
  class FakeGate (line 250) | class FakeGate(paddle.autograd.PyLayer):
    method forward (line 252) | def forward(ctx, hidden_states, weight):
    method backward (line 264) | def backward(ctx, grad_output):
  class MoEGate (line 268) | class MoEGate(PretrainedMoEGate):
    method __init__ (line 269) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
    method forward (line 297) | def forward(self, hidden_states):
  class AddAuxiliaryLoss (line 320) | class AddAuxiliaryLoss(paddle.autograd.PyLayer):
    method forward (line 327) | def forward(ctx, x, loss):
    method backward (line 334) | def backward(ctx, grad_output):
  class DeepseekV3TopkRouter (line 341) | class DeepseekV3TopkRouter(nn.Layer):
    method __init__ (line 342) | def __init__(self, config):
    method get_topk_indices (line 360) | def get_topk_indices(self, scores):
    method forward (line 379) | def forward(self, hidden_states):
  class DeepseekV3NaiveMoe (line 394) | class DeepseekV3NaiveMoe(nn.Layer):
    method __init__ (line 395) | def __init__(self, config):
    method sharded_state_dict (line 413) | def sharded_state_dict(
    method forward (line 428) | def forward(
  class DeepseekV3MoE (line 456) | class DeepseekV3MoE(nn.Layer):
    method __init__ (line 461) | def __init__(self, config):
    method moe (line 487) | def moe(self, hidden_states: paddle.Tensor, topk_indices: paddle.Tenso...
    method forward (line 512) | def forward(self, hidden_states):
  class DeepseekV3MoEFlexToken (line 526) | class DeepseekV3MoEFlexToken(MoEFlexTokenLayer):
    method __init__ (line 531) | def __init__(self, config: DeepseekV3Config):
    method forward (line 575) | def forward(self, hidden_states):
  class DeepseekV3Attention (line 587) | class DeepseekV3Attention(nn.Layer):
    method __init__ (line 590) | def __init__(self, config: DeepseekV3Config, layer_idx: int):
    method _shape (line 713) | def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
    method forward (line 716) | def forward(
  class DeepseekV3DecoderLayer (line 849) | class DeepseekV3DecoderLayer(nn.Layer):
    method __init__ (line 850) | def __init__(self, config: DeepseekV3Config, layer_idx: int):
    method subbatch_recompute_forward (line 890) | def subbatch_recompute_forward(
    method attn (line 949) | def attn(
    method post_process (line 1020) | def post_process(
    method forward (line 1040) | def forward(
  class DeepseekV3MTPLayer (line 1077) | class DeepseekV3MTPLayer(DeepseekV3DecoderLayer):
    method __init__ (line 1078) | def __init__(
    method subbatch_recompute_forward (line 1101) | def subbatch_recompute_forward(
    method forward (line 1138) | def forward(
  class DeepseekV3PretrainedModel (line 1176) | class DeepseekV3PretrainedModel(PretrainedModel):
    method _gen_aoa_config (line 1195) | def _gen_aoa_config(cls, config: DeepseekV3Config):
    method _gen_inv_aoa_config (line 1283) | def _gen_inv_aoa_config(cls, config: DeepseekV3Config):
  class DeepseekV3Model (line 1382) | class DeepseekV3Model(DeepseekV3PretrainedModel):
    method __init__ (line 1390) | def __init__(self, config: DeepseekV3Config):
    method _prepare_decoder_attention_mask (line 1420) | def _prepare_decoder_attention_mask(attention_mask, input_shape, past_...
    method recompute_training_full (line 1448) | def recompute_training_full(
    method forward (line 1481) | def forward(
  class DeepseekV3PretrainingCriterion (line 1713) | class DeepseekV3PretrainingCriterion(nn.Layer):
    method __init__ (line 1719) | def __init__(self, config: DeepseekV3Config, **kwargs):
    method forward (line 1730) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non...
  class DeepseekV3ForCausalLM (line 1834) | class DeepseekV3ForCausalLM(DeepseekV3PretrainedModel):
    method __init__ (line 1837) | def __init__(self, config: DeepseekV3Config):
    method get_input_embeddings (line 1845) | def get_input_embeddings(self):
    method set_input_embeddings (line 1848) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 1851) | def get_output_embeddings(self):
    method set_output_embeddings (line 1854) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 1857) | def set_decoder(self, decoder):
    method get_decoder (line 1860) | def get_decoder(self):
    method forward (line 1863) | def forward(
    method prepare_inputs_for_generation (line 1985) | def prepare_inputs_for_generation(
    method _get_model_inputs_spec (line 2010) | def _get_model_inputs_spec(self, dtype: str):
    method _reorder_cache (line 2018) | def _reorder_cache(past_key_values, beam_idx):
  class DeepseekV3ForSequenceClassification (line 2025) | class DeepseekV3ForSequenceClassification(DeepseekV3PretrainedModel):
    method __init__ (line 2026) | def __init__(self, config):
    method get_input_embeddings (line 2035) | def get_input_embeddings(self):
    method set_input_embeddings (line 2038) | def set_input_embeddings(self, value):
    method forward (line 2041) | def forward(
  class DeepseekV3MTPLayerPipe (line 2128) | class DeepseekV3MTPLayerPipe(DeepseekV3MTPLayer):
    method forward (line 2129) | def forward(self, args):
  class DeepseekV3EmbeddingPipe (line 2219) | class DeepseekV3EmbeddingPipe(EmbeddingPipe):
    method __init__ (line 2220) | def __init__(self, config, embed_cls=None, rotary_emb_cls=None):
    method forward (line 2224) | def forward(self, args):
  class DeepseekV3DecoderLayerPipe (line 2305) | class DeepseekV3DecoderLayerPipe(DeepseekV3DecoderLayer):
    method forward (line 2306) | def forward(self, args):
  class DeepseekV3LMHeadPipe (line 2377) | class DeepseekV3LMHeadPipe(GeneralLMHead):
    method forward (line 2378) | def forward(self, args):
  class DeepseekV3PretrainingCriterionPipe (line 2390) | class DeepseekV3PretrainingCriterionPipe(DeepseekV3PretrainingCriterion):
    method forward (line 2391) | def forward(self, logits, labels):
  class DeepseekV3RMSNormLayerPipe (line 2404) | class DeepseekV3RMSNormLayerPipe(RMSNorm):
    method __init__ (line 2405) | def __init__(self, *args, **kwargs):
    method forward (line 2410) | def forward(self, args):
  class DeepseekV3ForCausalLMPipe (line 2427) | class DeepseekV3ForCausalLMPipe(GeneralModelForCausalLMPipe):

FILE: paddleformers/transformers/dpo_criterion.py
  class DPOCriterion (line 32) | class DPOCriterion(nn.Layer):
    method __init__ (line 35) | def __init__(self, config, dpo_config=None, use_infohub=False, ignore_...
    method dpo_loss (line 51) | def dpo_loss(self, policy_chosen_logps, policy_rejected_logps, referen...
    method dpo_logps (line 128) | def dpo_logps(
    method forward (line 279) | def forward(
  class AutoDPOCriterion (line 318) | class AutoDPOCriterion(DPOCriterion):
    method __init__ (line 319) | def __init__(self, config, dpo_config=None, use_infohub=False, ignore_...
    method forward (line 323) | def forward(
    method dpo_logps (line 341) | def dpo_logps(

FILE: paddleformers/transformers/embedding_utils.py
  function dist_gather_tensor_with_gradient (line 19) | def dist_gather_tensor_with_gradient(tensor):

FILE: paddleformers/transformers/ernie4_5/configuration.py
  class Ernie4_5Config (line 20) | class Ernie4_5Config(PretrainedConfig):
    method __init__ (line 30) | def __init__(

FILE: paddleformers/transformers/ernie4_5/modeling.py
  function rotate_half (line 50) | def rotate_half(x):
  function apply_rotary_pos_emb (line 57) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
  function apply_fused_rope (line 94) | def apply_fused_rope(query_states, key_states, rope_theta):
  class Ernie4_5RotaryEmbedding (line 113) | class Ernie4_5RotaryEmbedding(nn.Layer):
    method __init__ (line 114) | def __init__(self, config):
    method compute_default_rope_parameters (line 132) | def compute_default_rope_parameters(
    method forward (line 162) | def forward(self, x, position_ids):
  class Ernie4_5Attention (line 188) | class Ernie4_5Attention(nn.Layer):
    method __init__ (line 191) | def __init__(self, config, layer_idx=0):
    method forward (line 245) | def forward(
  class Ernie4_5DecoderLayer (line 330) | class Ernie4_5DecoderLayer(nn.Layer):
    method __init__ (line 337) | def __init__(self, config, layer_idx):
    method forward (line 376) | def forward(
  class Ernie4_5PretrainedModel (line 442) | class Ernie4_5PretrainedModel(PretrainedModel):
    method _gen_aoa_config (line 450) | def _gen_aoa_config(cls, config: Ernie4_5Config):
    method _gen_inv_aoa_config (line 484) | def _gen_inv_aoa_config(cls, config: Ernie4_5Config):
  class Ernie4_5Model (line 525) | class Ernie4_5Model(Ernie4_5PretrainedModel):
    method __init__ (line 528) | def __init__(self, config: Ernie4_5Config):
    method recompute_training (line 555) | def recompute_training(
    method forward (line 603) | def forward(
  class Ernie4_5ForCausalLM (line 763) | class Ernie4_5ForCausalLM(Ernie4_5PretrainedModel):
    method __init__ (line 768) | def __init__(self, config):
    method prepare_attention_mask_for_generation (line 788) | def prepare_attention_mask_for_generation(self, input_ids, pad_token_i...
    method forward (line 794) | def forward(
  class Ernie4_5ForCausalLMPipe (line 879) | class Ernie4_5ForCausalLMPipe(GeneralModelForCausalLMPipe):

FILE: paddleformers/transformers/ernie4_5_moe/configuration.py
  class Ernie4_5_MoeConfig (line 25) | class Ernie4_5_MoeConfig(PretrainedConfig):
    method __init__ (line 35) | def __init__(
    method to_json_string (line 284) | def to_json_string(self, use_diff: bool = True, saving_file=False) -> ...

FILE: paddleformers/transformers/ernie4_5_moe/modeling.py
  function mtp_hidden_states_set_zero (line 62) | def mtp_hidden_states_set_zero(hidden_states, inbatch_pack_offset):
  class Ernie4_5_MoeRotaryEmbedding (line 91) | class Ernie4_5_MoeRotaryEmbedding(nn.Layer):
    method __init__ (line 92) | def __init__(self, config):
    method compute_default_rope_parameters (line 110) | def compute_default_rope_parameters(
    method forward (line 135) | def forward(self, x, position_ids):
  class Ernie4_5_MoeMLP (line 161) | class Ernie4_5_MoeMLP(Ernie4_5MLP):
    method __init__ (line 164) | def __init__(self, config, hidden_size, moe_intermediate_size, layer_i...
    method forward (line 184) | def forward(self, x):
  class FakeMoERouterLoss (line 213) | class FakeMoERouterLoss(PyLayer):
    method forward (line 221) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss):
    method backward (line 240) | def backward(ctx, out_grad):
  class Ernie4_5_MoeSparseMoeBlock (line 259) | class Ernie4_5_MoeSparseMoeBlock(MOEAllGatherLayerV2):
    method __init__ (line 260) | def __init__(self, config, layer_idx):
  class Ernie4_5_MoeDecoderLayer (line 340) | class Ernie4_5_MoeDecoderLayer(nn.Layer):
    method __init__ (line 347) | def __init__(self, config, layer_idx):
    method forward (line 410) | def forward(
  class Ernie4_5_MoePretrainedModel (line 503) | class Ernie4_5_MoePretrainedModel(PretrainedModel):
    method _gen_aoa_config (line 522) | def _gen_aoa_config(cls, config: Ernie4_5_MoeConfig):
    method _gen_inv_aoa_config (line 571) | def _gen_inv_aoa_config(cls, config: Ernie4_5_MoeConfig):
  class Ernie4_5_MoeModel (line 649) | class Ernie4_5_MoeModel(Ernie4_5_MoePretrainedModel):
    method __init__ (line 652) | def __init__(self, config: Ernie4_5_MoeConfig):
    method recompute_training (line 746) | def recompute_training(
    method forward (line 793) | def forward(
  class Ernie4_5_MoeForCausalLM (line 1104) | class Ernie4_5_MoeForCausalLM(Ernie4_5_MoePretrainedModel):
    method __init__ (line 1109) | def __init__(self, config):
    method prepare_attention_mask_for_generation (line 1129) | def prepare_attention_mask_for_generation(self, input_ids, pad_token_i...
    method forward (line 1135) | def forward(
  class Ernie4_5_MoeForCausalLMPipe (line 1225) | class Ernie4_5_MoeForCausalLMPipe(GeneralModelForCausalLMPipe):

FILE: paddleformers/transformers/ernie4_5_moe_vl/image_processor.py
  function smart_resize (line 50) | def smart_resize(
  function is_scaled_image (line 100) | def is_scaled_image(image: np.ndarray) -> bool:
  function make_batched_images (line 111) | def make_batched_images(images) -> List[List[ImageInput]]:
  function make_batched_videos (line 134) | def make_batched_videos(videos) -> List[VideoInput]:
  class Ernie4_5_VLImageProcessor (line 151) | class Ernie4_5_VLImageProcessor(BaseImageProcessor):
    method __init__ (line 192) | def __init__(
    method set_pixels (line 226) | def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
    method get_smarted_resize (line 239) | def get_smarted_resize(self, height, width, min_pixels=None, max_pixel...
    method _preprocess (line 255) | def _preprocess(
    method preprocess (line 406) | def preprocess(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/comm_utils.py
  function all_gather_varlen (line 22) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py
  class Ernie4_5_Config (line 52) | class Ernie4_5_Config(PretrainedConfig):
    method __init__ (line 65) | def __init__(
  class Ernie4_5_MoeConfig (line 226) | class Ernie4_5_MoeConfig(Ernie4_5_Config):
    method __init__ (line 249) | def __init__(
    method multimodel_experts (line 397) | def multimodel_experts(self) -> bool:
    method use_moe (line 402) | def use_moe(self) -> bool:
    method to_json_string (line 411) | def to_json_string(self, use_diff: bool = True, saving_file=False) -> ...
  class Ernie4_5_VLMoeConfig (line 456) | class Ernie4_5_VLMoeConfig(Ernie4_5_MoeConfig):
    method __init__ (line 498) | def __init__(
    method multimodel_experts (line 553) | def multimodel_experts(self) -> bool:
    method use_moe (line 558) | def use_moe(self) -> bool:
    method to_dict (line 567) | def to_dict(self, saving_file=False):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/dfnrope/activation.py
  class NewGELUActivation (line 24) | class NewGELUActivation(nn.Layer):
    method forward (line 30) | def forward(self, input: Tensor) -> Tensor:
  class GELUActivation (line 43) | class GELUActivation(nn.Layer):
    method __init__ (line 51) | def __init__(self, use_gelu_python: bool = False):
    method _gelu_python (line 62) | def _gelu_python(self, input: Tensor) -> Tensor:
    method forward (line 72) | def forward(self, input: Tensor) -> Tensor:
  class FastGELUActivation (line 83) | class FastGELUActivation(nn.Layer):
    method forward (line 88) | def forward(self, input: Tensor) -> Tensor:
  class QuickGELUActivation (line 99) | class QuickGELUActivation(nn.Layer):
    method forward (line 104) | def forward(self, input: Tensor) -> Tensor:
  class ClippedGELUActivation (line 115) | class ClippedGELUActivation(nn.Layer):
    method __init__ (line 128) | def __init__(self, min: float, max: float):
    method forward (line 136) | def forward(self, x: Tensor) -> Tensor:
  class SiLUActivation (line 147) | class SiLUActivation(nn.Layer):
    method forward (line 156) | def forward(self, input: Tensor) -> Tensor:
  class MishActivation (line 167) | class MishActivation(nn.Layer):
    method forward (line 173) | def forward(self, input: Tensor) -> Tensor:
  class LinearActivation (line 184) | class LinearActivation(nn.Layer):
    method forward (line 189) | def forward(self, input: Tensor) -> Tensor:
  class ClassInstantier (line 200) | class ClassInstantier(OrderedDict):
    method __getitem__ (line 203) | def __getitem__(self, key):
  function get_activation (line 229) | def get_activation(activation_string):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/dfnrope/configuration.py
  class DFNRopeVisionTransformerConfig (line 24) | class DFNRopeVisionTransformerConfig(PretrainedConfig):
    method __init__ (line 33) | def __init__(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/dfnrope/modeling.py
  class _AllToAll (line 37) | class _AllToAll(paddle.autograd.PyLayer):
    method forward (line 39) | def forward(
    method backward (line 84) | def backward(ctx, *grad_output):
  function rotate_half (line 97) | def rotate_half(x):
  function apply_rotary_pos_emb_vision (line 104) | def apply_rotary_pos_emb_vision(tensor: paddle.Tensor, freqs: paddle.Ten...
  function qkv_reshard_head (line 126) | def qkv_reshard_head(tensor, group):
  class VisionFlashAttention2 (line 150) | class VisionFlashAttention2(nn.Layer):
    method __init__ (line 153) | def __init__(self, dim: int, num_heads: int = 16) -> None:
    method forward (line 165) | def forward(
  class PatchEmbed (line 212) | class PatchEmbed(nn.Layer):
    method __init__ (line 215) | def __init__(
    method forward (line 233) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
  class VisionMlp (line 248) | class VisionMlp(nn.Layer):
    method __init__ (line 251) | def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
    method forward (line 257) | def forward(self, x) -> paddle.Tensor:
  class VisionRotaryEmbedding (line 268) | class VisionRotaryEmbedding(nn.Layer):
    method __init__ (line 271) | def __init__(self, dim: int, theta: float = 10000.0) -> None:
    method forward (line 280) | def forward(self, seqlen: int) -> paddle.Tensor:
  class DFNRopeVisionBlock (line 293) | class DFNRopeVisionBlock(nn.Layer):
    method __init__ (line 296) | def __init__(self, config, attn_implementation: str = "sdpa") -> None:
    method forward (line 315) | def forward(self, hidden_states, startend_row_indices, rotary_pos_emb,...
  class DFNRopeVisionTransformerPretrainedModel (line 335) | class DFNRopeVisionTransformerPretrainedModel(PretrainedModel):
    method __init__ (line 340) | def __init__(self, config) -> None:
    method get_dtype (line 366) | def get_dtype(self) -> paddle.dtype:
    method rot_pos_emb (line 373) | def rot_pos_emb(self, grid_thw, num_pad=0):
    method forward (line 419) | def forward(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tenso...
    method extract_feature (line 471) | def extract_feature(self, images, grid_thw):
    method _extract_feature (line 511) | def _extract_feature(self, images, grid_thw, num_pad=0):
    method _get_tensor_parallel_mappings (line 519) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method set_state_dict (line 525) | def set_state_dict(self, state_dict, *args, **kwargs):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/dfnrope/modeling_pp.py
  class DFNRopeVisionTransformerPipe (line 31) | class DFNRopeVisionTransformerPipe(DFNRopeVisionTransformerPretrainedMod...
    method __init__ (line 36) | def __init__(self, config, use_full_recompute=False):
    method extract_feature (line 53) | def extract_feature(self, images, grid_thw, second_fwd=False):
    method _extract_feature (line 99) | def _extract_feature(self, images, grid_thw, num_pad=0):
    method forward (line 106) | def forward(self, args):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/distributed/__init__.py
  function parallel_matmul (line 73) | def parallel_matmul(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/distributed/common_dist_utils.py
  function get_hcg (line 68) | def get_hcg():
  function _parallel_matmul (line 75) | def _parallel_matmul(
  function scatter_axis (line 130) | def scatter_axis(input, group=None, axis=0):
  function mp_slice (line 162) | def mp_slice(x, indices=None, group=None, axis=0):
  function all_gather_varlen (line 191) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True):
  class ReduceScatterGroupOp (line 226) | class ReduceScatterGroupOp(PyLayer):
    method forward (line 232) | def forward(ctx, input, group=None):
    method backward (line 247) | def backward(ctx, grad):
  class AllGatherGroupOp (line 258) | class AllGatherGroupOp(PyLayer):
    method forward (line 264) | def forward(ctx, input, group=None):
    method backward (line 279) | def backward(ctx, grad):
  class RRColumnSequenceParallelLinear (line 290) | class RRColumnSequenceParallelLinear(ColumnSequenceParallelLinear):
    method __init__ (line 295) | def __init__(
    method forward (line 346) | def forward(self, x):
  class RRRowSequenceParallelLinear (line 378) | class RRRowSequenceParallelLinear(RowSequenceParallelLinear):
    method __init__ (line 383) | def __init__(
    method forward (line 425) | def forward(self, x):
  class AllGatherVarlenOp (line 468) | class AllGatherVarlenOp(PyLayer):
    method forward (line 480) | def forward(ctx, input):
    method backward (line 524) | def backward(ctx, grad):
  function sequence_parallel_sparse_mask_labels (line 547) | def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100):
  function get_async_loader (line 592) | def get_async_loader():
  function hack_offload_wait (line 606) | def hack_offload_wait(task):
  function hack_reload_wait (line 611) | def hack_reload_wait(task):
  function all_gather_group (line 616) | def all_gather_group(input, group=None, axis=0):
  function reduce_scatter_group (line 652) | def reduce_scatter_group(input, group=None):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/fusion_ops/__init__.py
  function fusion_flash_attention (line 40) | def fusion_flash_attention(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/fusion_ops/common_fusion_ops.py
  function _fusion_flash_attention (line 38) | def _fusion_flash_attention(
  function _gen_from_sparse_attn_mask_indices (line 136) | def _gen_from_sparse_attn_mask_indices(attn_mask_start_row_indices, dtype):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/fusion_ops/npu_fusion_ops.py
  function npu_combining (line 24) | def npu_combining(x, combine_weights, scatter_index, hard_gate=False):
  function npu_cal_aux_loss_func (line 40) | def npu_cal_aux_loss_func(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/longcontext_ops.py
  class MaxHeap (line 30) | class MaxHeap:
    method __init__ (line 36) | def __init__(self, data=None):
    method push (line 48) | def push(self, item):
    method pop (line 56) | def pop(self):
    method top (line 65) | def top(self):
    method is_empty (line 73) | def is_empty(self):
    method __len__ (line 79) | def __len__(self):
  function redistribute_tokens (line 86) | def redistribute_tokens(piles):
  class TensorBalanceByTokenType (line 141) | class TensorBalanceByTokenType(PyLayer):
    method forward (line 145) | def forward(
    method backward (line 306) | def backward(ctx, tensor_grad, token_type_ids_grad):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/loss/dpo.py
  class ErnieDPOCriterion (line 34) | class ErnieDPOCriterion(DPOCriterion):
    method dpo_logps (line 37) | def dpo_logps(
    method dpo_loss (line 183) | def dpo_loss(
    method forward (line 268) | def forward(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py
  function calc_lm_head_logits (line 61) | def calc_lm_head_logits(config, hidden_states, weight, bias, tensor_para...
  function subbatch (line 104) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar...
  class FusedDropoutImpl (line 167) | class FusedDropoutImpl(nn.Layer):
    method __init__ (line 184) | def __init__(self, prob, mode):
    method forward (line 197) | def forward(self, x, y):
  class RMSNorm (line 215) | class RMSNorm(nn.Layer):
    method __init__ (line 225) | def __init__(self, config):
    method forward (line 245) | def forward(self, hidden_states):
  class LayerNorm (line 271) | class LayerNorm(nn.LayerNorm):
    method __init__ (line 282) | def __init__(self, config):
  class RopeEmbedding (line 296) | class RopeEmbedding(nn.Layer):
    method __init__ (line 314) | def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_a...
    method forward (line 331) | def forward(self, seq_length, position_ids=None):
    method apply_rotary (line 359) | def apply_rotary(self, rp, q, k):
    method apply_rotary_3d (line 398) | def apply_rotary_3d(self, rp, q, k, position_ids):
    method forward_single (line 479) | def forward_single(self, position_ids):
    method apply_rotary_single (line 497) | def apply_rotary_single(x, rope_emb):
  class Ernie4_5_MLP (line 505) | class Ernie4_5_MLP(nn.Layer):
    method __init__ (line 510) | def __init__(self, config, layer_idx=0):
    method forward (line 578) | def forward(self, x):
  class Ernie4_5_Attention (line 602) | class Ernie4_5_Attention(nn.Layer):
    method __init__ (line 605) | def __init__(self, config, layer_idx=0):
    method set_attn_func (line 745) | def set_attn_func(self):
    method forward (line 758) | def forward(
    method _flash_attention_wrapper (line 849) | def _flash_attention_wrapper(
    method core_attn (line 885) | def core_attn(
    method rope_attn (line 966) | def rope_attn(
  class FusedHeadParallelCrossEntropy (line 1073) | class FusedHeadParallelCrossEntropy(PyLayer):
    method forward (line 1081) | def forward(
    method backward (line 1198) | def backward(ctx, loss_all_grad, labels_all_grad):
  class ErniePretrainingCriterion (line 1313) | class ErniePretrainingCriterion(paddle.nn.Layer):
    method __init__ (line 1316) | def __init__(self, config, return_tuple=True):
    method forward (line 1338) | def forward(self, prediction_scores, masked_lm_labels, loss_mask=None):
    method forward_impl_with_fused_head_loss_fn (line 1414) | def forward_impl_with_fused_head_loss_fn(
    method forward_impl_with_calc_logits (line 1470) | def forward_impl_with_calc_logits(
    method loss_impl (line 1497) | def loss_impl(self, prediction_scores, masked_lm_labels):
    method forward_impl (line 1511) | def forward_impl(self, prediction_scores, masked_lm_labels, loss_mask=...
  class Ernie4_5_LMHead (line 1579) | class Ernie4_5_LMHead(nn.Layer):
    method __init__ (line 1582) | def __init__(self, config):
    method forward (line 1635) | def forward(self, hidden_states, tensor_parallel_output=None):
    method sharded_state_dict (line 1679) | def sharded_state_dict(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py
  function mtp_hidden_states_set_zero (line 65) | def mtp_hidden_states_set_zero(hidden_states, inbatch_pack_offset):
  class BaseModelOutputWithPastAndCrossAttentions (line 95) | class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput):
  class CausalLMOutputWithCrossAttentions (line 120) | class CausalLMOutputWithCrossAttentions(_CausalLMOutput):
  function get_gate (line 154) | def get_gate(
  function _parse_moe_group (line 258) | def _parse_moe_group(
  class Ernie4_5_MoeMLP (line 314) | class Ernie4_5_MoeMLP(Ernie4_5_MLP):
    method __init__ (line 317) | def __init__(self, config, layer_idx=0):
    method forward (line 335) | def forward(self, x):
  class FakeMoERouterLoss (line 359) | class FakeMoERouterLoss(PyLayer):
    method forward (line 367) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss):
    method backward (line 386) | def backward(ctx, out_grad):
  class Ernie4_5_DecoderLayer (line 405) | class Ernie4_5_DecoderLayer(nn.Layer):
    method __init__ (line 412) | def __init__(self, config, layer_idx):
    method _init_shared_experts (line 535) | def _init_shared_experts(self):
    method _init_gate_and_experts (line 558) | def _init_gate_and_experts(self, layer_idx):
    method forward (line 622) | def forward(
    method model_parallel_dropout (line 766) | def model_parallel_dropout(self):
  class Ernie4_5_PretrainedModel (line 778) | class Ernie4_5_PretrainedModel(PretrainedModel):
    method _get_tensor_parallel_mappings (line 796) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
  class Ernie4_5_Model (line 924) | class Ernie4_5_Model(Ernie4_5_PretrainedModel):
    method __init__ (line 927) | def __init__(self, config: Ernie4_5_MoeConfig):
    method get_input_embeddings (line 1003) | def get_input_embeddings(self):
    method set_input_embeddings (line 1011) | def set_input_embeddings(self, value):
    method recompute_training (line 1020) | def recompute_training(
    method forward (line 1067) | def forward(
  class ErniePretrainingCriterion (line 1372) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase):
    method __init__ (line 1375) | def __init__(self, config, return_tuple=True):
    method forward (line 1396) | def forward(
  class Ernie4_5_MoeForCausalLM (line 1464) | class Ernie4_5_MoeForCausalLM(Ernie4_5_PretrainedModel):
    method __init__ (line 1469) | def __init__(self, config):
    method set_state_dict (line 1505) | def set_state_dict(self, state_dict, *args, **kwargs):
    method get_input_embeddings (line 1515) | def get_input_embeddings(self):
    method set_input_embeddings (line 1519) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 1523) | def get_output_embeddings(self):
    method set_output_embeddings (line 1527) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 1531) | def set_decoder(self, decoder):
    method get_decoder (line 1535) | def get_decoder(self):
    method prepare_attention_mask_for_generation (line 1543) | def prepare_attention_mask_for_generation(self, input_ids, pad_token_i...
    method prepare_inputs_for_generation (line 1547) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 1611) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method forward (line 1665) | def forward(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py
  function parse_args (line 28) | def parse_args(args, mtp_enable=False):
  function get_pp_vp_split_layers (line 92) | def get_pp_vp_split_layers(config, skip_recompute_num=-1):
  function create_skip_config_for_refined_recompute (line 162) | def create_skip_config_for_refined_recompute(layer_idx, config):
  class Ernie4_5_EmbeddingPipe (line 210) | class Ernie4_5_EmbeddingPipe(nn.Layer):
    method __init__ (line 213) | def __init__(self, config):
    method embedding_weight (line 234) | def embedding_weight(self):
    method forward (line 243) | def forward(self, args):
  class EmptyLayer (line 326) | class EmptyLayer(nn.Layer):
    method __init__ (line 331) | def __init__(self):
    method forward (line 341) | def forward(self, x):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl.py
  class TokenType (line 64) | class TokenType:
  function monkey_patch_param_hook (line 76) | def monkey_patch_param_hook(param):
  function get_backbone_lm_param_regex (line 124) | def get_backbone_lm_param_regex(config):
  function create_freeze_hook (line 156) | def create_freeze_hook(name, param, factor=0.0):
  function create_partial_freeze_hook (line 168) | def create_partial_freeze_hook(name, param, factor, index):
  class ModalityDetach (line 181) | class ModalityDetach(PyLayer):
    method forward (line 185) | def forward(
    method backward (line 211) | def backward(ctx, *last_hidden_grad):
  class VariableResolutionResamplerModel (line 218) | class VariableResolutionResamplerModel(nn.Layer):
    method __init__ (line 223) | def __init__(self, in_dim, out_dim, spatial_conv_size, temporal_conv_s...
    method spatial_conv_reshape (line 288) | def spatial_conv_reshape(self, x, spatial_conv_size):
    method forward (line 296) | def forward(self, x, image_mask, token_type_ids, image_type_ids, grid_...
    method _get_tensor_parallel_mappings (line 420) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
  class ErniePretrainingCriterion (line 434) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase):
    method __init__ (line 439) | def __init__(self, config):
    method forward (line 445) | def forward(
    method update_log (line 550) | def update_log(self, loss, token_type_ids_untouched):
  function calc_multimodal_logits (line 568) | def calc_multimodal_logits(
  class Ernie4_5_MoeVLHead (line 668) | class Ernie4_5_MoeVLHead(Ernie4_5_LMHead):
    method __init__ (line 671) | def __init__(self, config):
    method forward (line 684) | def forward(self, hidden_state, token_type_ids_labels, use_cache=False):
  class Ernie4_5_VLMoeForConditionalGeneration (line 728) | class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
    method __init__ (line 753) | def __init__(self, config: Ernie4_5_VLMoeConfig):
    method add_vision_model (line 787) | def add_vision_model(
    method add_image_preprocess (line 795) | def add_image_preprocess(self, preprocess):
    method _get_tensor_parallel_mappings (line 801) | def _get_tensor_parallel_mappings(cls, config, is_split=True):
    method _gen_aoa_config (line 831) | def _gen_aoa_config(cls, config):
    method _gen_inv_aoa_config (line 901) | def _gen_inv_aoa_config(cls, config):
    method _set_modality_param_mapping (line 970) | def _set_modality_param_mapping(self):
    method update_params_stat (line 989) | def update_params_stat(self, param_group, stop_gradient):
    method freeze_vision (line 1003) | def freeze_vision(self):
    method vision_forward (line 1012) | def vision_forward(
    method vision_mapping_forward (line 1038) | def vision_mapping_forward(
    method get_rope_index (line 1073) | def get_rope_index(
    method get_token_type_ids (line 1221) | def get_token_type_ids(
    method prepare_inputs_for_generation (line 1315) | def prepare_inputs_for_generation(
    method _post_init (line 1383) | def _post_init(self, original_init, *args, **kwargs):
    method forward (line 1396) | def forward(
    method _resolve_prefix_keys (line 1646) | def _resolve_prefix_keys(state_keys_base, state_keys_real, ignore_erro...

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py
  class PipelinePretrainedModel (line 75) | class PipelinePretrainedModel(PipelinePretrainedModelBase):
    method _set_pipeline_name_mapping (line 77) | def _set_pipeline_name_mapping(self, mappings=None):
  class ErniePretrainingCriterionPipe (line 143) | class ErniePretrainingCriterionPipe(ErniePretrainingCriterion):
    method __init__ (line 148) | def __init__(self, config):
    method forward (line 158) | def forward(self, logits, labels):
  function modality_detach (line 192) | def modality_detach(wrapped_class):
  function inbatch_pack_offset_to_attn_mask_start_row_indices (line 252) | def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offs...
  class ErnieMoELMHeadPipe (line 268) | class ErnieMoELMHeadPipe(Ernie4_5_MoeVLHead):
    method __init__ (line 273) | def __init__(self, config):
    method embedding_weight (line 277) | def embedding_weight(self):
    method forward (line 281) | def forward(self, args):
  class ErnieVLEmbeddingPipe (line 314) | class ErnieVLEmbeddingPipe(Ernie4_5_EmbeddingPipe):
    method __init__ (line 317) | def __init__(self, config, use_full_recompute=False):
    method forward (line 339) | def forward(self, args):
  class ErnieDecoderLayerPipe (line 498) | class ErnieDecoderLayerPipe(ErnieMoEDecoderLayer):
    method __init__ (line 505) | def __init__(self, config, layer_idx, use_full_recompute=False):
    method forward (line 513) | def forward(self, args):
  class LayerNormPipe (line 590) | class LayerNormPipe(LayerNorm):
    method __init__ (line 593) | def __init__(self, config):
    method forward (line 599) | def forward(self, args):
  class RMSNormPipe (line 608) | class RMSNormPipe(RMSNorm):
    method __init__ (line 611) | def __init__(self, config):
    method forward (line 616) | def forward(self, args):
  function multimodal_data_provider (line 624) | def multimodal_data_provider(
  function exchange_pp_imgs_with_thw (line 746) | def exchange_pp_imgs_with_thw(
  function get_len_and_offset (line 795) | def get_len_and_offset(input_len, group):
  class Ernie4_5_VLModel (line 807) | class Ernie4_5_VLModel(nn.Layer):
    method __init__ (line 810) | def __init__(self, config):
  class Ernie4_5_VLMoeForConditionalGenerationPipe (line 814) | class Ernie4_5_VLMoeForConditionalGenerationPipe(PipelinePretrainedModel...
    method _prepare_pipeline_inputs_func (line 829) | def _prepare_pipeline_inputs_func(self, data: Union[List, Dict]):
    method __init__ (line 1154) | def __init__(self, config, recompute=False):
    method add_vision_model (line 1298) | def add_vision_model(
    method add_image_preprocess (line 1305) | def add_image_preprocess(self, preprocess):
    method set_pp_need_data_degree (line 1310) | def set_pp_need_data_degree(self, p):
    method _set_modality_param_mapping (line 1321) | def _set_modality_param_mapping(self, use_stop_grad=True):
    method update_params_stat (line 1357) | def update_params_stat(self, param_group, stop_gradient):
    method freeze_vision (line 1371) | def freeze_vision(self):
    method state_dict (line 1381) | def state_dict(self, *args, **kwargs):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/moe/moe_all_gather_layer.py
  function allgather_async (line 51) | def allgather_async(input, group=None):
  function reduce_scatter_async (line 76) | def reduce_scatter_async(input, group=None):
  class AllGatherAsync (line 111) | class AllGatherAsync(PyLayer):
    method forward (line 117) | def forward(ctx, input, *fn_args, group=None, fn=None, is_first_fwd=Fa...
    method backward (line 141) | def backward(ctx, grad, *fn_out_grads):
  class ReshardCombineWeight (line 162) | class ReshardCombineWeight(PyLayer):
    method forward (line 168) | def forward(ctx, input, group=None):
    method backward (line 188) | def backward(ctx, grad):
  class AlltoAllSmart (line 205) | class AlltoAllSmart(paddle.autograd.PyLayer):
    method forward (line 211) | def forward(
    method backward (line 402) | def backward(
  class AlltoAllSmartXPU (line 483) | class AlltoAllSmartXPU(paddle.autograd.PyLayer):
    method forward (line 489) | def forward(
    method backward (line 679) | def backward(
  class MOEAllGatherLayerV2 (line 764) | class MOEAllGatherLayerV2(MOELayer):
    method __init__ (line 769) | def __init__(
    method forward (line 818) | def forward(
    method fused_gate_logits_process_fused (line 1063) | def fused_gate_logits_process_fused(self, gate_logits_lm, gate_logits_...
    method fused_gate_and_dispatch (line 1148) | def fused_gate_and_dispatch(self, input, token_type_ids=None, global_d...
    method forward_experts (line 1358) | def forward_experts(self, *dispatched_input):
    method calc_router_loss_and_logging (line 1441) | def calc_router_loss_and_logging(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/moe/moe_layer.py
  class MoEStatics (line 51) | class MoEStatics(nn.Layer):
    method __init__ (line 57) | def __init__(self, config, layer_idx):
  class GateCombine (line 93) | class GateCombine(PyLayer):
    method forward (line 99) | def forward(ctx, x, combine_weights, scatter_index):
    method backward (line 118) | def backward(ctx, grad_y, *_):
  function combining (line 138) | def combining(x, combine_weights, scatter_index, hard_gate=False):
  class AlltoAll (line 164) | class AlltoAll(PyLayer):
    method forward (line 170) | def forward(ctx, x, group, sync_op=True):
    method backward (line 194) | def backward(ctx, *dx):
  class AlltoAllAsync (line 207) | class AlltoAllAsync(PyLayer):
    method forward (line 213) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False):
    method backward (line 247) | def backward(ctx, dx_out, *fn_out_grads):
  function detach_and_requires_grad_ (line 277) | def detach_and_requires_grad_(*args):
  class FakeClone (line 294) | class FakeClone(paddle.autograd.PyLayer):
    method forward (line 300) | def forward(ctx, input):
    method backward (line 318) | def backward(ctx, grad_output):
  function manual_backward (line 331) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]):
  class MOELayer (line 382) | class MOELayer(nn.Layer):
    method __init__ (line 387) | def __init__(
    method forward_experts (line 468) | def forward_experts(self, dispatched_input):
    method fused_gate_logits_process (line 513) | def fused_gate_logits_process(self, gate_logits, token_type_ids=None, ...
    method gate_and_dispatch (line 577) | def gate_and_dispatch(self, input, token_type_ids=None):
    method _calc_router_loss (line 667) | def _calc_router_loss(
    method calc_router_loss_and_logging (line 719) | def calc_router_loss_and_logging(
    method combine_expert_output (line 815) | def combine_expert_output(self, expert_output, combine_weights, scatte...
    method forward_single_stage (line 835) | def forward_single_stage(self, dispatched_input, stage_id):
    method all2all_expert_overlap (line 849) | def all2all_expert_overlap(self, x, group):
    method forward (line 877) | def forward(

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/moe/topk_gate.py
  function masked_fill (line 36) | def masked_fill(x, mask, value):
  function compute_optimal_transport (line 53) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:...
  function cast_if_needed (line 84) | def cast_if_needed(x, dtype):
  class FusedGateDetachMatmul (line 98) | class FusedGateDetachMatmul(paddle.autograd.PyLayer):
    method forward (line 105) | def forward(ctx, x, w):
    method backward (line 122) | def backward(ctx, y_grad):
  function gate_detach_matmul (line 143) | def gate_detach_matmul(x, weight, use_fuse):
  class TopKGate (line 162) | class TopKGate(nn.Layer):
    method __init__ (line 167) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->...
    method _create_gate_parameter (line 273) | def _create_gate_parameter(self):
    method get_gate_weight (line 312) | def get_gate_weight(self, transform_weight):
    method forward (line 340) | def forward(
    method get_capacity (line 374) | def get_capacity(self, num_tokens, cap_factor=None):
    method _cal_aux_loss (line 400) | def _cal_aux_loss(
    method _cal_z_loss (line 499) | def _cal_z_loss(self, logits, loss_mask=None):
    method _cal_orthogonal_loss_opt_each_weight (line 520) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group):
    method _cal_orthogonal_loss (line 550) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/refined_recompute/utils.py
  function is_second_fwd (line 35) | def is_second_fwd():
  function set_second_fwd (line 43) | def set_second_fwd(value=True):
  class CustomSavedTensorsHooks (line 51) | class CustomSavedTensorsHooks:
    method __init__ (line 57) | def __init__(self, pack_hook, unpack_hook) -> None:
    method __enter__ (line 72) | def __enter__(self) -> None:
    method __exit__ (line 79) | def __exit__(self, *args: object) -> None:
  function create_skip_config_for_refined_recompute (line 91) | def create_skip_config_for_refined_recompute(layer_idx, config):
  class RefinedRcomputeQueue (line 146) | class RefinedRcomputeQueue:
    method __init__ (line 154) | def __init__(self):
    method update (line 160) | def update(self, queue: queue.Queue, queue_name="unknown"):
    method check (line 177) | def check(self):
  class _NoopSaveInputs (line 193) | class _NoopSaveInputs(paddle.autograd.PyLayer):
    method forward (line 200) | def forward(ctx, *args):
    method backward (line 208) | def backward(ctx, *args):
  class RefinedRecomputeFunction (line 213) | class RefinedRecomputeFunction:
    method __init__ (line 216) | def __init__(self):
    method post_init (line 222) | def post_init(self, function, function_name=None):
    method __call__ (line 235) | def __call__(self, function, *args, **kwargs):
    method forward (line 245) | def forward(self, *args, **kwargs):
    method _first_fwd (line 253) | def _first_fwd(self, *args, **kwargs):
    method _second_fwd (line 303) | def _second_fwd(self, *args, **kwargs):
    method parse_to_args (line 316) | def parse_to_args(self, *args, **kwargs):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/sequence_parallel_utils.py
  function contiguous (line 31) | def contiguous(self):
  function _md5sum (line 42) | def _md5sum(self):
  class AllGatherVarlenOpV2 (line 53) | class AllGatherVarlenOpV2(PyLayer):
    method forward (line 59) | def forward(ctx, input, indices, axis=0, group=None):
    method backward (line 67) | def backward(ctx, grad):
  class SliceVarlenOp (line 72) | class SliceVarlenOp(PyLayer):
    method forward (line 89) | def forward(
    method backward (line 102) | def backward(ctx, grad):
  class ScatterOp (line 107) | class ScatterOp(PyLayer):
    method forward (line 120) | def forward(ctx, input, axis=0, group=None):
    method backward (line 127) | def backward(ctx, grad):
  class GatherOp (line 135) | class GatherOp(PyLayer):
    method forward (line 143) | def forward(ctx, input, axis=0, group=None):
    method backward (line 150) | def backward(ctx, grad):
  class AllGatherOp (line 155) | class AllGatherOp(PyLayer):
    method forward (line 163) | def forward(ctx, input, group=None):
    method backward (line 171) | def backward(ctx, grad):
  function mark_as_sequence_parallel_parameter (line 183) | def mark_as_sequence_parallel_parameter(parameter):

FILE: paddleformers/transformers/ernie4_5_moe_vl/model/utils/misc.py
  class SmoothedValue (line 44) | class SmoothedValue:
    method __init__ (line 49) | def __init__(
    method update (line 58) | def update(self, value):
    method global_avg (line 74) | def global_avg(self):
    method reset (line 82) | def reset(self):
  class TrainingLogs (line 88) | class TrainingLogs:
    method __new__ (line 103) | def __new__(cls, *args, **kw):
    method __init__ (line 108) | def __init__(self):
    method set_trainer_interval (line 117) | def set_trainer_interval(self, trainer, logging_interval):
    method global_meters_keys (line 125) | def global_meters_keys(self):
    method global_meters_keys (line 130) | def global_meters_keys(self, lst):
    method enable_skip_zero (line 134) | def enable_skip_zero(self, keys=None):
    method update (line 146) | def update(self, **kwargs):
    method is_enabled (line 151) | def is_enabled(self):
    method __setitem__ (line 157) | def __setitem__(self, k, v):
    method __getitem__ (line 165) | def __getitem__(self, v):
    method __getattr__ (line 169) | def __getattr__(self, attr):
    method dict (line 187) | def dict(self, use_async=False):
    method reset (line 246) | def reset(self):
    method take_snapshot (line 252) | def take_snapshot(self):
    method restore_snapshot (line 256) | def restore_snapshot(self):

FILE: paddleformers/transformers/ernie4_5_moe_vl/processor.py
  class Ernie4_5_VLProcessor (line 39) | class Ernie4_5_VLProcessor(ProcessorMixin):
    method __init__ (line 70) | def __init__(
    method _build_token_type_mapping (line 121) | def _build_token_type_mapping(self) -> Dict[Any, int]:
    method _download_image (line 128) | def _download_image(
    method _download_video (line 144) | def _download_video(self, item: Dict):
    method process_vision_info (line 154) | def process_vision_info(self, messages: List[Dict[str, Any]]):
    method __call__ (line 174) | def __call__(
    method _add_special_token (line 232) | def _add_special_token(self, token: Union[str, int], outputs: Dict) ->...
    method _add_text (line 241) | def _add_text(self, text: str, outputs: Dict) -> None:
    method _add_image (line 252) | def _add_image(self, img: Image.Image, outputs: Dict) -> None:
    method render_frame_timestamp (line 287) | def render_frame_timestamp(self, frame, timestamp, font_rate=0.1):
    method _add_video (line 290) | def _add_video(self, pixel_stack, outputs: Dict) -> None:
    method _load_and_process_video (line 328) | def _load_and_process_video(self, url: str, item: Dict) -> List[Image....
    method _set_video_frame_args (line 360) | def _set_video_frame_args(self, video_frame_args, video_meta):
    method _compute_3d_positions (line 398) | def _compute_3d_positions(self, t: int, h: int, w: int, start_idx: int...
    method _pack_outputs (line 409) | def _pack_outputs(self, outs: Dict) -> Dict[str, Any]:
    method model_input_names (line 427) | def model_input_names(self):

FILE: paddleformers/transformers/ernie4_5_moe_vl/tokenizer.py
  class Ernie4_5_VLTokenizer (line 30) | class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
    method __init__ (line 43) | def __init__(
    method space_token (line 98) | def space_token(self):
    method space_token_id (line 103) | def space_token_id(self):
    method gend_token (line 108) | def gend_token(self):
    method gend_token_id (line 113) | def gend_token_id(self):
    method im_start_id (line 118) | def im_start_id(self):
    method im_end_id (line 123) | def im_end_id(self):
    method vocab_size (line 128) | def vocab_size(self):
    method get_vocab (line 132) | def get_vocab(self):
    method _tokenize (line 138) | def _tokenize(self, text):
    method _convert_token_to_id (line 142) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 146) | def _convert_id_to_token(self, id):
    method convert_tokens_to_string (line 150) | def convert_tokens_to_string(self, tokens):
    method prepare_for_model (line 167) | def prepare_for_model(self, *args, **kwargs):
    method save_vocabulary (line 174) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st...
    method _decode (line 205) | def _decode(self, *args, **kwargs):
    method _pad (line 219) | def _pad(

FILE: paddleformers/transformers/ernie4_5_moe_vl/vision_process.py
  function get_filename (line 43) | def get_filename(url=None):
  function file_download (line 62) | def file_download(url, download_dir, save_to_disk=False, retry=0, retry_...
  function get_downloadable (line 94) | def get_downloadable(url, download_dir=RAW_VIDEO_DIR, save_to_disk=False...
  function get_downloadable_image (line 113) | def get_downloadable_image(download_path, need_exif_info, retry_max_time...
  function read_video_decord (line 187) | def read_video_decord(video_path, save_to_disk):
  function get_frame_indices (line 205) | def get_frame_indices(
  function read_frames_decord (line 271) | def read_frames_decord(
  function render_single_image_with_timestamp (line 315) | def render_single_image_with_timestamp(image: Image, number: str, rate: ...
  function timestamp_converting (line 343) | def timestamp_converting(time_stamp_in_seconds):
  function render_frame_timestamp (line 365) | def render_frame_timestamp(frame, timestamp, font_rate=0.1):

FILE: paddleformers/transformers/feature_extraction_utils.py
  class BatchFeature (line 34) | class BatchFeature(UserDict):
    method __init__ (line 47) | def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type:...
    method __getitem__ (line 51) | def __getitem__(self, item: str):
    method __getattr__ (line 61) | def __getattr__(self, item: str):
    method __getstate__ (line 67) | def __getstate__(self):
    method __setstate__ (line 70) | def __setstate__(self, state):
    method keys (line 74) | def keys(self):
    method values (line 77) | def values(self):
    method items (line 80) | def items(self):
    method convert_to_tensors (line 83) | def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorTy...
    method to (line 145) | def to(self, *args, **kwargs) -> "BatchFeature":
  class FeatureExtractionMixin (line 202) | class FeatureExtractionMixin(PushToHubMixin):
    method __init__ (line 213) | def __init__(self, **kwargs):
    method _set_processor_class (line 225) | def _set_processor_class(self, processor_class: str):
    method from_pretrained (line 230) | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os....
    method save_pretrained (line 280) | def save_pretrained(self, save_directory: Union[str, os.PathLike], **k...
    method get_feature_extractor_dict (line 305) | def get_feature_extractor_dict(
    method from_dict (line 357) | def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs):
    method to_dict (line 392) | def to_dict(self, *args, **kwargs) -> Dict[str, Any]:
    method from_json_file (line 405) | def from_json_file(cls, json_file: Union[str, os.PathLike]):
    method to_json_string (line 423) | def to_json_string(self) -> str:
    method to_json_file (line 444) | def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    method __repr__ (line 455) | def __repr__(self):

FILE: paddleformers/transformers/fp8_utils.py
  function swiglu (line 26) | def swiglu(x, y=None):
  function get_sm_num (line 54) | def get_sm_num():
  function set_parameter_color (line 58) | def set_parameter_color(
  function extract_first_if_tuple (line 74) | def extract_first_if_tuple(x):
  function _get_fp8_weight_and_scale (line 78) | def _get_fp8_weight_and_scale(weight, stacked=False, transpose=False):
  function fused_stack_quant (line 93) | def fused_stack_quant(expert_weight_list, transpose=False):
  function weight_quant (line 107) | def weight_quant(weight, transpose=False):
  class FP8LinearFunctionBase (line 136) | class FP8LinearFunctionBase:
    method dequantize_fp8_to_fp32 (line 138) | def dequantize_fp8_to_fp32(fp8_tensor, scale):
    method padding (line 143) | def padding(x, axis):
    method padding_and_quant_input (line 157) | def padding_and_quant_input(tensor):
    method kitchen_gemm (line 179) | def kitchen_gemm(
    method compute_fp8_linear (line 210) | def compute_fp8_linear(
    method compute_expert_w_grad (line 280) | def compute_expert_w_grad(
    method common_fp8_mlp_bwd (line 339) | def common_fp8_mlp_bwd(
    method fp8_mlp_fwd (line 435) | def fp8_mlp_fwd(x, w1, w2):
    method fp8_mlp_fwd_norm_rc (line 456) | def fp8_mlp_fwd_norm_rc(x, norm_w, norm_eps, w1, w2):
    method fp8_mlp_bwd (line 464) | def fp8_mlp_bwd(do3, x, w1, w2, apply_backward_hook=False):
    method fp8_mlp_bwd_norm_rc (line 506) | def fp8_mlp_bwd_norm_rc(do3, x, norm_w, norm_eps, w1, w2):
  class FP8LinearFunction (line 530) | class FP8LinearFunction(paddle.autograd.PyLayer):
    method forward (line 532) | def forward(ctx, x, custom_map, keep_x=False):
    method backward (line 561) | def backward(ctx, dout):
  class FP8Linear (line 599) | class FP8Linear(paddle.nn.Layer):
    method __init__ (line 600) | def __init__(self, in_features: int, out_features: int, bias_attr: boo...
    method forward (line 610) | def forward(self, x):
  function cache_fp8_weight (line 614) | def cache_fp8_weight(weight, quant_transpose=None):
  class FP8KeepXLinear (line 654) | class FP8KeepXLinear(paddle.nn.Layer):
    method __init__ (line 655) | def __init__(self, in_features: int, out_features: int, bias_attr: boo...
    method fp8_quant_weight (line 666) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 669) | def forward(self, x):
  class FusedNormFP8MLPFunction (line 673) | class FusedNormFP8MLPFunction(paddle.autograd.PyLayer):
    method forward (line 675) | def forward(ctx, x, norm_w, w1, w2, norm_eps):
    method backward (line 703) | def backward(ctx, do3):
  class FP8MlpFunction (line 730) | class FP8MlpFunction(paddle.autograd.PyLayer):
    method forward (line 732) | def forward(ctx, x, w1, w2, recompute_fwd_gate_up):
    method backward (line 756) | def backward(ctx, do3):
  class FP8Mlp (line 792) | class FP8Mlp(paddle.nn.Layer):
    method __init__ (line 793) | def __init__(
    method fp8_quant_weight (line 828) | def fp8_quant_weight(self, quant_transpose=None):
    method forward (line 832) | def forward(self, x):
  function split_group_gemm (line 839) | def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, ...
  class FP8GroupGemmMlpFunctionNode (line 859) | class FP8GroupGemmMlpFunctionNode:
    method __init__ (line 860) | def __init__(
    method reset_statue (line 879) | def reset_statue(self):
    method clear_activation_tensors (line 885) | def clear_activation_tensors(self):
    method gen_m_indices (line 892) | def gen_m_indices(self, tokens_per_expert):
    method fwd_gate_up (line 899) | def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, m_i...
    method fwd_swiglu (line 947) | def fwd_swiglu(self, o1):
    method fwd_down (line 951) | def fwd_down(
    method bwd_dowm_input (line 997) | def bwd_dowm_input(self, expert_w2, unzipped_grad, o1, tokens_per_expe...
    method bwd_swiglu (line 1040) | def bwd_swiglu(self, o1, do2):
    method bwd_gate_up_input (line 1044) | def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, m_indic...
    method fused_transpose_split_quant (line 1082) | def fused_transpose_split_quant(self, x, scale, tokens_per_expert, pow...
    method bwd_down_weight (line 1088) | def bwd_down_weight(self, do3, o2, expert_w2, tokens_per_expert):
    method bwd_gate_up_weight (line 1122) | def bwd_gate_up_weight(
    method forward (line 1178) | def forward(self, hs_out, unzipped_probs, tokens_per_expert, m_indices...
    method backward (line 1221) | def backward(

FILE: paddleformers/transformers/fused_a2a.py
  function barrier_ep (line 28) | def barrier_ep(ep_group):
  function get_hidden_bytes (line 33) | def get_hidden_bytes(x: paddle.Tensor) -> int:
  function get_buffer (line 45) | def get_buffer(group: Group, hidden_bytes: int):
  function fused_dispatch_forward_func (line 77) | def fused_dispatch_forward_func(
  function fused_dispatch_backward_func (line 134) | def fused_dispatch_backward_func(
  function fused_combine_forward_func (line 159) | def fused_combine_forward_func(
  function fused_combine_backward_func (line 177) | def fused_combine_backward_func(
  class FusedDispatch (line 204) | class FusedDispatch(PyLayer):
    method forward (line 208) | def forward(ctx, x, token_indices, token_probs, num_experts, group, pr...
    method backward (line 221) | def backward(ctx, grad_output, grad_token_probs):
  class FusedCombine (line 226) | class FusedCombine(PyLayer):
    method forward (line 230) | def forward(ctx, x, group, states, previous_event=None):
    method backward (line 241) | def backward(ctx, grad_output):
  function fused_dispatch (line 248) | def fused_dispatch(x, token_indices, token_probs, num_experts, group: Gr...
  function fused_combine (line 264) | def fused_combine(x, group, handle, previous_event=None):
  class DispatchNode (line 285) | class DispatchNode:
    method __init__ (line 286) | def __init__(self, name="dispatch"):
    method reset_statue (line 289) | def reset_statue(self):
    method forward (line 292) | def forward(
    method backward (line 321) | def backward(
  class CombineNode (line 338) | class CombineNode:
    method __init__ (line 339) | def __init__(self, name="combine"):
    method reset_statue (line 342) | def reset_statue(self):
    method forward (line 345) | def forward(self, x, group, handle, previous_event=None, async_finish=...
    method backward (line 364) | def backward(self, grad_output, previous_event=None, async_finish=Fals...

FILE: paddleformers/transformers/gemma3_text/configuration.py
  class Gemma3TextConfig (line 27) | class Gemma3TextConfig(PretrainedConfig):
    method __init__ (line 151) | def __init__(
  class Gemma3Config (line 232) | class Gemma3Config(PretrainedConfig):
    method __init__ (line 285) | def __init__(

FILE: paddleformers/transformers/gemma3_text/modeling.py
  class Gemma3TextScaledWordEmbedding (line 45) | class Gemma3TextScaledWordEmbedding(nn.Embedding):
    method __init__ (line 50) | def __init__(self, config):
    method forward (line 61) | def forward(self, input_ids: paddle.Tensor):
  class Gemma3MLP (line 65) | class Gemma3MLP(BaseMLP):
    method __init__ (line 66) | def __init__(self, config: Gemma3TextConfig, fuse_up_gate=False):
  class Gemma3RMSNorm (line 71) | class Gemma3RMSNorm(nn.Layer):
    method __init__ (line 72) | def __init__(self, hidden_size: int, eps: float = 1e-6, input_is_paral...
    method _norm (line 84) | def _norm(self, x):
    method forward (line 91) | def forward(self, x):
    method enable_sequence_parallel (line 98) | def enable_sequence_parallel(self):
  class Gemma3RMSNormPipe (line 102) | class Gemma3RMSNormPipe(Gemma3RMSNorm):
    method __init__ (line 103) | def __init__(self, config):
    method forward (line 109) | def forward(self, x):
  class Gemma3RotaryEmbedding (line 115) | class Gemma3RotaryEmbedding(nn.Layer):
    method __init__ (line 116) | def __init__(self, config):
    method compute_default_rope_parameters (line 134) | def compute_default_rope_parameters(
    method forward (line 159) | def forward(self, x, position_ids):
  function rotate_half (line 175) | def rotate_half(x):
  function apply_rotary_pos_emb (line 182) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
  class Gemma3Attention (line 191) | class Gemma3Attention(nn.Layer):
    method __init__ (line 194) | def __init__(self, config: Gemma3TextConfig, layer_idx: int):
    method forward (line 249) | def forward(
  class Gemma3DecoderLayer (line 321) | class Gemma3DecoderLayer(nn.Layer):
    method __init__ (line 322) | def __init__(self, config: Gemma3TextConfig, layer_idx: int):
    method forward (line 335) | def forward(
  class Gemma3PreTrainedModel (line 381) | class Gemma3PreTrainedModel(PretrainedModel):
    method _gen_aoa_config (line 397) | def _gen_aoa_config(cls, config: Gemma3TextConfig):
    method _gen_inv_aoa_config (line 436) | def _gen_inv_aoa_config(cls, config: Gemma3TextConfig):
  class Gemma3TextModel (line 479) | class Gemma3TextModel(Gemma3PreTrainedModel):
    method __init__ (line 482) | def __init__(self, config: Gemma3TextConfig):
    method recompute_training (line 500) | def recompute_training(
    method forward (line 532) | def forward(
  class Gemma3ForCausalLM (line 688) | class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
    method __init__ (line 695) | def __init__(self, config: Gemma3TextConfig):
    method _get_model_inputs_spec (line 702) | def _get_model_inputs_spec(self, dtype: str):
    method forward (line 709) | def forward(
  class Gemma3TextForSequenceClassification (line 794) | class Gemma3TextForSequenceClassification(Gemma3PreTrainedModel):
  class Gemma3ForCausalLMPipe (line 803) | class Gemma3ForCausalLMPipe(GeneralModelForCausalLMPipe):

FILE: paddleformers/transformers/glm4_moe/configuration.py
  class Glm4MoeConfig (line 19) | class Glm4MoeConfig(PretrainedConfig):
    method __init__ (line 128) | def __init__(

FILE: paddleformers/transformers/glm4_moe/modeling.py
  class GLMMoEModelProvider (line 55) | class GLMMoEModelProvider(GPTModelProvider):
  function eager_attention_forward (line 91) | def eager_attention_forward(
  function rotate_half (line 123) | def rotate_half(x):
  function apply_rotary_pos_emb (line 130) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
  class Glm4MoeAttention (line 151) | class Glm4MoeAttention(nn.Layer):
    method __init__ (line 154) | def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = N...
    method forward (line 218) | def forward(
  class Glm4MoeTopkFlexRouter (line 289) | class Glm4MoeTopkFlexRouter(PretrainedMoEGate):
    method __init__ (line 290) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
    method forward (line 310) | def forward(self, hidden_states):
  class Glm4MoeTopkRouter (line 328) | class Glm4MoeTopkRouter(nn.Layer):
    method __init__ (line 329) | def __init__(self, config: Glm4MoeConfig):
    method get_topk_indices (line 351) | def get_topk_indices(self, scores):
    method forward (line 370) | def forward(self, hidden_states):
  class GLm4MoeNaiveMoe (line 383) | class GLm4MoeNaiveMoe(nn.Module):
    method __init__ (line 384) | def __init__(self, config):
    method sharded_state_dict (line 402) | def sharded_state_dict(
    method forward (line 417) | def forward(
  class Glm4MoeMoE (line 445) | class Glm4MoeMoE(nn.Layer):
    method __init__ (line 450) | def __init__(self, config):
    method moe (line 478) | def moe(self, hidden_states: paddle.Tensor, topk_indices: paddle.Tenso...
    method forward (line 504) | def forward(self, hidden_states):
  class AddAuxiliaryLoss (line 522) | class AddAuxiliaryLoss(paddle.autograd.PyLayer):
    method forward (line 529) | def forward(ctx, x, loss):
    method backward (line 536) | def backward(ctx, grad_output):
  class Glm4MoeFlexMoE (line 543) | class Glm4MoeFlexMoE(MoEFlexTokenLayer):
    method __init__ (line 548) | def __init__(self, config):
    method forward (line 605) | def forward(self, hidden_states):
  class Glm4MoeDecoderLayer (line 614) | class Glm4MoeDecoderLayer(nn.Layer):
    method __init__ (line 615) | def __init__(self, config: Glm4MoeConfig, layer_idx: int):
    method subbatch_recompute_forward (line 666) | def subbatch_recompute_forward(
    method attn (line 759) | def attn(
    method post_process (line 822) | def post_process(
    method forward (line 834) | def forward(
  class Glm4MoePreTrainedModel (line 864) | class Glm4MoePreTrainedModel(PretrainedModel):
    method _gen_aoa_config (line 872) | def _gen_aoa_config(cls, config: Glm4MoeConfig):
    method _gen_inv_aoa_config (line 1023) | def _gen_inv_aoa_config(cls, config: Glm4MoeConfig):
  class Glm4MoeRotaryEmbedding (line 1193) | class Glm4MoeRotaryEmbedding(nn.Layer):
    method __init__ (line 1194) | def __init__(self, config: Glm4MoeConfig, device=None):
    method compute_default_rope_parameters (line 1211) | def compute_default_rope_parameters(
    method forward (line 1239) | def forward(self, x, position_ids):
  class Glm4MoeModel (line 1256) | class Glm4MoeModel(Glm4MoePreTrainedModel):
    method __init__ (line 1259) | def __init__(self, config: Glm4MoeConfig):
    method recompute_training_full (line 1284) | def recompute_training_full(
    method forward (line 1314) | def forward(
  class Glm4MoeForCausalLM (line 1450) | class Glm4MoeForCausalLM(Glm4MoePreTrainedModel):
    method __new__ (line 1453) | def __new__(cls, config):
  class Glm4MoeForCausalLMDeprecated (line 1475) | class Glm4MoeForCausalLMDeprecated(Glm4MoePreTrainedModel):
    method __init__ (line 1480) | def __init__(self, config):
    method forward (line 1488) | def forward(
  class Glm4MoeDecoderLayerPipe (line 1545) | class Glm4MoeDecoderLayerPipe(Glm4MoeDecoderLayer):
    method forward (line 1546) | def forward(self, args):
  class Glm4MoeForCausalLMPipe (line 1625) | class Glm4MoeForCausalLMPipe(Glm4MoePreTrainedModel, GeneralModelForCaus...
    method __new__ (line 1628) | def __new__(cls, config):
  class Glm4MoeForCausalLMPipeDeprecated (line 1652) | class Glm4MoeForCausalLMPipeDeprecated(GeneralModelForCausalLMPipe):

FILE: paddleformers/transformers/glm4v_moe/configuration.py
  class RopeParameters (line 22) | class RopeParameters(TypedDict, total=False):
  class Glm4vMoeVisionConfig (line 77) | class Glm4vMoeVisionConfig(PretrainedConfig):
    method __init__ (line 132) | def __init__(
  class Glm4vMoeTextConfig (line 170) | class Glm4vMoeTextConfig(PretrainedConfig):
    method __init__ (line 258) | def __init__(
  class Glm4vMoeConfig (line 332) | class Glm4vMoeConfig(PretrainedConfig):
    method __init__ (line 378) | def __init__(
    method __setattr__ (line 411) | def __setattr__(self, key, value):
    method __getattribute__ (line 421) | def __getattribute__(self, key):

FILE: paddleformers/transformers/glm4v_moe/image_processor_fast.py
  function smart_resize (line 45) | def smart_resize(
  class Glm4vFastImageProcessorKwargs (line 78) | class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  class Glm4vImageProcessorFast (line 93) | class Glm4vImageProcessorFast(BaseImageProcessorFast):
    method __init__ (line 108) | def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]):
    method _further_process_kwargs (line 115) | def _further_process_kwargs(
    method _preprocess (line 129) | def _preprocess(
    method preprocess (line 232) | def preprocess(

FILE: paddleformers/transformers/glm4v_moe/modeling.py
  class MoeModelOutputWithPast (line 51) | class MoeModelOutputWithPast(ModelOutput):
  class Glm4vMoeModelOutputWithPast (line 64) | class Glm4vMoeModelOutputWithPast(ModelOutput):
  class Glm4vMoeTextRotaryEmbedding (line 82) | class Glm4vMoeTextRotaryEmbedding(nn.Layer):
    method __init__ (line 85) | def __init__(self, config: Glm4vMoeTextConfig, device=None, layer_type...
    method compute_default_rope_parameters (line 102) | def compute_default_rope_parameters(
    method forward (line 131) | def forward(self, x, position_ids):
  function rotate_half (line 151) | def rotate_half(x):
  function apply_multimodal_rotary_pos_emb (line 158) | def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqu...
  class Glm4vMoeTextAttention (line 211) | class Glm4vMoeTextAttention(nn.Layer):
    method __init__ (line 214) | def __init__(self, config: Glm4vMoeTextConfig, layer_idx: Optional[int...
    method forward (line 259) | def forward(
  class Glm4vMoeTextDecoderLayer (line 325) | class Glm4vMoeTextDecoderLayer(nn.Layer):
    method __init__ (line 326) | def __init__(self, config: Glm4vMoeTextConfig, layer_idx: int):
    method forward (line 378) | def forward(
  class Glm4vMoePreTrainedModel (line 415) | class Glm4vMoePreTrainedModel(PretrainedModel):
    method _gen_aoa_config (line 438) | def _gen_aoa_config(cls, config: Glm4vMoeConfig):
    method _gen_inv_aoa_config (line 531) | def _gen_inv_aoa_config(cls, config: Glm4vMoeConfig):
  class Glm4vMoeCausalLMOutputWithPast (line 625) | class Glm4vMoeCausalLMOutputWithPast(ModelOutput):
  class Glm4vMoeVisionPatchEmbed (line 649) | class Glm4vMoeVisionPatchEmbed(nn.Layer):
    method __init__ (line 650) | def __init__(self, config: Glm4vMoeVisionConfig) -> None:
    method forward (line 660) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
  class Glm4vMoeVisionRotaryEmbedding (line 669) | class Glm4vMoeVisionRotaryEmbedding(nn.Layer):
    method __init__ (line 672) | def __init__(self, dim: int, theta: float = 10000.0) -> None:
    method forward (line 677) | def forward(self, seqlen: int) -> paddle.Tensor:
  class Glm4vMoeVisionPatchMerger (line 683) | class Glm4vMoeVisionPatchMerger(nn.Layer):
    method __init__ (line 684) | def __init__(self, config, dim: int, context_dim: int, hidden_act: str...
    method forward (line 695) | def forward(self, hidden_state: paddle.Tensor) -> paddle.Tensor:
  class Glm4vMoeVisionEmbeddings (line 701) | class Glm4vMoeVisionEmbeddings(nn.Layer):
    method __init__ (line 702) | def __init__(self, config: Glm4vMoeVisionConfig):
    method forward (line 718) | def forward(self, embeddings, lengths, image_shapes, h_coords, w_coord...
  function apply_rotary_pos_emb_vision (line 785) | def apply_rotary_pos_emb_vision(
  class Glm4vMoeVisionAttention (line 799) | class Glm4vMoeVisionAttention(nn.Layer):
    method __init__ (line 800) | def __init__(self, config: Glm4vMoeVisionConfig) -> None:
    method forward (line 823) | def forward(
  class Glm4vMoeVisionBlock (line 899) | class Glm4vMoeVisionBlock(nn.Layer):
    method __init__ (line 900) | def __init__(self, config) -> None:
    method forward (line 924) | def forward(
  class Glm4vMoeVisionModel (line 943) | class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel):
    method __init__ (line 948) | def __init__(self, config) -> None:
    method rot_pos_emb (line 990) | def rot_pos_emb(self, grid_thw):
    method recompute_training_full (line 1024) | def recompute_training_full(
    method forward (line 1045) | def forward(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tenso...
  class Glm4vMoeTextModel (line 1103) | class Glm4vMoeTextModel(Glm4vMoePreTrainedModel):
    method __init__ (line 1107) | def __init__(self, config: Glm4vMoeTextConfig):
    method recompute_training_full (line 1133) | def recompute_training_full(
    method forward (line 1165) | def forward(
  class Glm4vMoeModel (line 1312) | class Glm4vMoeModel(Glm4vMoePreTrainedModel):
    method __init__ (line 1321) | def __init__(self, config):
    method get_input_embeddings (line 1327) | def get_input_embeddings(self):
    method set_input_embeddings (line 1330) | def set_input_embeddings(self, value):
    method get_rope_index (line 1333) | def get_rope_index(
    method get_video_features (line 1530) | def get_video_features(
    method get_image_features (line 1554) | def get_image_features(self, pixel_values: paddle.FloatTensor, image_g...
    method get_placeholder_mask (line 1570) | def get_placeholder_mask(
    method forward (line 1611) | def forward(
  class Glm4vMoeForConditionalGeneration (line 1724) | class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel):
    method __init__ (line 1732) | def __init__(self, config):
    method get_input_embeddings (line 1740) | def get_input_embeddings(self):
    method set_input_embeddings (line 1743) | def set_input_embeddings(self, value):
    method get_video_features (line 1746) | def get_video_features(
    method get_image_features (line 1751) | def get_image_features(self, pixel_values: paddle.FloatTensor, image_g...
    method language_model (line 1756) | def language_model(self):
    method visual (line 1760) | def visual(self):
    method forward (line 1763) | def forward(
    method prepare_inputs_for_generation (line 1869) | def prepare_inputs_for_generation(
    method _get_image_nums_and_video_nums (line 1918) | def _get_image_nums_and_video_nums(
    method _expand_inputs_for_generation (line 1968) | def _expand_inputs_for_generation(

FILE: paddleformers/transformers/glm4v_moe/processor.py
  class Glm4vVideosProcessorKwargs (line 41) | class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
  class Glm4vProcessorKwargs (line 45) | class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
  class Glm4vProcessor (line 57) | class Glm4vProcessor(ProcessorMixin):
    method __init__ (line 77) | def __init__(self, image_processor=None, tokenizer=None, video_process...
    method __call__ (line 92) | def __call__(
    method _get_num_multimodal_tokens (line 224) | def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=Non...
    method post_process_image_text_to_text (line 262) | def post_process_image_text_to_text(
    method replace_frame_token_id (line 289) | def replace_frame_token_id(self, timestamp_sec):

FILE: paddleformers/transformers/glm4v_moe/video_processor.py
  class Glm4vVideoProcessorInitKwargs (line 44) | class Glm4vVideoProcessorInitKwargs(VideosKwargs, total=False):
  class Glm4vVideoProcessor (line 52) | class Glm4vVideoProcessor(BaseVideoProcessor):
    method __init__ (line 73) | def __init__(self, **kwargs: Unpack[Glm4vVideoProcessorInitKwargs]):
    method _further_process_kwargs (line 80) | def _further_process_kwargs(
    method sample_frames (line 94) | def sample_frames(
    method _preprocess (line 144) | def _preprocess(

FILE: paddleformers/transformers/glm_ocr/configuration.py
  class GlmOcrVisionConfig (line 21) | class GlmOcrVisionConfig(PretrainedConfig):
    method __init__ (line 68) | def __init__(
  class GlmOcrTextConfig (line 106) | class GlmOcrTextConfig(PretrainedConfig):
    method __init__ (line 126) | def __init__(
  class GlmOcrConfig (line 174) | class GlmOcrConfig(PretrainedConfig):
    method __setattr__ (line 221) | def __setattr__(self, key, value):
    method __getattribute__ (line 238) | def __getattribute__(self, key):
    method __init__ (line 250) | def __init__(

FILE: paddleformers/transformers/glm_ocr/image_processor.py
  function is_scaled_image (line 44) | def is_scaled_image(image: np.ndarray) -> bool:
  function make_batched_images (line 53) | def make_batched_images(images) -> List[ImageInput]:
  function smart_resize (line 74) | def smart_resize(
  class Glm46VImageProcessor (line 133) | class Glm46VImageProcessor(BaseImageProcessor):
    method __init__ (line 169) | def __init__(
    method get_smarted_resize (line 202) | def get_smarted_resize(self, height, width, min_pixels=None, max_pixel...
    method set_pixels (line 219) | def set_pixels(self, min_pixels: Optional[int] = None, max_pixels: Opt...
    method get_number_of_image_patches (line 232) | def get_number_of_image_patches(
    method _preprocess (line 268) | def _preprocess(
    method preprocess (line 412) | def preprocess(

FILE: paddleformers/transformers/glm_ocr/modeling.py
  class GlmOcrVisionMlp (line 39) | class GlmOcrVisionMlp(nn.Layer):
    method __init__ (line 40) | def __init__(self, config, bias: bool = True):
    method forward (line 49) | def forward(self, hidden_state):
  function repeat_kv (line 53) | def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor:
  function rotate_half_llm (line 65) | def rotate_half_llm(x):
  function apply_rotary_pos_emb (line 72) | def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
  class GlmOcrTextAttention (line 91) | class GlmOcrTextAttention(nn.Layer):
    method __init__ (line 97) | def __init__(self, config: GlmOcrTextConfig, layer_idx: int | None = N...
    method forward (line 116) | def forward(
  class GlmOcrVisionRotaryEmbedding (line 160) | class GlmOcrVisionRotaryEmbedding(nn.Layer):
    method __init__ (line 163) | def __init__(self, dim: int, theta: float = 10000.0) -> None:
    method forward (line 170) | def forward(self, seqlen: int) -> paddle.Tensor:
  class GlmOcrTextMLP (line 176) | class GlmOcrTextMLP(nn.Layer):
    method __init__ (line 177) | def __init__(self, config):
    method forward (line 185) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
  class GlmOcrTextDecoderLayer (line 194) | class GlmOcrTextDecoderLayer(nn.Layer):
    method __init__ (line 195) | def __init__(self, config, layer_idx: int):
    method forward (line 215) | def forward(
  class GlmOcrPreTrainedModel (line 253) | class GlmOcrPreTrainedModel(PretrainedModel):
    method _init_weights (line 279) | def _init_weights(self, layer):
    method _gen_aoa_config (line 286) | def _gen_aoa_config(cls, config: GlmOcrConfig):
    method _gen_inv_aoa_config (line 372) | def _gen_inv_aoa_config(cls, config: GlmOcrConfig):
  class GlmOcrModelOutputWithPast (line 477) | class GlmOcrModelOutputWithPast(ModelOutput):
  function rotate_half (line 485) | def rotate_half(x):
  function apply_rotary_pos_emb_vision (line 492) | def apply_rotary_pos_emb_vision(q, k, cos, sin):
  class GlmOcrVisionAttention (line 507) | class GlmOcrVisionAttention(nn.Layer):
    method __init__ (line 508) | def __init__(self, config: GlmOcrVisionConfig) -> None:
    method forward (line 529) | def forward(
  class GlmOcrVisionBlock (line 605) | class GlmOcrVisionBlock(nn.Layer):
    method __init__ (line 606) | def __init__(self, config) -> None:
    method forward (line 617) | def forward(
  class GlmOcrVisionPatchMerger (line 636) | class GlmOcrVisionPatchMerger(nn.Layer):
    method __init__ (line 637) | def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: ...
    method forward (line 647) | def forward(self, hidden_state: paddle.Tensor) -> paddle.Tensor:
  class GlmOcrVisionPatchEmbed (line 653) | class GlmOcrVisionPatchEmbed(nn.Layer):
    method __init__ (line 654) | def __init__(self, config: GlmOcrVisionConfig) -> None:
    method forward (line 669) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
  class GlmOcrVisionModel (line 679) | class GlmOcrVisionModel(GlmOcrPreTrainedModel):
    method __init__ (line 688) | def __init__(self, config) -> None:
    method rot_pos_emb (line 715) | def rot_pos_emb(self, grid_thw: paddle.Tensor):
    method forward (line 758) | def forward(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tenso...
  class GlmOcrTextRotaryEmbedding (line 797) | class GlmOcrTextRotaryEmbedding(nn.Layer):
    method __init__ (line 798) | def __init__(self, config):
    method compute_default_rope_parameters (line 811) | def compute_default_rope_parameters(config):
    method apply_mrope (line 826) | def apply_mrope(self, freqs, mrope_section):
    method forward (line 838) | def forward(self, hidden_states: paddle.Tensor, position_ids: paddle.T...
  class GlmOcrTextModel (line 860) | class GlmOcrTextModel(GlmOcrPreTrainedModel):
    method __init__ (line 864) | def __init__(self, config: GlmOcrTextConfig):
    method forward (line 887) | def forward(
  function masked_scatter (line 970) | def masked_scatter(inputs: paddle.Tensor, mask: paddle.Tensor, updates: ...
  class GlmOcrModel (line 987) | class GlmOcrModel(GlmOcrPreTrainedModel):
    method __init__ (line 994) | def __init__(self, config):
    method get_input_embeddings (line 1000) | def get_input_embeddings(self):
    method set_input_embeddings (line 1003) | def set_input_embeddings(self, value):
    method get_rope_index (line 1006) | def get_rope_index(self, input_ids, image_grid_thw=None, video_grid_th...
    method get_video_features (line 1139) | def get_video_features(
    method get_image_features (line 1171) | def get_image_features(
    method get_placeholder_mask (line 1194) | def get_placeholder_mask(
    method forward (line 1243) | def forward(
  class GlmOcrCausalLMOutputWithPast (line 1321) | class GlmOcrCausalLMOutputWithPast(ModelOutput):
  class GlmOcrForConditionalGeneration (line 1341) | class GlmOcrForConditionalGeneration(GlmOcrPreTrainedModel, GenerationMi...
    method __init__ (line 1346) | def __init__(self, config):
    method get_input_embeddings (line 1355) | def get_input_embeddings(self):
    method set_input_embeddings (line 1358) | def set_input_embeddings(self, value):
    method get_video_features (line 1361) | def get_video_features(self, pixel_values_videos, video_grid_thw=None,...
    method get_image_features (line 1368) | def get_image_features(self, pixel_values, image_grid_thw=None, **kwar...
    method forward (line 1375) | def forward(
    method prepare_inputs_for_generation (line 1430) | def prepare_inputs_for_generation(
    method update_model_kwargs_for_generation (line 1479) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is...
    method _get_image_nums_and_video_nums (line 1501) | def _get_image_nums_and_video_nums(
    method _expand_inputs_for_generation (line 1533) | def _expand_inputs_for_generation(

FILE: paddleformers/transformers/glm_ocr/processor.py
  class Glm46VProcessorKwargs (line 26) | class Glm46VProcessorKwargs(ProcessingKwargs, total=False):
  class Glm46VProcessor (line 35) | class Glm46VProcessor(ProcessorMixin):
    method __init__ (line 40) | def __init__(self, image_processor=None, tokenizer=None, chat_template...
    method apply_chat_template (line 49) | def apply_chat_template(
    method __call__ (line 181) | def __call__(
    method _get_num_multimodal_tokens (line 226) | def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
    method post_process_image_text_to_text (line 242) | def post_process_image_text_to_text(

FILE: paddleformers/transformers/gpt_oss/configuration.py
  class GptOssConfig (line 20) | class GptOssConfig(PretrainedConfig):
    method __init__ (line 28) | def __init__(

FILE: paddleformers/transformers/gpt_oss/modeling.py
  function is_casual_mask (line 42) | def is_casual_mask(attention_mask):
  function _make_causal_mask (line 49) | def _make_causal_mask(input_ids_shape, past_key_values_length):
  function _expand_2d_mask (line 65) | def _expand_2d_mask(mask, dtype, tgt_length):
  class GptOssExperts (line 79) | class GptOssExperts(nn.Layer):
    method __init__ (line 80) | def __init__(self, config):
    method forward (line 111) | def forward(self, hidden_states: paddle.Tensor, router_indices=None, r...
  class GptOssTopKRouter (line 182) | class GptOssTopKRouter(nn.Layer):
    method __init__ (line 183) | def __init__(self, config):
    method forward (line 199) | def forward(self, hidden_states):
  class GptOssMLP (line 209) | class GptOssMLP(nn.Layer):
    method __init__ (line 210) | def __init__(self, config):
    method forward (line 216) | def forward(self, hidden_states):
  class GptOssRotaryEmbedding (line 226) | class GptOssRotaryEmbedding(nn.Layer):
    method __init__ (line 227) | def __init__(self, config: GptOssConfig, device=None):
    method compute_default_rope_parameters (line 248) | def compute_default_rope_parameters(
    method forward (line 274) | def forward(self, x, position_ids):
  function _apply_rotary_emb (line 290) | def _apply_rotary_emb(
  function apply_rotary_pos_emb (line 301) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
  class GptOssAttention (line 309) | class GptOssAttention(nn.Layer):
    method __init__ (line 315) | def __init__(self, config, layer_idx=0):
    method forward (line 390) | def forward(
  class GptOssDecoderLayer (line 469) | class GptOssDecoderLayer(nn.Layer):
    method __init__ (line 470) | def __init__(self, config: GptOssConfig, layer_idx: int):
    method forward (line 499) | def forward(
  class GptOssPreTrainedModel (line 562) | class GptOssPreTrainedModel(PretrainedModel):
    method _gen_aoa_config (line 570) | def _gen_aoa_config(cls, config: GptOssConfig):
    method _gen_inv_aoa_config (line 606) | def _gen_inv_aoa_config(cls, config: GptOssConfig):
  class GptOssModel (line 639) | class GptOssModel(GptOssPreTrainedModel):
    method __init__ (line 647) | def __init__(self, config: GptOssConfig):
    method recompute_training_full (line 683) | def recompute_training_full(
    method forward (line 718) | def forward(
  function load_balancing_loss_func (line 873) | def load_balancing_loss_func(
  class GptOssForCausalLM (line 948) | class GptOssForCausalLM(GptOssPreTrainedModel):
    method __init__ (line 952) | def __init__(self, config: GptOssConfig):
    method get_input_embeddings (line 962) | def get_input_embeddings(self):
    method set_input_embeddings (line 965) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 968) | def get_output_embeddings(self):
    method set_output_embeddings (line 971) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 974) | def set_decoder(self, decoder):
    method get_decoder (line 977) | def get_decoder(self):
    method prepare_inputs_for_generation (line 980) | def prepare_inputs_for_generation(
    method _get_model_inputs_spec (line 1011) | def _get_model_inputs_spec(self, dtype: str):
    method forward (line 1018) | def forward(
  class GptOssForCausalLMPipe (line 1099) | class GptOssForCausalLMPipe(GeneralModelForCausalLMPipe):

FILE: paddleformers/transformers/gpt_provider.py
  class GPTModel (line 57) | class GPTModel(FleetGPTModel, PretrainedModel):
  function local_layer_spec (line 67) | def local_layer_spec(config: "GPTModelProvider") -> LayerSpec:
  class GPTModelProvider (line 85) | class GPTModelProvider(GPTConfig, ModelProviderMixin[GPTModel]):
    method provide (line 150) | def provide(self, pre_process=None, post_process=None, vp_stage=None, ...
  function mtp_block_spec (line 204) | def mtp_block_spec(config: "GPTModelProvider", vp_stage: Optional[int] =...

FILE: paddleformers/transformers/image_processing_utils.py
  class PaddleImageProcessingMixin (line 40) | class PaddleImageProcessingMixin:
    method __init__ (line 63) | def __init__(self, **kwargs):
    method _wrap_return_tensor_methods (line 67) | def _wrap_return_tensor_methods(self):
    method _wrap_single_method (line 91) | def _wrap_single_method(self, method_name):
    method __call__ (line 151) | def __call__(self, images, *args, **kwargs) -> BatchFeature:
    method from_pretrained (line 156) | def from_pretrained(
    method get_image_processor_dict (line 166) | def get_image_processor_dict(
    method from_dict (line 256) | def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
    method to_dict (line 300) | def to_dict(self):
  function warp_image_processormixin (line 314) | def warp_image_processormixin(hf_image_processormixin_class: ImageProces...
  function warp_base_image_processor (line 320) | def warp_base_image_processor(hf_base_image_processor_class: BaseImagePr...
  class ImageProcessingMixin (line 326) | class ImageProcessingMixin(PaddleImageProcessingMixin, ImageProcessingMi...
    method init (line 327) | def init(self, *args, **kwargs):
  class BaseImageProcessor (line 331) | class BaseImageProcessor(PaddleImageProcessingMixin, BaseImageProcessor_...
    method init (line 332) | def init(self, *args, **kwargs):

FILE: paddleformers/transformers/image_processing_utils_fast.py
  function validate_fast_preprocess_arguments (line 56) | def validate_fast_preprocess_arguments(
  function safe_squeeze (line 94) | def safe_squeeze(tensor: "paddle.Tensor", axis: Optional[int] = None) ->...
  function max_across_indices (line 107) | def max_across_indices(values: Iterable[Any]) -> list[Any]:
  function get_max_height_width (line 114) | def get_max_height_width(images: list["paddle.Tensor"]) -> tuple[int, ...]:
  function divide_to_patches (line 124) | def divide_to_patches(
  class DefaultFastImageProcessorKwargs (line 148) | class DefaultFastImageProcessorKwargs(TypedDict, total=False):
  class BaseImageProcessorFast (line 170) | class BaseImageProcessorFast(BaseImageProcessor):
    method __init__ (line 193) | def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]):
    method is_fast (line 218) | def is_fast(self) -> bool:
    method pad (line 224) | def pad(
    method resize (line 292) | def resize(
    method compile_friendly_resize (line 344) | def compile_friendly_resize(
    method rescale (line 365) | def rescale(
    method normalize (line 383) | def normalize(
    method _fuse_mean_std_and_rescale_factor (line 403) | def _fuse_mean_std_and_rescale_factor(
    method rescale_and_normalize (line 418) | def rescale_and_normalize(
    method center_crop (line 445) | def center_crop(
    method convert_to_rgb (line 486) | def convert_to_rgb(
    method filter_out_unused_kwargs (line 502) | def filter_out_unused_kwargs(self, kwargs: dict):
    method _prepare_images_structure (line 515) | def _prepare_images_structure(
    method _process_image (line 534) | def _process_image(
    method _prepare_image_like_inputs (line 571) | def _prepare_image_like_inputs(
    method _further_process_kwargs (line 615) | def _further_process_kwargs(
    method _validate_preprocess_kwargs (line 659) | def _validate_preprocess_kwargs(
    method __call__ (line 693) | def __call__(self, images: ImageInput, *args, **kwargs: Unpack[Default...
    method preprocess (line 696) | def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[Defau...
    method _preprocess_image_like_inputs (line 722) | def _preprocess_image_like_inputs(
    method _preprocess (line 742) | def _preprocess(
    method to_dict (line 790) | def to_dict(self):

FILE: paddleformers/transformers/image_transforms.py
  function is_paddle_tensor (line 39) | def is_paddle_tensor(tensor):
  function to_channel_dimension_format (line 43) | def to_channel_dimension_format(
  function rescale (line 80) | def rescale(
  function to_pil_image (line 110) | def to_pil_image(
  function get_size_with_aspect_ratio (line 151) | def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple...
  function get_resize_output_image_size (line 191) | def get_resize_output_image_size(
  function resize (line 256) | def resize(
  function normalize (line 315) | def normalize(
  function center_crop (line 376) | def center_crop(
  function _center_to_corners_format_paddle (line 471) | def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> ...
  function _center_to_corners_format_numpy (line 481) | def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.nda...
  function center_to_corners_format (line 492) | def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
  function _corners_to_center_format_paddle (line 511) | def _corners_to_center_format_paddle(bboxes_corners: "paddle.Tensor") ->...
  function _corners_to_center_format_numpy (line 522) | def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.nd...
  function corners_to_center_format (line 536) | def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
  function rgb_to_id (line 557) | def rgb_to_id(color):
  function id_to_rgb (line 568) | def id_to_rgb(id_map):
  class PaddingMode (line 587) | class PaddingMode(ExplicitEnum):
  function pad (line 598) | def pad(
  function convert_to_rgb (line 684) | def convert_to_rgb(image: ImageInput) -> ImageInput:
  function _group_images_by_shape (line 701) | def _group_images_by_shape(nested_images, is_nested: bool = False):
  function _reconstruct_nested_structure (line 717) | def _reconstruct_nested_structure(indices, processed_images):
  function group_images_by_shape (line 743) | def group_images_by_shape(
  function reorder_images (line 794) | def reorder_images(

FILE: paddleformers/transformers/image_utils.py
  function is_paddle_tensor (line 57) | def is_paddle_tensor(tensor):
  function is_pil_image (line 61) | def is_pil_image(img):
  function is_numpy_array (line 65) | def is_numpy_array(img):
  function to_numpy (line 69) | def to_numpy(obj):
  class ChannelDimension (line 83) | class ChannelDimension(ExplicitEnum):
  class ImageType (line 88) | class ImageType(ExplicitEnum):
  function get_image_type (line 94) | def get_image_type(image):
  function is_valid_image (line 104) | def is_valid_image(img):
  function is_valid_list_of_images (line 108) | def is_valid_list_of_images(images: list):
  function valid_images (line 112) | def valid_images(imgs):
  function is_batched (line 124) | def is_batched(img):
  function make_list_of_images (line 130) | def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageIn...
  function make_flat_list_of_images (line 167) | def make_flat_list_of_images(
  function make_nested_list_of_images (line 205) | def make_nested_list_of_images(
  function to_numpy_array (line 244) | def to_numpy_array(img) -> np.ndarray:
  function infer_channel_dimension_format (line 253) | def infer_channel_dimension_format(
  function get_channel_dimension_axis (line 288) | def get_channel_dimension_axis(image: np.ndarray) -> int:
  function get_image_size (line 307) | def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = No...
  function get_image_size_for_max_height_width (line 331) | def get_image_size_for_max_height_width(
  function is_valid_annotation_coco_detection (line 362) | def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List,...
  function is_valid_annotation_coco_panoptic (line 378) | def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, ...
  function valid_coco_detection_annotations (line 395) | def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Uni...
  function valid_coco_panoptic_annotations (line 399) | def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Unio...
  function load_image (line 403) | def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
  function validate_preprocess_arguments (line 436) | def validate_preprocess_arguments(
  class ImageFeatureExtractionMixin (line 488) | class ImageFeatureExtractionMixin:
    method _ensure_format_supported (line 493) | def _ensure_format_supported(self, image):
    method to_pil_image (line 500) | def to_pil_image(self, image, rescale=None):
    method convert_rgb (line 530) | def convert_rgb(self, image):
    method rescale (line 544) | def rescale(self, image: np.ndarray, scale: Union[float, int]) -> np.n...
    method to_numpy_array (line 551) | def to_numpy_array(self, image, rescale=None, channel_first=True):
    method expand_dims (line 583) | def expand_dims(self, image):
    method normalize (line 603) | def normalize(self, image, mean, std, rescale=False):
    method resize (line 648) | def resize(self, image, size, resample=None, default_to_square=True, m...
    method center_crop (line 715) | def center_crop(self, image, size):
    method flip_channel_order (line 790) | def flip_channel_order(self, image):
    method rotate (line 807) | def rotate(self, image, angle, resample=None, expand=0, center=None, t...
  function validate_kwargs (line 832) | def validate_kwargs(valid_processor_keys: list[str], captured_kwargs: li...
  class SizeDict (line 840) | class SizeDict:
    method __getitem__ (line 852) | def __getitem__(self, key):

FILE: paddleformers/transformers/kimi_k2/configuration.py
  class KimiK2Config (line 20) | class KimiK2Config(PretrainedConfig):
    method __init__ (line 142) | def __init__(

FILE: paddleformers/transformers/kimi_k2/modeling.py
  class KimiK2Provider (line 24) | class KimiK2Provider(GPTModelProvider):
    method __post_init__ (line 33) | def __post_init__(config):
  class KimiK2PretrainedModel (line 37) | class KimiK2PretrainedModel(PretrainedModel):
    method _gen_aoa_config (line 54) | def _gen_aoa_config(cls, config: KimiK2Config):
  class KimiK2ForCausalLM (line 86) | class KimiK2ForCausalLM(KimiK2PretrainedModel):
    method __new__ (line 89) | def __new__(cls, config, have_criterion=True):
  class KimiK2ForCausalLMPipe (line 126) | class KimiK2ForCausalLMPipe(KimiK2PretrainedModel, GeneralModelForCausal...
    method __new__ (line 129) | def __new__(cls, config):

FILE: paddleformers/transformers/kimi_k2/tokenizer.py
  class KimiK2TikTokenTokenizer (line 31) | class KimiK2TikTokenTokenizer(PreTrainedTokenizer):
    method __init__ (line 74) | def __init__(
    method encode (line 152) | def encode(self, text: str, **kwargs) -> List[int]:
    method decode (line 204) | def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
    method _split_whitespaces_or_nonwhitespaces (line 223) | def _split_whitespaces_or_nonwhitespaces(s: str, max_consecutive_slice...
    method pre_tokenizer_process (line 246) | def pre_tokenizer_process(self, text: str) -> List[str]:
    method vocab_size (line 256) | def vocab_size(self) -> int:
    method get_vocab (line 259) | def get_vocab(self) -> Dict[str, int]:
    method _tokenize (line 262) | def _tokenize(self, text: str, **kwargs) -> List[str]:
    method _convert_token_to_id (line 265) | def _convert_token_to_id(self, token: str) -> int:
    method _convert_id_to_token (line 268) | def _convert_id_to_token(self, index: int) -> str:
    method clean_up_tokenization (line 272) | def clean_up_tokenization(out_string: str) -> str:
    method convert_tokens_to_string (line 275) | def convert_tokens_to_string(self, tokens: List[str]) -> str:
    method save_vocabulary (line 280) | def save_vocabulary(self, save_directory: str, filename_prefix: Option...

FILE: paddleformers/transformers/kimi_k25/media_utils.py
  class VideoSpec (line 34) | class VideoSpec:
    method __post_init__ (line 43) | def __post_init__(self):
  class ImageInput (line 56) | class ImageInput(TypedDict):
  class VideoChunkInput (line 61) | class VideoChunkInput(TypedDict):
  function _read_video_decord (line 70) | def _read_video_decord(
  function _read_video_paddlecodec (line 121) | def _read_video_paddlecodec(
  function get_video_meta (line 217) | def get_video_meta(video_src: bytes | str | os.PathLike, accurate: bool ...
  function timestamp_as_str (line 230) | def timestamp_as_str(timestamp: float, timestamp_mode: str = "hh:mm:ss.f...
  function navit_resize_image (line 248) | def navit_resize_image(
  function navit_resize_video (line 297) | def navit_resize_video(
  function real_sample_fps_and_max_num_frames (line 333) | def real_sample_fps_and_max_num_frames(
  function _to_pil (line 348) | def _to_pil(data: str | bytes):
  function ensure_media_type (line 364) | def ensure_media_type(media: MediaInput) -> MediaInput:
  function image_in_tensor (line 378) | def image_in_tensor(
  function image_to_np (line 452) | def image_to_np(
  function navit_patchify (line 527) | def navit_patchify(pixel_values: paddle.Tensor, patch_size: int) -> dict...

FILE: paddleformers/transformers/kimi_k25/processor.py
  class KimiK25Processor (line 22) | class KimiK25Processor(ProcessorMixin):
    method __init__ (line 41) | def __init__(
    method update_raw_text (line 53) | def update_raw_text(self, text: str, video_prompts: list[str]) -> str:
    method preprocess_medias (line 65) | def preprocess_medias(self, medias: list[dict], **kwargs) -> list[dict]:
    method __call__ (line 79) | def __call__(
    method _extract_medias_from_messages (line 126) | def _extract_medias_from_messages(messages: list[dict]) -> list[dict]:
    method apply_chat_template (line 156) | def apply_chat_template(self, messages, **kwargs):
    method batch_decode (line 159) | def batch_decode(self, *args, **kwargs):
    method decode (line 162) | def decode(self, *args, **kwargs):
    method model_input_names (line 166) | def model_input_names(self):

FILE: paddleformers/transformers/kimi_k25/tokenizer.py
  class TikTokenTokenizer (line 34) | class TikTokenTokenizer(PreTrainedTokenizer):
    method __init__ (line 77) | def __init__(
    method encode (line 162) | def encode(self, text: str, allow_special_tokens: bool = True, **kwarg...
    method decode (line 223) | def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
    method _split_whitespaces_or_nonwhitespaces (line 242) | def _split_whitespaces_or_nonwhitespaces(s: str, max_consecutive_slice...
    method pre_tokenizer_process (line 265) | def pre_tokenizer_process(self, text: str) -> List[str]:
    method vocab_size (line 275) | def vocab_size(self) -> int:
    method get_vocab (line 278) | def get_vocab(self) -> Dict[str, int]:
    method _tokenize (line 281) | def _tokenize(self, text: str, **kwargs) -> List[str]:
    method _convert_token_to_id (line 284) | def _convert_token_to_id(self, token: str) -> int:
    method _convert_id_to_token (line 287) | def _convert_id_to_token(self, index: int) -> str:
    method clean_up_tokenization (line 291) | def clean_up_tokenization(out_string: str) -> str:
    method convert_tokens_to_string (line 294) | def convert_tokens_to_string(self, tokens: List[str]) -> str:
    method save_vocabulary (line 299) | def save_vocabulary(self, save_directory: str, filename_prefix: Option...
    method apply_chat_template (line 311) | def apply_chat_template(
  function deep_sort_dict (line 346) | def deep_sort_dict(obj: Any) -> Any:

FILE: paddleformers/transformers/kimi_k25/tool_declaration_ts.py
  class _SchemaRegistry (line 30) | class _SchemaRegistry:
    method __init__ (line 33) | def __init__(self):
    method register_definitions (line 37) | def register_definitions(self, defs: dict[str, Any]):
    method resolve_ref (line 44) | def resolve_ref(self, ref: str) -> dict[str, Any]:
  function _format_description (line 58) | def _format_description(description: str, indent: str = "") -> str:
  class _BaseType (line 62) | class _BaseType:
    method __init__ (line 66) | def __init__(
    method to_typescript_style (line 75) | def to_typescript_style(self, indent: str = "") -> str:
    method format_docstring (line 78) | def format_docstring(self, indent: str) -> str:
  class _ParameterTypeScalar (line 89) | class _ParameterTypeScalar(_BaseType):
    method __init__ (line 92) | def __init__(self, type: str, extra_props: dict[str, Any] | None = None):
    method to_typescript_style (line 103) | def to_typescript_style(self, indent: str = "") -> str:
  class _ParameterTypeObject (line 110) | class _ParameterTypeObject(_BaseType):
    method __init__ (line 114) | def __init__(self, json_schema_object: dict[str, Any], registry: _Sche...
    method to_typescript_style (line 146) | def to_typescript_style(self, indent: str = "") -> str:
  class _ParameterTypeArray (line 182) | class _ParameterTypeArray(_BaseType):
    method __init__ (line 185) | def __init__(self, json_schema_object: dict[str, Any], registry: _Sche...
    method to_typescript_style (line 192) | def to_typescript_style(self, indent: str = "") -> str:
  class _ParameterTypeEnum (line 209) | class _ParameterTypeEnum(_BaseType):
    method __init__ (line 213) | def __init__(self, json_schema_object: dict[str, Any]):
    method to_typescript_style (line 242) | def to_typescript_style(self, indent: str = "") -> str:
  class _ParameterTypeAnyOf (line 246) | class _ParameterTypeAnyOf(_BaseType):
    method __init__ (line 249) | def __init__(
    method to_typescript_style (line 257) | def to_typescript_style(self, indent: str = "") -> str:
  class _ParameterTypeUnion (line 261) | class _ParameterTypeUnion(_BaseType):
    method __init__ (line 264) | def __init__(self, json_schema_object: dict[str, Any]):
    method to_typescript_style (line 278) | def to_typescript_style(self, indent: str = "") -> str:
  class _ParameterTypeRef (line 282) | class _ParameterTypeRef(_BaseType):
    method __init__ (line 286) | def __init__(self, json_schema_object: dict[str, Any], registry: _Sche...
    method to_typescript_style (line 298) | def to_typescript_style(self, indent: str = "") -> str:
  class _Parameter (line 314) | class _Parameter:
    method parse_extended (line 326) | def parse_extended(cls, attributes: dict[str, Any]) -> "_Parameter":
    method to_typescript_style (line 337) | def to_typescript_style(self, indent: str = "") -> str:
  function _parse_parameter_type (line 354) | def _parse_parameter_type(
  function _openai_function_to_typescript_style (line 387) | def _openai_function_to_typescript_style(
  function encode_tools_to_typescript_style (line 437) | def encode_tools_to_typescript_style(

FILE: paddleformers/transformers/kimi_k25/vision_processor.py
  function resampling (line 46) | def resampling(
  class KimiK25VisionProcessor (line 60) | class KimiK25VisionProcessor(BaseImageProcessor):
    method __init__ (line 63) | def __init__(
    method media_tokens_calculator (line 72) | def media_tokens_calculator(self, media: MediaInput):
    method make_chunk_prompt (line 78) | def make_chunk_prompt(cls, timestamp_text: str) -> str:
    method split_video_chunks (line 81) | def split_video_chunks(
    method get_resize_config (line 116) | def get_resize_config(self, media_input: MediaInput) -> dict:
    method resize_image (line 163) | def resize_image(
    method preprocess (line 185) | def preprocess(
    method __repr__ (line 240) | def __repr__(self):
    method to_dict (line 243) | def to_dict(self) -> Dict[str, Any]:
    method from_dict (line 251) | def from_dict(cls, config_dict: Dict[str, Any], **kwargs):
    method to_json_string (line 262) | def to_json_string(self):

FILE: paddleformers/transformers/kto_criterion.py
  class KTOCriterion (line 33) | class KTOCriterion(nn.Layer):
    method __init__ (line 36) | def __init__(self, config, kto_config=None, ignore_label=0, use_infohu...
    method _nested_gather (line 61) | def _nested_gather(self, tensors):
    method kto_logps (line 80) | def kto_logps(self, logits, response_labels, response_kl_labels, respo...
    method kto_loss (line 181) | def kto_loss(
    method forward (line 213) | def forward(

FILE: paddleformers/transformers/legacy/tokenizer_utils.py
  function convert_to_unicode (line 80) | def convert_to_unicode(text):
  function whitespace_tokenize (line 96) | def whitespace_tokenize(text):
  function _is_whitespace (line 111) | def _is_whitespace(char):
  function _is_control (line 125) | def _is_control(char):
  function _is_punctuation (line 137) | def _is_punctuation(char):
  function _is_end_of_word (line 152) | def _is_end_of_word(text):
  function _is_start_of_word (line 158) | def _is_start_of_word(text):
  function _insert_one_token_to_ordered_list (line 164) | def _insert_one_token_to_ordered_list(token_list: List[str], new_token: ...
  function is_chinese_char (line 177) | def is_chinese_char(cp):
  function _is_nonnormalized_char (line 202) | def _is_nonnormalized_char(char):
  function _is_nonnormalized_numeric (line 217) | def _is_nonnormalized_numeric(char):
  function normalize_chars (line 231) | def normalize_chars(text):
  function _is_symbol (line 253) | def _is_symbol(char):
  function tokenize_special_chars (line 263) | def tokenize_special_chars(text):
  class Trie (line 282) | class Trie:
    method __init__ (line 288) | def __init__(self):
    method add (line 291) | def add(self, word: str):
    method split (line 320) | def split(self, text: str) -> List[str]:
    method cut_text (line 473) | def cut_text(self, text, offsets):
  function tokenize_chinese_chars (line 497) | def tokenize_chinese_chars(text):
  class ChatTemplate (line 518) | class ChatTemplate:
    method _compile_jinja_template (line 525) | def _compile_jinja_template(chat_template) -> Template:
    method render_conversation (line 549) | def render_conversation(
    method render_query (line 580) | def render_query(self, query: str, index: int = 0, context_data: Dict[...
    method _init_context_data (line 587) | def _init_context_data(self, context_data: Dict[str, Union[int, str]] ...
    method render_system (line 592) | def render_system(self, context_data: Dict[str, Union[int, str]] = {})...
    method __call__ (line 599) | def __call__(self, conversations: list[list[str]] | str, context_data:...
    method from_dict (line 632) | def from_dict(cls, config: dict):
    method from_file (line 636) | def from_file(cls, file: str):
  class ChatTemplateMixin (line 642) | class ChatTemplateMixin:
    method apply_chat_template (line 645) | def apply_chat_template(
    method _apply_chat_template_paddle (line 689) | def _apply_chat_template_paddle(
    method _apply_chat_template (line 707) | def _apply_chat_template(
    method encode_chat_inputs (line 741) | def encode_chat_inputs(
    method _encode_chat_inputs_paddle (line 772) | def _encode_chat_inputs_paddle(self, conversations: List[List[str, str...
    method _encode_chat_inputs_openai_format (line 797) | def _encode_chat_inputs_openai_format(
    method _encode_chat_inputs (line 839) | def _encode_chat_inputs(
    method _extract_non_learnable_parts (line 899) | def _extract_non_learnable_parts(self, origin_msg: List[Dict[str, str]...
    method from_pretrained (line 913) | def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
    method init_chat_template (line 941) | def init_chat_template(self, chat_template: str | dict):
    method save_resources (line 966) | def save_resources(self, save_directory):
  class PretrainedTokenizer (line 977) | class PretrainedTokenizer(ChatTemplateMixin, PretrainedTokenizerBase):
    method _pre_init (line 1024) | def _pre_init(self, original_init, *args, **kwargs):
    method _build_special_tokens_map_extended (line 1050) | def _build_special_tokens_map_extended(self, **kwargs):
    method vocab_size (line 1067) | def vocab_size(self) -> int:
    method is_fast (line 1074) | def is_fast(self) -> bool:
    method get_added_vocab (line 1077) | def get_added_vocab(self) -> Dict[str, int]:
    method __len__ (line 1086) | def __len__(self):
    method _add_tokens (line 1092) | def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], ...
    method _create_trie (line 1156) | def _create_trie(self, unique_no_split_tokens):
    method prepare_for_tokenization (line 1165) | def prepare_for_tokenization(self, text, is_split_into_words=False, **...
    method tokenize (line 1188) | def tokenize(self, text: TextInput, **kwargs) -> List[str]:
    method _tokenize (line 1259) | def _tokenize(self, text, **kwargs):
    method convert_tokens_to_ids (line 1268) | def convert_tokens_to_ids(self, tokens):
    method _convert_token_to_id_with_added_voc (line 1281) | def _convert_token_to_id_with_added_voc(self, token):
    method _convert_token_to_id (line 1289) | def _convert_token_to_id(self, token):
    method convert_tokens_to_string (line 1293) | def convert_tokens_to_string(self, tokens):
    method convert_ids_to_tokens (line 1306) | def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
    method _convert_id_to_token (line 1327) | def _convert_id_to_token(self, index):
    method load_vocabulary (line 1332) | def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_toke...
    method save_vocabulary (line 1364) | def save_vocabulary(filepath, vocab):
    method get_special_tokens_mask (line 1381) | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, alrea...
    method num_special_tokens_to_add (line 1408) | def num_special_tokens_to_add(self, pair):
    method _encode_plus (line 1423) | def _encode_plus(
    method _batch_encode_plus (line 1500) | def _batch_encode_plus(
    method _batch_prepare_for_model (line 1602) | def _batch_prepare_for_model(
    method _get_bert_like_offset_mapping (line 1762) | def _get_bert_like_offset_mapping(self, text: str):
    method get_offset_mapping (line 1845) | def get_offset_mapping(self, text: str, split_tokens: Optional[List[st...
    method _decode (line 1933) | def _decode(
  class BPETokenizer (line 1976) | class BPETokenizer(PretrainedTokenizer):
    class Encoder (line 2006) | class Encoder(object):
      method __init__ (line 2007) | def __init__(self, encoder, bpe_merges, errors="replace", special_to...
      method _bytes_to_unicode (line 2024) | def _bytes_to_unicode(self):
      method _get_pairs (line 2053) | def _get_pairs(self, word):
      method bpe (line 2064) | def bpe(self, token):
      method tokenize (line 2106) | def tokenize(self, text):
      method tokenize_old (line 2122) | def tokenize_old(self, text):
      method is_special_token (line 2125) | def is_special_token(self, tok):
      method tokenize_bpe (line 2136) | def tokenize_bpe(self, token):
      method encode (line 2145) | def encode(self, text):
      method decode (line 2151) | def decode(self, tokens):
    method __init__ (line 2175) | def __init__(
    method _tokenize (line 2194) | def _tokenize(self, text, is_sentencepiece=True):
    method _get_encoder (line 2205) | def _get_encoder(self, encoder_json_path, vocab_bpe_path):

FILE: paddleformers/transformers/legacy/tokenizer_utils_base.py
  function import_protobuf_decode_error (line 63) | def import_protobuf_decode_error(error_message=""):
  class AddedToken (line 84) | class AddedToken:
    method __init__ (line 92) | def __init__(
    method __getstate__ (line 102) | def __getstate__(self):
    method __str__ (line 105) | def __str__(self):
    method __repr__ (line 108) | def __repr__(self) -> str:
  class EncodingFast (line 112) | class EncodingFast:
  class ExplicitEnum (line 118) | class ExplicitEnum(Enum):
    method _missing_ (line 124) | def _missing_(cls, value):
  class PaddingStrategy (line 130) | class PaddingStrategy(ExplicitEnum):
  class TensorType (line 141) | class TensorType(ExplicitEnum):
  function to_py_obj (line 169) | def to_py_obj(obj):
  function _is_numpy (line 185) | def _is_numpy(x):
  class TruncationStrategy (line 189) | class TruncationStrategy(ExplicitEnum):
  class CharSpan (line 201) | class CharSpan(NamedTuple):
  class TokenSpan (line 214) | class TokenSpan(NamedTuple):
  class BatchEncoding (line 227) | class BatchEncoding(UserDict):
    method __init__ (line 247) | def __init__(
    method n_sequences (line 270) | def n_sequences(self) -> Optional[int]:
    method is_fast (line 279) | def is_fast(self) -> bool:
    method __getitem__ (line 286) | def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
    method __getattr__ (line 303) | def __getattr__(self, item: str):
    method __getstate__ (line 309) | def __getstate__(self):
    method __setstate__ (line 312) | def __setstate__(self, state):
    method keys (line 319) | def keys(self):
    method values (line 322) | def values(self):
    method items (line 325) | def items(self):
    method encodings (line 333) | def encodings(self) -> Optional[List[EncodingFast]]:
    method tokens (line 340) | def tokens(self, batch_index: int = 0) -> List[str]:
    method sequence_ids (line 355) | def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
    method words (line 376) | def words(self, batch_index: int = 0) -> List[Optional[int]]:
    method word_ids (line 397) | def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
    method token_to_sequence (line 413) | def token_to_sequence(self, batch_or_token_index: int, token_index: Op...
    method token_to_word (line 452) | def token_to_word(self, batch_or_token_index: int, token_index: Option...
    method word_to_tokens (line 490) | def word_to_tokens(
    method token_to_chars (line 541) | def token_to_chars(self, batch_or_token_index: int, token_index: Optio...
    method char_to_token (line 577) | def char_to_token(
    method word_to_chars (line 618) | def word_to_chars(
    method char_to_word (line 663) | def char_to_word(self, batch_or_char_index: int, char_index: Optional[...
    method convert_to_tensors (line 702) | def convert_to_tensors(
  class SpecialTokensMixin (line 753) | class SpecialTokensMixin:
    method __init__ (line 791) | def __init__(self, verbose=True, **kwargs):
    method sanitize_special_tokens (line 824) | def sanitize_special_tokens(self) -> int:
    method add_special_tokens (line 836) | def add_special_tokens(
    method add_tokens (line 930) | def add_tokens(
    method _add_extra_special_tokens (line 976) | def _add_extra_special_tokens(cls, extra_sp_token: Union[str, AddedTok...
    method _add_tokens (line 980) | def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], ...
    method bos_token (line 984) | def bos_token(self) -> str:
    method eos_token (line 994) | def eos_token(self) -> str:
    method unk_token (line 1004) | def unk_token(self) -> str:
    method sep_token (line 1014) | def sep_token(self) -> str:
    method pad_token (line 1025) | def pad_token(self) -> str:
    method cls_token (line 1035) | def cls_token(self) -> str:
    method mask_token (line 1046) | def mask_token(self) -> str:
    method additional_special_tokens (line 1057) | def additional_special_tokens(self) -> List[str]:
    method bos_token (line 1068) | def bos_token(self, value):
    method eos_token (line 1072) | def eos_token(self, value):
    method unk_token (line 1076) | def unk_token(self, value):
    method sep_token (line 1080) | def sep_token(self, value):
    method pad_token (line 1084) | def pad_token(self, value):
    method cls_token (line 1088) | def cls_token(self, value):
    method mask_token (line 1092) | def mask_token(self, value):
    method additional_special_tokens (line 1096) | def additional_special_tokens(self, value):
    method bos_token_id (line 1100) | def bos_token_id(self) -> Optional[int]:
    method eos_token_id (line 1110) | def eos_token_id(self) -> Optional[int]:
    method unk_token_id (line 1120) | def unk_token_id(self) -> Optional[int]:
    method sep_token_id (line 1129) | def sep_token_id(self) -> Optional[int]:
    method pad_token_id (line 1139) | def pad_token_id(self) -> Optional[int]:
    method pad_token_type_id (line 1148) | def pad_token_type_id(self) -> int:
    method cls_token_id (line 1155) | def cls_token_id(self) -> Optional[int]:
    method mask_token_id (line 1167) | def mask_token_id(self) -> Optional[int]:
    method additional_special_tokens_ids (line 1177) | def additional_special_tokens_ids(self) -> List[int]:
    method bos_token_id (line 1185) | def bos_token_id(self, value):
    method eos_token_id (line 1189) | def eos_token_id(self, value):
    method unk_token_id (line 1193) | def unk_token_id(self, value):
    method sep_token_id (line 1197) | def sep_token_id(self, value):
    method pad_token_id (line 1201) | def pad_token_id(self, value):
    method cls_token_id (line 1205) | def cls_token_id(self, value):
    method mask_token_id (line 1209) | def mask_token_id(self, value):
    method additional_special_tokens_ids (line 1213) | def additional_special_tokens_ids(self, values):
    method special_tokens_map (line 1217) | def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
    method special_tokens_map_extended (line 1242) | def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedTok...
    method all_special_tokens (line 1265) | def all_special_tokens(self) -> List[str]:
    method all_special_tokens_extended (line 1276) | def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
    method all_special_ids (line 1296) | def all_special_ids(self) -> List[int]:
  class PretrainedTokenizerBase (line 1305) | class PretrainedTokenizerBase(SpecialTokensMixin):
    method __init__ (line 1387) | def __init__(self, **kwargs):
    method max_len_single_sentence (line 1428) | def max_len_single_sentence(self) -> int:
    method max_len_sentences_pair (line 1435) | def max_len_sentences_pair(self) -> int:
    method max_len_single_sentence (line 1442) | def max_len_single_sentence(self, value) -> int:
    method _switch_to_input_mode (line 1455) | def _switch_to_input_mode(self):
    method max_len_sentences_pair (line 1462) | def max_len_sentences_pair(self, value) -> int:
    method _set_processor_class (line 1475) | def _set_processor_class(self, processor_class: str):
    method __repr__ (line 1479) | def __repr__(self) -> str:
    method get_vocab (line 1489) | def get_vocab(self) -> Dict[str, int]:
    method from_pretrained (line 1502) | def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
    method _from_pretrained (line 1623) | def _from_pretrained(
    method save_pretrained (line 1818) | def save_pretrained(self, save_directory, filename_prefix: Optional[st...
    method _save_pretrained (line 1914) | def _save_pretrained(
    method save_resources (line 1937) | def save_resources(self, save_directory):
    method save_to_hf_hub (line 1951) | def save_to_hf_hub(
    method save_to_aistudio (line 2011) | def save_to_aistudio(
    method tokenize (line 2059) | def tokenize(self, text: str, pair: Optional[str] = None, add_special_...
    method num_special_tokens_to_add (line 2079) | def num_special_tokens_to_add(self, pair: bool = False) -> int:
    method _get_padding_truncation_strategies (line 2082) | def _get_padding_truncation_strategies(
    method __call__ (line 2219) | def __call__(
    method encode (line 2492) | def encode(
    method encode_plus (line 2571) | def encode_plus(
    method _encode_plus (line 2645) | def _encode_plus(
    method batch_encode (line 2670) | def batch_encode(
    method _batch_encode_plus (line 2761) | def _batch_encode_plus(
    method pad (line 2793) | def pad(
    method create_token_type_ids_from_sequences (line 2982) | def create_token_type_ids_from_sequences(
    method build_inputs_with_special_tokens (line 3002) | def build_inputs_with_special_tokens(
    method build_offset_mapping_with_special_tokens (line 3022) | def build_offset_mapping_with_special_tokens(self, offset_mapping_0, o...
    method prepare_for_model (line 3042) | def prepare_for_model(
    method truncate_sequences (line 3197) | def truncate_sequences(
    method _pad (line 3316) | def _pad(
    method convert_tokens_to_string (line 3462) | def convert_tokens_to_string(self, tokens: List[str]) -> str:
    method decode_token (line 3475) | def decode_token(
    method batch_decode (line 3508) | def batch_decode(
    method decode (line 3541) | def decode(
    method _decode (line 3577) | def _decode(
    method get_special_tokens_mask (line 3586) | def get_special_tokens_mask(
    method clean_up_tokenization (line 3618) | def clean_up_tokenization(out_string: str) -> str:
    method _eventual_warn_about_too_long_sequence (line 3642) | def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_l...

FILE: paddleformers/transformers/linear_utils.py
  class ColumnSequenceParallelLinearPass (line 43) | class ColumnSequenceParallelLinearPass(object):
  class RowSequenceParallelLinearPass (line 51) | class RowSequenceParallelLinearPass(object):

FILE: paddleformers/transformers/llama/auto_dist_config.py
  function get_dist_config (line 18) | def get_dist_config(model, prefix=""):

FILE: paddleformers/transformers/llama/modeling.py
  function rotate_half (line 40) | def rotate_half(x: paddle.Tensor) -> paddle.Tensor:
  function apply_rotary_pos_emb (line 50) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
  class LLamaAttention (line 71) | class LLamaAttention(nn.Layer):
    method __init__ (line 72) | def __init__(self, config: LlamaConfig, layer_idx: int):
    method forward (line 141) | def forward(
  class LlamaDecoderLayer (line 189) | class LlamaDecoderLayer(nn.Layer):
    method __init__ (line 190) | def __init__(self, config: LlamaConfig, layer_idx: int):
    method forward (line 213) | def forward(
  class LlamaRotaryEmbedding (line 248) | class LlamaRotaryEmbedding(nn.Layer):
    method __init__ (line 249) | def __init__(self, config):
    method compute_default_rope_parameters (line 269) | def compute_default_rope_parameters(
    method forward (line 294) | def forward(self, x, position_ids):
  class LlamaPretrainedModel (line 310) | class LlamaPretrainedModel(PretrainedModel):
    method _gen_aoa_config (line 324) | def _gen_aoa_config(cls, config: LlamaConfig):
    method _gen_inv_aoa_config (line 356) | def _gen_inv_aoa_config(cls, config: LlamaConfig):
  class LlamaModel (line 387) | class LlamaModel(LlamaPretrainedModel):
    method __init__ (line 388) | def __init__(self, config: LlamaConfig):
    method forward (line 414) | def forward(
    method recompute_training (line 523) | def recompute_training(
  class LlamaForCausalLM (line 553) | class LlamaForCausalLM(LlamaPretrainedModel):
    method __init__ (line 556) | def __init__(self, config: LlamaConfig):
    method forward (line 564) | def forward(
    method auto_dist_config (line 628) | def auto_dist_config(self, prefix=""):
  class LlamaForCausalLMPipe (line 633) | class LlamaForCausalLMPipe(GeneralModelForCausalLMPipe):

FILE: paddleformers/transformers/masking_utils.py
  function prepare_sliding_window_startend_row_indices (line 22) | def prepare_sliding_window_startend_row_indices(
  function create_causal_masks_and_row_indices (line 54) | def create_causal_masks_and_row_indices(
  function create_causal_mask_and_row_indices (line 200) | def create_causal_mask_and_row_indices(
  function create_sliding_window_causal_mask_and_row_indices (line 277) | def create_sliding_window_causal_mask_and_row_indices(

FILE: paddleformers/transformers/mc2_parallel_linear.py
  function is_mc2_valid (line 38) | def is_mc2_valid():
  class MC2ColumnParallelCoreLinear (line 47) | class MC2ColumnParallelCoreLinear(PyLayer):
    method forward (line 49) | def forward(ctx, input_, weight, group):
    method backward (line 57) | def backward(ctx, dy):
  class MC2RowParallelCoreLinear (line 77) | class MC2RowParallelCoreLinear(PyLayer):
    method forward (line 79) | def forward(ctx, input_, weight, group):
    method backward (line 92) | def backward(ctx, dy):
  class MC2ColumnSeqParallelCoreLinear (line 104) | class MC2ColumnSeqParallelCoreLinear(PyLayer):
    method forward (line 106) | def forward(ctx, input_, weight, group):
    method backward (line 131) | def backward(ctx, grad_output):
  class MC2RowSeqParallelCoreLinear (line 161) | class MC2RowSeqParallelCoreLinear(PyLayer):
    method forward (line 163) | def forward(ctx, input_, weight, group):
    method backward (line 186) | def backward(ctx, grad_output):
  class MC2ColumnSeqParallelLinear (line 209) | class MC2ColumnSeqParallelLinear(ColumnSequenceParallelLinear):
    method forward (line 210) | def forward(self, x):
  class MC2RowSeqParallelLinear (line 215) | class MC2RowSeqParallelLinear(RowSequenceParallelLinear):
    method forward (line 216) | def forward(self, x):

FILE: paddleformers/transformers/model_outputs.py
  function tuple_output (line 31) | def tuple_output(outputs: Tuple[Tensor], loss: Optional[Tensor] = None):
  function convert_encoder_output (line 45) | def convert_encoder_output(encoder_output):
  function layer_init_wrapper (line 61) | def layer_init_wrapper(func):
  function _transformer_encoder_layer_fwd (line 75) | def _transformer_encoder_layer_fwd(self, src, src_mask=None, cache=None,...
  function _transformer_decoder_layer_fwd (line 107) | def _transformer_decoder_layer_fwd(
  function _transformer_decoder_fwd (line 185) | def _transformer_decoder_fwd(
  function _transformer_encoder_fwd (line 270) | def _transformer_encoder_fwd(
  function _get_wrap_setattr (line 373) | def _get_wrap_setattr(cls):
  function is_tensor (line 392) | def is_tensor(x):
  class ModelOutput (line 399) | class ModelOutput(OrderedDict):
    method __post_init__ (line 413) | def __post_init__(self):
    method __delitem__ (line 465) | def __delitem__(self, *args, **kwargs):
    method setdefault (line 468) | def setdefault(self, *args, **kwargs):
    method pop (line 471) | def pop(self, *args, **kwargs):
    method update (line 474) | def update(self, *args, **kwargs):
    method __getitem__ (line 477) | def __getitem__(self, k):
    method __setattr__ (line 484) | def __setattr__(self, name, value):
    method __setitem__ (line 490) | def __setitem__(self, key, value):
    method to_tuple (line 496) | def to_tuple(self) -> Tuple[Any]:
  class BaseModelOutput (line 513) | class BaseModelOutput(ModelOutput):
  class BaseModelOutputWithNoAttention (line 539) | class BaseModelOutputWithNoAttention(ModelOutput):
  class BaseModelOutputWithPooling (line 558) | class BaseModelOutputWithPooling(ModelOutput):
  class BaseModelOutputWithPast (line 590) | class BaseModelOutputWithPast(ModelOutput):
  class BaseModelOutputWithPastAndCrossAttentions (line 629) | class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
  class BaseModelOutputWithPastAndMTP (line 680) | class BaseModelOutputWithPastAndMTP(ModelOutput):
  class BaseModelOutputWithPoolingAndCrossAttentions (line 723) | class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
  class SequenceClassifierOutput (line 772) | class SequenceClassifierOutput(ModelOutput):
  class TokenClassifierOutput (line 801) | class TokenClassifierOutput(ModelOutput):
  class QuestionAnsweringModelOutput (line 830) | class QuestionAnsweringModelOutput(ModelOutput):
  class MultipleChoiceModelOutput (line 862) | class MultipleChoiceModelOutput(ModelOutput):
  class MaskedLMOutput (line 893) | class MaskedLMOutput(ModelOutput):
  class CausalLMOutputWithPast (line 922) | class CausalLMOutputWithPast(ModelOutput):
  class CausalLMOutputWithCrossAttentions (line 959) | class CausalLMOutputWithCrossAttentions(ModelOutput):
  class Seq2SeqModelOutput (line 1003) | class Seq2SeqModelOutput(ModelOutput):
  class Seq2SeqLMOutput (line 1070) | class Seq2SeqLMOutput(ModelOutput):
  class Seq2SeqQuestionAnsweringModelOutput (line 1135) | class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
  class Seq2SeqSequenceClassifierOutput (line 1196) | class Seq2SeqSequenceClassifierOutput(ModelOutput):
  class SequenceClassifierOutputWithPast (line 1254) | class SequenceClassifierOutputWithPast(ModelOutput):
  class BackboneOutput (line 1290) | class BackboneOutput(ModelOutput):
  class BaseModelOutputWithPoolingAndNoAttention (line 1317) | class BaseModelOutputWithPoolingAndNoAttention(ModelOutput):
  class ImageClassifierOutputWithNoAttention (line 1339) | class ImageClassifierOutputWithNoAttention(ModelOutput):
  class DepthEstimatorOutput (line 1360) | class DepthEstimatorOutput(ModelOutput):
  class SemanticSegmenterOutput (line 1390) | class SemanticSegmenterOutput(ModelOutput):
  class Seq2SeqSpectrogramOutput (line 1421) | class Seq2SeqSpectrogramOutput(ModelOutput):
  class MoEModelOutputWithPast (line 1481) | class MoEModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 1523) | class MoECausalLMOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPastAndMTP (line 1575) | class MoECausalLMOutputWithPastAndMTP(MoECausalLMOutputWithPast):

FILE: paddleformers/transformers/model_provider.py
  class DistributedDataParallelConfig (line 59) | class DistributedDataParallelConfig:
    method __init__ (line 60) | def __init__(self):
  class ModelProviderMixin (line 64) | class ModelProviderMixin(abc.ABC, Generic[ModelT]):
    method provide (line 83) | def provide(
    method provide_distributed_model (line 102) | def provide_distributed_model(
    method initialize_model_parallel (line 201) | def initialize_model_parallel(
    method meta_model (line 232) | def meta_model(self) -> list[ModelT]:
    method pre_wrap_hook (line 241) | def pre_wrap_hook(self) -> Callable[[list[FleetLayer]], list[FleetLaye...
    method register_pre_wrap_hook (line 264) | def register_pre_wrap_hook(
    method post_wrap_hook (line 287) | def post_wrap_hook(self) -> Callable[[list[FleetLayer]], list[FleetLay...
    method register_post_wrap_hook (line 310) | def register_post_wrap_hook(
  function get_model (line 333) | def get_model(
  function _create_model (line 458) | def _create_model(
  function _print_num_params (line 526) | def _print_num_params(model: list[FleetLayer]) -> None:

FILE: paddleformers/transformers/model_utils.py
  function fit_bf16_to_uint16_np (line 118) | def fit_bf16_to_uint16_np(tensor):
  function dy2st_nocheck_guard_context (line 124) | def dy2st_nocheck_guard_context():
  function unwrap_optimizer (line 132) | def unwrap_optimizer(optimizer, optimizer_instances=()):
  function prune_linear_layer (line 153) | def prune_linear_layer(layer: nn.Linear, index: paddle.Tensor, dim: int ...
  function find_pruneable_heads_and_indices (line 184) | def find_pruneable_heads_and_indices(
  function apply_chunking_to_forward (line 208) | def apply_chunking_to_forward(
  function unwrap_model (line 275) | def unwrap_model(model, *args, **kwargs):
  function _add_variant (line 292) | def _add_variant(weights_name: str, variant=None) -> str:
  function dtype_guard (line 302) | def dtype_guard(dtype="float32"):
  function no_init_weights (line 315) | def no_init_weights(_enable=True):
  function get_parameter_dtype (line 331) | def get_parameter_dtype(parameter: nn.Layer) -> paddle.dtype:
  function _split_keys_evenly (line 351) | def _split_keys_evenly(keys: list, n: int) -> list:
  function _load_part_state_dict (line 377) | def _load_part_state_dict(
  function load_state_dict (line 510) | def load_state_dict(
  function prepare_safe_save_state_dict (line 621) | def prepare_safe_save_state_dict(state_dict, save_to_hf=True):
  function resolve_weight_file_from_hf_hub (line 632) | def resolve_weight_file_from_hf_hub(
  function register_base_model (line 678) | def register_base_model(cls):
  class BackboneMixin (line 707) | class BackboneMixin:
    method forward_with_filtered_kwargs (line 708) | def forward_with_filtered_kwargs(self, *args, **kwargs):
  function _partion_for_pipeline_mode (line 718) | def _partion_for_pipeline_mode(keys):
  function shard_checkpoint (line 745) | def shard_checkpoint(
  function load_sharded_checkpoint (line 857) | def load_sharded_checkpoint(model, folder, variant=None, strict=True, pr...
  function faster_set_state_dict (line 946) | def faster_set_state_dict(model, state_dict, model_state_dict=None, stri...
  function _load_state_dict_into_model (line 998) | def _load_state_dict_into_model(model_to_load, state_dict, start_prefix,...
  function _convert_state_dict_dtype_and_shape (line 1033) | def _convert_state_dict_dtype_and_shape(state_dict, model_to_load_state_...
  function _load_state_dict_into_meta_model (line 1054) | def _load_state_dict_into_meta_model(
  function _parse_size (line 1119) | def _parse_size(size_str: str) -> int:
  function clean_unrelated_safetensors (line 1144) | def clean_unrelated_safetensors(save_dir):
  function get_common_folder (line 1174) | def get_common_folder(file_list):
  class PretrainedModel (line 1184) | class PretrainedModel(Layer, GenerationMixin, ConversionMixin):
    method __init__ (line 1250) | def __init__(self, *args, **kwargs):
    method _post_init (line 1283) | def _post_init(self, original_init, *args, **kwargs):
    method _init_weights (line 1311) | def _init_weights(self, layer):
    method _initialize_weights (line 1317) | def _initialize_weights(self, layer):
    method init_weights (line 1326) | def init_weights(self):
    method _from_config (line 1343) | def _from_config(cls, config, **kwargs):
    method from_config (line 1365) | def from_config(cls, config, **kwargs):
    method set_inference_config (line 1376) | def set_inference_config(cls, config, predictor_args, **kwargs):
    method confirm_inference_model (line 1451) | def confirm_inference_model(cls, predictor_args, **kwargs):
    method base_model (line 1463) | def base_model(self):
    method model_name_list (line 1472) | def model_name_list(self):
    method can_generate (line 1480) | def can_generate(self) -> bool:
    method recompute_enable (line 1491) | def recompute_enable(self):
    method recompute_disable (line 1503) | def recompute_disable(self):
    method get_memory_footprint (line 1515) | def get_memory_footprint(self, return_buffers=True):
    method get_model_flops (line 1531) | def get_model_flops(self, *args, **kwargs):
    method get_hardware_flops (line 1537) | def get_hardware_flops(self, *args, **kwargs):
    method get_input_embeddings (line 1543) | def get_input_embeddings(self) -> nn.Embedding:
    method set_input_embeddings (line 1568) | def set_input_embeddings(self, value: Embedding):
    method get_output_embeddings (line 1592) | def get_output_embeddings(self) -> Optional[Embedding]:
    method set_output_embeddings (line 1604) | def set_output_embeddings(self, new_embeddings):
    method get_decoder (line 1611) | def get_decoder(self):
    method set_decoder (line 1626) | def set_decoder(self, decoder):
    method tie_weights (line 1637) | def tie_weights(self):
    method resize_position_embeddings (line 1677) | def resize_position_embeddings(self, new_num_position_embeddings: int):
    method constructed_from_pretrained_config (line 1692) | def constructed_from_pretrained_config(cls, init_func=None) -> bool:
    method save_model_config (line 1699) | def save_model_config(self, save_dir: str, **kwargs):
    method save_to_hf_hub (line 1710) | def save_to_hf_hub(
    method save_to_aistudio (line 1772) | def save_to_aistudio(
    method resize_token_embeddings (line 1836) | def resize_token_embeddings(self, new_num_tokens: Optional[int] = None...
    method _update_init_config (line 1868) | def _update_init_config(self, init_config: dict, key: str, value: Any):
    method _get_resized_embeddings (line 1885) | def _get_resized_embeddings(
    method __setattr__ (line 1936) | def __setattr__(self, name, value):
    method _resolve_model_file_path (line 1941) | def _resolve_model_file_path(
    method _load_pretrained_model (line 2178) | def _load_pretrained_model(
    method from_pretrained (line 2699) | def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
    method _get_key_renaming_mapping (line 3110) | def _get_key_renaming_mapping(
    method save_pretrained (line 3140) | def save_pretrained(
    method merge_auto_dist_configs (line 3386) | def merge_auto_dist_configs(self, configs):
    method _generate_auto_dist_config (line 3455) | def _generate_auto_dist_config(self, auto_dist_degree):
  class PipelinePretrainedModel (line 3507) | class PipelinePretrainedModel(PretrainedModel):
    method __init_hook__ (line 3508) | def __init_hook__(self):
    method __init__ (line 3514) | def __init__(self, config, *args, **kwargs):
    method add_sequential_layer (line 3518) | def add_sequential_layer(self, layer_desc, name_prefix=""):
    method get_sequential_layers (line 3522) | def get_sequential_layers(self):
    method get_sequential_name_prefixes (line 3526) | def get_sequential_name_prefixes(self):
    method _set_pipeline_name_mapping (line 3530) | def _set_pipeline_name_mapping(self, mappings=None):
    method get_shardlayer_prefix (line 3595) | def get_shardlayer_prefix(self, name_splited, shared_layer_class=Share...
    method state_dict (line 3623) | def state_dict(self, *args, **kwargs):
    method sharded_state_dict (line 3636) | def sharded_state_dict(self, *args, **kwargs):
    method set_state_dict (line 3669) | def set_state_dict(self, state_dict, *args, **kwargs):
  function load_sharded_checkpoint_as_one (line 3684) | def load_sharded_checkpoint_as_one(folder, variant=None, return_numpy=Fa...
  function load_tp_checkpoint (line 3759) | def load_tp_checkpoint(folder, cls, config, return_numpy=False, convert_...
  function clean_model_class_name (line 3814) | def clean_model_class_name(class_name, suffixes_to_strip: Union[str, Lis...
  function save_full_param (line 3837) | def save_full_param(
  function replace_name_and_gen_index (line 3929) | def replace_name_and_gen_index(path, total_size, save_peft=False):
  class HFFormatFullParamSaver (line 4003) | class HFFormatFullParamSaver:
    method __init__ (line 4004) | def __init__(
    method get_full_param_iter (line 4025) | def get_full_param_iter(self):
    method determin_saver_based_group (line 4047) | def determin_saver_based_group(self):
    method save_checkpoint (line 4063) | def save_checkpoint(self, path, max_shard_size="16GB", save_peft=False):
  class EMAStateHFFormatFullParamSaver (line 4086) | class EMAStateHFFormatFullParamSaver(HFFormatFullParamSaver):
    method __init__ (line 4087) | def __init__(
    method get_full_param_iter (line 4110) | def get_full_param_iter(self):
  class SonicMoEHFFormatFullParamSaver (line 4138) | class SonicMoEHFFormatFullParamSaver(HFFormatFullParamSaver):
    method __init__ (line 4139) | def __init__(
    method deinterleave_gate_up_proj (line 4161) | def deinterleave_gate_up_proj(self, w, moe_intermediate_size):
    method get_full_param_iter (line 4172) | def get_full_param_iter(self):

FILE: paddleformers/transformers/modeling_rope_utils.py
  function standardize_rope_params (line 25) | def standardize_rope_params(config, rope_theta: float | dict[str, float]...
  function dynamic_rope_update (line 81) | def dynamic_rope_update(rope_forward):
  function _compute_linear_scaling_rope_parameters (line 190) | def _compute_linear_scaling_rope_parameters(
  function _compute_dynamic_ntk_parameters (line 246) | def _compute_dynamic_ntk_parameters(
  function _compute_yarn_parameters (line 319) | def _compute_yarn_parameters(
  function _compute_longrope_parameters (line 458) | def _compute_longrope_parameters(
  function _compute_llama3_parameters (line 550) | def _compute_llama3_parameters(
  function _check_received_keys (line 648) | def _check_received_keys(
  function _validate_default_rope_parameters (line 677) | def _validate_default_rope_parameters(
  function _validate_linear_scaling_rope_parameters (line 686) | def _validate_linear_scaling_rope_parameters(
  function _validate_dynamic_scaling_rope_parameters (line 699) | def _validate_dynamic_scaling_rope_parameters(
  function _validate_yarn_parameters (line 713) | def _validate_yarn_parameters(
  function _validate_longrope_parameters (line 780) | def _validate_longrope_parameters(rope_parameters: dict, config: Pretrai...
  function _validate_llama3_parameters (line 828) | def _validate_llama3_parameters(rope_parameters: dict, config: Pretraine...
  function rope_config_validation (line 881) | def rope_config_validation(config: PretrainedConfig, ignore_keys: Option...

FILE: paddleformers/transformers/modelscope_utils.py
  class UnauthorizedError (line 21) | class UnauthorizedError(Exception):
  class EntryNotFoundError (line 25) | class EntryNotFoundError(Exception):
  function _add_subfolder (line 29) | def _add_subfolder(weights_name: str, subfolder: Optional[str] = None) -...
  function modelscope_download (line 35) | def modelscope_download(

FILE: paddleformers/transformers/moe_gate.py
  class MoEGateMixin (line 30) | class MoEGateMixin:
    method gate_score_func (line 31) | def gate_score_func(self, logits: paddle.Tensor) -> paddle.Tensor:
    method gumbel_rsample (line 54) | def gumbel_rsample(self, logits: paddle.Tensor) -> paddle.Tensor:
    method uniform_sample (line 58) | def uniform_sample(self, logits: paddle.Tensor) -> paddle.Tensor:
    method _one_hot_to_float (line 63) | def _one_hot_to_float(self, x, num_classes):
    method _one_hot_to_int64 (line 69) | def _one_hot_to_int64(self, x, num_classes):
    method _capacity (line 75) | def _capacity(
    method _cal_aux_loss (line 99) | def _cal_aux_loss(self, gates, mask):
    method _cal_seq_aux_loss (line 126) | def _cal_seq_aux_loss(self, probs, top_k, routing_map):
    method _cal_z_loss (line 163) | def _cal_z_loss(self, logits) -> paddle.Tensor:
    method _cal_orthogonal_loss (line 176) | def _cal_orthogonal_loss(self) -> paddle.Tensor:
  class PretrainedMoEGate (line 187) | class PretrainedMoEGate(nn.Layer, MoEGateMixin):
    method __init__ (line 188) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
    method _priority (line 225) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle....
    method _topk_greedy (line 247) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle....
    method _topk_group_limited_greedy (line 262) | def _topk_group_limited_greedy(
    method _topk_noaux_tc (line 294) | def _topk_noaux_tc(
    method top1gating (line 334) | def top1gating(
    method top2gating (line 406) | def top2gating(
    method topkgating (line 482) | def topkgating(
    method topkgating_nodrop (line 564) | def topkgating_nodrop(self, gates: paddle.Tensor):

FILE: paddleformers/transformers/moe_layer.py
  function dispatching (line 31) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity):
  function combining (line 71) | def combining(x, combine_weights, scatter_index):
  class _AllToAll (line 95) | class _AllToAll(paddle.autograd.PyLayer):
    method forward (line 97) | def forward(
    method backward (line 141) | def backward(ctx: Any, *grad_output: Tensor) -> Tuple[Tensor]:
  class MoELayer (line 154) | class MoELayer(nn.Layer):
    method __init__ (line 155) | def __init__(
    method _parse_moe_expert_parallel (line 216) | def _parse_moe_expert_parallel(self, moe_num_experts, expert_model_par...
    method _post_init (line 226) | def _post_init(self):
    method forward (line 237) | def forward(
  class MoEFlexTokenLayer (line 341) | class MoEFlexTokenLayer(nn.Layer):
    method __init__ (line 342) | def __init__(self, config, moe_num_experts, expert_class, expert_kwarg...
    method _post_init (line 370) | def _post_init(self):
    method expert_forward (line 381) | def expert_forward(self, dispatched_input, tokens_per_expert):
    method forward (line 397) | def forward(self, hidden_states: paddle.Tensor):
    method _parse_moe_expert_parallel (line 406) | def _parse_moe_expert_parallel(self, moe_num_experts, expert_model_par...

FILE: paddleformers/transformers/moe_layer_auto.py
  function dispatching (line 31) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity):
  function combining (line 71) | def combining(x, combine_weights, scatter_index):
  class LocalGatePart1 (line 95) | class LocalGatePart1(LocalLayer):
    method __init__ (line 96) | def __init__(self, config, gate: PretrainedMoEGate, ipp=None):
    method forward (line 114) | def forward(self, hidden_state, gate_weight, e_score_correction_bias, ...
  class LocalGateAndDispatch (line 131) | class LocalGateAndDispatch(LocalLayer):
    method __init__ (line 132) | def __init__(self, gate: PretrainedMoEGate, ipp=None):
    method forward (line 145) | def forward(self, reshaped_input, scores):
  class LocalCombine (line 151) | class LocalCombine(LocalLayer):
    method __init__ (line 152) | def __init__(self, ipp=None):
    method forward (line 158) | def forward(self, combine_weights, expert_output, dtype="float32", out...
  class MoELayer (line 169) | class MoELayer(nn.Layer):
    method __init__ (line 170) | def __init__(
    method _redistribute_experts (line 212) | def _redistribute_experts(self, experts, moe_group: str):
    method _parse_moe_expert_parallel (line 221) | def _parse_moe_expert_parallel(self, moe_num_experts, config):
    method _post_init (line 238) | def _post_init(self):
    method expert_forward (line 249) | def expert_forward(self, dispatched_input):
    method forward (line 270) | def forward(

FILE: paddleformers/transformers/ofa_utils.py
  function prepare_qkv_ofa (line 30) | def prepare_qkv_ofa(self, query, key, value, cache=None):
  function mha_ofa_forward (line 52) | def mha_ofa_forward(self, query, key, value, attn_mask=None, cache=None):
  function encoder_ofa_forward (line 96) | def encoder_ofa_forward(
  function encoder_layer_ofa_forward (line 127) | def encoder_layer_ofa_forward(self, src, src_mask=None, cache=None, outp...
  function reorder_head (line 151) | def reorder_head(layer, index):
  function reorder_neuron (line 189) | def reorder_neuron(layer, index, dim=0):
  function reorder_neuron_head (line 214) | def reorder_neuron_head(model, head_importance, neuron_importance):
  function compute_neuron_head_importance (line 230) | def compute_neuron_head_importance(

FILE: paddleformers/transformers/paddleocr_vl/configuration.py
  class PaddleOCRVisionConfig (line 23) | class PaddleOCRVisionConfig(PretrainedConfig):
    method __init__ (line 28) | def __init__(
  class PaddleOCRVLConfig (line 76) | class PaddleOCRVLConfig(PretrainedConfig):
    method __init__ (line 81) | def __init__(
    method __getattribute__ (line 210) | def __getattribute__(self, key):

FILE: paddleformers/transformers/paddleocr_vl/image_processor.py
  function is_scaled_image (line 44) | def is_scaled_image(image: np.ndarray) -> bool:
  function make_batched_images (line 55) | def make_batched_images(images) -> List[List[ImageInput]]:
  function adjust_size (line 78) | def adjust_size(size, patch_size):
  function smart_resize (line 85) | def smart_resize(
  class PaddleOCRVLImageProcessor (line 129) | class PaddleOCRVLImageProcessor(BaseImageProcessor):
    method __init__ (line 135) | def __init__(
    method set_pixels (line 169) | def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
    method get_smarted_resize (line 182) | def get_smarted_resize(self, height, width, min_pixels=None, max_pixel...
    method _preprocess (line 198) | def _preprocess(
    method preprocess (line 293) | def preprocess(

FILE: paddleformers/transformers/paddleocr_vl/modeling.py
  function rotate_half (line 61) | def rotate_half(x):
  function _ensure_cos_sin_dim (line 68) | def _ensure_cos_sin_dim(cos, sin, dim_needed):
  function apply_multimodal_rotary_pos_emb (line 80) | def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqu...
  function apply_rotary_pos_emb_vision (line 96) | def apply_rotary_pos_emb_vision(q, k, cos, sin):
  function apply_fused_rope (line 109) | def apply_fused_rope(query_states, key_states, rope_theta):
  class PaddleOCRAttention (line 128) | class PaddleOCRAttention(nn.Layer):
    method __init__ (line 129) | def __init__(self, config: PaddleOCRVisionConfig):
    method forward (line 165) | def forward(
  class PaddleOCRVisionEmbeddings (line 216) | class PaddleOCRVisionEmbeddings(nn.Layer):
    method __init__ (line 217) | def __init__(self, config: PaddleOCRVisionConfig):
    method forward (line 245) | def forward(
  class PaddleOCRMLP (line 296) | class PaddleOCRMLP(nn.Layer):
    method __init__ (line 297) | def __init__(self, config: PaddleOCRVisionConfig):
    method forward (line 314) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
  class PaddleOCREncoderLayer (line 321) | class PaddleOCREncoderLayer(nn.Layer):
    method __init__ (line 322) | def __init__(self, config: PaddleOCRVisionConfig):
    method forward (line 348) | def forward(
  class PaddleOCRVisionRotaryEmbedding (line 386) | class PaddleOCRVisionRotaryEmbedding(nn.Layer):
    method __init__ (line 387) | def __init__(self, dim: int, theta: float = 10000.0) -> None:
    method rope_init (line 393) | def rope_init(self):
    method forward (line 398) | def forward(self, seqlen: int) -> paddle.Tensor:
  class PaddleOCREncoder (line 404) | class PaddleOCREncoder(nn.Layer):
    method __init__ (line 405) | def __init__(self, config: PaddleOCRVisionConfig):
    method flatten_list (line 421) | def flatten_list(image_grid_thw):
    method get_position_ids_vectorized (line 431) | def get_position_ids_vectorized(image_grid_thw, dtype="int64"):
    method build_window_index (line 458) | def build_window_index(self, image_grid, window_size):
    method recompute_training (line 493) | def recompute_training(
    method forward (line 530) | def forward(
  class PaddleOCRVisionTransformer (line 647) | class PaddleOCRVisionTransformer(nn.Layer):
    method __init__ (line 648) | def __init__(self, config: PaddleOCRVisionConfig):
    method forward (line 664) | def forward(
  class PaddleOCRVisionPreTrainedModel (line 776) | class PaddleOCRVisionPreTrainedModel(PretrainedModel):
  class PaddleOCRVisionModel (line 791) | class PaddleOCRVisionModel(PaddleOCRVisionPreTrainedModel):
    method __init__ (line 795) | def __init__(self, config: PaddleOCRVisionConfig):
    method get_input_embeddings (line 800) | def get_input_embeddings(self) -> nn.Layer:
    method forward (line 803) | def forward(
  class Projector (line 834) | class Projector(nn.Layer):
    method __init__ (line 835) | def __init__(self, text_config: PaddleOCRVLConfig, vision_config: Padd...
    method forward (line 865) | def forward(self, image_features, image_grid_thw):
  class PaddleOCRRotaryEmbedding (line 972) | class PaddleOCRRotaryEmbedding(nn.Layer):
    method __init__ (line 973) | def __init__(self, config: PaddleOCRVLConfig):
    method compute_default_rope_parameters (line 990) | def compute_default_rope_parameters(
    method forward (line 1016) | def forward(self, x, position_ids):
  class Ernie4_5Attention (line 1034) | class Ernie4_5Attention(nn.Layer):
    method __init__ (line 1037) | def __init__(self, config, layer_idx=0):
    method forward (line 1105) | def forward(
  class Ernie4_5DecoderLayer (line 1189) | class Ernie4_5DecoderLayer(nn.Layer):
    method __init__ (line 1196) | def __init__(self, config, layer_idx):
    method forward (line 1235) | def forward(
  class Ernie4_5PretrainedModel (line 1304) | class Ernie4_5PretrainedModel(PretrainedModel):
    method _gen_aoa_config (line 1325) | def _gen_aoa_config(cls, config: PaddleOCRVLConfig):
    method _gen_inv_aoa_config (line 1407) | def _gen_inv_aoa_config(cls, config: PaddleOCRVLConfig):
  class Ernie4_5Model (line 1490) | class Ernie4_5Model(Ernie4_5PretrainedModel):
    method __init__ (line 1493) | def __init__(self, config: PaddleOCRVLConfig):
    method recompute_training (line 1520) | def recompute_training(
    method forward (line 1563) | def forward(
  class PaddleOCRVLCausalLMOutputWithPast (line 1724) | class PaddleOCRVLCausalLMOutputWithPast(ModelOutput):
  class PaddleOCRVLModel (line 1733) | class PaddleOCRVLModel(Ernie4_5PretrainedModel):
    method __init__ (line 1736) | def __init__(self, config: PaddleOCRVLConfig):
  class PaddleOCRVLForConditionalGeneration (line 1742) | class PaddleOCRVLForConditionalGeneration(Ernie4_5PretrainedModel, Gener...
    method __init__ (line 1749) | def __init__(self, config: PaddleOCRVLConfig):
    method get_input_embeddings (line 1764) | def get_input_embeddings(self):
    method set_input_embeddings (line 1767) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 1770) | def get_output_embeddings(self):
    method set_output_embeddings (line 1773) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 1776) | def set_decoder(self, decoder):
    method get_decoder (line 1779) | def get_decoder(self):
    method freeze_vision (line 1782) | def freeze_vision(self):
    method get_rope_index (line 1786) | def get_rope_index(
    method prepare_attention_mask_for_generation (line 1914) | def prepare_attention_mask_for_generation(self, input_ids, pad_token_i...
    method prepare_inputs_for_generation (line 1920) | def prepare_inputs_for_generation(
    method forward (line 1967) | def forward(

FILE: paddleformers/transformers/phi3/configuration.py
  class Phi3Config (line 20) | class Phi3Config(PretrainedConfig):
    method __init__ (line 31) | def __init__(
    method _rope_parameters_adjustment (line 105) | def _rope_parameters_adjustment(self):
    method _rope_parameters_validation (line 115) | def _rope_parameters_validation(self):

FILE: paddleformers/transformers/processing_utils.py
  class _LazyAutoProcessorMapping (line 68) | class _LazyAutoProcessorMapping(dict):
    method __getitem__ (line 81) | def __getitem__(self, key):
    method __contains__ (line 88) | def __contains__(self, key):
    method keys (line 91) | def keys(self):
  function _get_modality_for_attribute (line 105) | def _get_modality_for_attribute(attribute_name: str) -> str:
  class VideosKwargs (line 124) | class VideosKwargs(TypedDict, total=False):
  class ProcessingKwargs (line 206) | class ProcessingKwargs(ProcessingKwargs_hf):
  class AllKwargsForChatTemplate (line 213) | class AllKwargsForChatTemplate(AllKwargsForChatTemplate_hf):
  class MultiModalData (line 219) | class MultiModalData:
    method __contains__ (line 234) | def __contains__(self, key):
    method __getitem__ (line 237) | def __getitem__(self, key):
  class PaddleProcessorMixin (line 243) | class PaddleProcessorMixin:
    method __init__ (line 248) | def __init__(self, *args, **kwargs):
    method __call__ (line 251) | def __call__(
    method check_argument_for_proper_class (line 261) | def check_argument_for_proper_class(self, argument_name, argument):
    method to_dict (line 283) | def to_dict(self, legacy_serialization=True) -> dict[str, Any]:
    method to_json_string (line 344) | def to_json_string(self, legacy_serialization=True) -> str:
    method to_json_file (line 355) | def to_json_file(self, json_file_path: Union[str, os.PathLike], legacy...
    method save_pretrained (line 366) | def save_pretrained(self, save_directory, push_to_hub: bool = False, l...
    method get_processor_dict (line 480) | def get_processor_dict(
    method from_args_and_dict (line 638) | def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kw...
    method from_pretrained (line 691) | def from_pretrained(
    method get_attributes (line 701) | def get_attributes(cls):
    method _load_tokenizer_from_pretrained (line 718) | def _load_tokenizer_from_pretrained(
    method _get_arguments_from_pretrained (line 738) | def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path,...
    method get_possibly_dynamic_module (line 814) | def get_possibly_dynamic_module(module_name):
    method batch_decode (line 833) | def batch_decode(self, *args, **kwargs):
    method decode (line 842) | def decode(self, *args, **kwargs):
    method model_input_names (line 852) | def model_input_names(self):
    method apply_chat_template (line 860) | def apply_chat_template(
  function warp_processormixin (line 1058) | def warp_processormixin(hf_processormixin_class: ProcessorMixin_hf):
  class ProcessorMixin (line 1062) | class ProcessorMixin(PaddleProcessorMixin, ProcessorMixin_hf):
    method init (line 1063) | def init(self, *args, **kwargs):