SYMBOL INDEX (6547 symbols across 355 files) FILE: examples/experiments/deepseek_v3_pretrain/config/configuration.py class DeepseekV2FastConfig (line 23) | class DeepseekV2FastConfig(PretrainedConfig): method __init__ (line 132) | def __init__( FILE: examples/experiments/deepseek_v3_pretrain/convert_ckpt_to_sft.py function paddle_name_to_hf_names (line 45) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]: function _handle_expert_weights (line 134) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s... function _handle_shared_expert_weights (line 149) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional... function _handle_mlp_weights (line 162) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]: function _is_need_transpose (line 172) | def _is_need_transpose(key): function prepare_tensor (line 191) | def prepare_tensor(key, value): function load_pretrained_ckpt (line 218) | def load_pretrained_ckpt(ckpt_path, output_path): FILE: examples/experiments/deepseek_v3_pretrain/fp8_linear.py function fp8_linear (line 54) | def fp8_linear( function register_scale (line 95) | def register_scale(self): class Linear (line 109) | class Linear(PD_Linear): method __init__ (line 110) | def __init__(self, *args, **kwargs): class ColumnParallelLinear (line 116) | class ColumnParallelLinear(PD_ColumnParallelLinear): method __init__ (line 117) | def __init__(self, *args, **kwargs): class RowParallelLinear (line 123) | class RowParallelLinear(PD_RowParallelLinear): method __init__ (line 124) | def __init__(self, *args, **kwargs): class ColumnSequenceParallelLinear (line 130) | class ColumnSequenceParallelLinear(PD_ColumnSequenceParallelLinear): method __init__ (line 131) | def __init__(self, *args, **kwargs): class RowSequenceParallelLinear (line 137) | class RowSequenceParallelLinear(PD_RowSequenceParallelLinear): method __init__ (line 138) | def __init__(self, *args, **kwargs): FILE: examples/experiments/deepseek_v3_pretrain/kernel.py function act_quant_kernel (line 30) | def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr): function act_quant (line 51) | def act_quant(x: paddle.Tensor, block_size: int = 128) -> Tuple[paddle.T... function weight_dequant_kernel (line 74) | def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.cons... function weight_dequant (line 100) | def weight_dequant(x: paddle.Tensor, s: paddle.Tensor, block_size: int =... function fp8_gemm_kernel (line 130) | def fp8_gemm_kernel( function fp8_gemm (line 190) | def fp8_gemm(a: paddle.Tensor, a_s: paddle.Tensor, b: paddle.Tensor, b_s... FILE: examples/experiments/deepseek_v3_pretrain/load_hf_ckpt.py function paddle_name_to_hf_names_ds_v2 (line 53) | def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]: function paddle_name_to_hf_names (line 128) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]: function _get_hf_prefix (line 196) | def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str: function _handle_expert_weights (line 206) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s... function _handle_shared_expert_weights (line 221) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional... function _handle_mlp_weights (line 234) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]: function prepare_tensor (line 244) | def prepare_tensor(tensor, dst_shape, *, force_transpose=False): function load_huggingface_ckpt (line 274) | def load_huggingface_ckpt(model, huggingface_ckpt_path): FILE: examples/experiments/deepseek_v3_pretrain/modeling.py function swiglu (line 109) | def swiglu(x, y=None): function get_use_casual_mask (line 133) | def get_use_casual_mask(): function set_global_step (line 138) | def set_global_step(cur_step): function get_global_step (line 143) | def get_global_step(): function rms_norm_fused (line 148) | def rms_norm_fused(x_in, w, eps, use_fast_ln=False): function cast_if_needed (line 156) | def cast_if_needed(x, dtype): function fusion_rms_norm (line 163) | def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln... class LMHeadFunction (line 186) | class LMHeadFunction(paddle.autograd.PyLayer): method forward (line 188) | def forward(ctx, x, weight, transpose_y): method backward (line 195) | def backward(ctx, dout): function parallel_matmul (line 225) | def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_para... class DeepseekV2MLP (line 255) | class DeepseekV2MLP(nn.Layer): method __init__ (line 256) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, int... method forward (line 302) | def forward(self, x): class MoEGate (line 308) | class MoEGate(PretrainedMoEGate): method __init__ (line 309) | def __init__( method forward (line 355) | def forward(self, hidden_states): class DeepseekV2MoE (line 407) | class DeepseekV2MoE(MoELayer): method __init__ (line 412) | def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, nor... method fp8_quant_weight (line 491) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None): method forward (line 554) | def forward(self, hidden_states): method post_process (line 579) | def post_process(self, hidden_states, final_hidden_states, l_aux): class DeepseekV2RotaryEmbedding (line 590) | class DeepseekV2RotaryEmbedding(nn.Layer): method __init__ (line 591) | def __init__(self, dim, max_position_embeddings=2048, base=10000): method _set_cos_sin_cache (line 606) | def _set_cos_sin_cache(self, seq_len): method forward (line 619) | def forward(self, x, seq_len=None): class DeepseekV2Attention (line 632) | class DeepseekV2Attention(nn.Layer): method __init__ (line 635) | def __init__(self, config: DeepseekV2FastConfig, layerwise_recompute: ... method fp8_quant_weight (line 745) | def fp8_quant_weight(self, quant_transpose=None): method _init_rope (line 752) | def _init_rope(self): method _shape (line 784) | def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): method forward (line 787) | def forward( class DeepseekV2DecoderLayer (line 932) | class DeepseekV2DecoderLayer(nn.Layer): method __init__ (line 933) | def __init__( method fp8_quant_weight (line 974) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None): method forward (line 983) | def forward( method self_attn_compute (line 1081) | def self_attn_compute(self, hidden_states, **kwargs): method pre_dispatch_compute (line 1131) | def pre_dispatch_compute(self, hidden_states): method expert_forward_compute (line 1138) | def expert_forward_compute(self, intermediate_hidden_states, dispatche... method post_combine_compute (line 1151) | def post_combine_compute(self, residual, hidden_states, final_hidden_s... class DeepseekV2MTPLayer (line 1166) | class DeepseekV2MTPLayer(DeepseekV2DecoderLayer): method __init__ (line 1167) | def __init__( method forward (line 1179) | def forward( class DeepseekV2PretrainedModelFast (line 1216) | class DeepseekV2PretrainedModelFast(PretrainedModel): method _get_model_flops (line 1221) | def _get_model_flops(self, batch_size=1, seq_length=None, **kwargs): method _get_hardware_flops (line 1234) | def _get_hardware_flops(self, *args, **kwargs): method _get_name_mappings (line 1238) | def _get_name_mappings(cls, config: DeepseekV2FastConfig) -> list[Stat... method _get_tensor_parallel_mappings (line 1298) | def _get_tensor_parallel_mappings(cls, config: DeepseekV2FastConfig, i... method _init_weights (line 1398) | def _init_weights(self, layer): method step_flex_token (line 1459) | def step_flex_token(self, cur_step): class DeepseekV2ModelFast (line 1464) | class DeepseekV2ModelFast(DeepseekV2PretrainedModelFast): method __init__ (line 1472) | def __init__(self, config: DeepseekV2FastConfig): method get_input_embeddings (line 1502) | def get_input_embeddings(self): method set_input_embeddings (line 1505) | def set_input_embeddings(self, value): method _prepare_decoder_attention_mask (line 1509) | def _prepare_decoder_attention_mask(attention_mask, input_shape, past_... method recompute_training_full (line 1544) | def recompute_training_full( method forward (line 1575) | def forward( class DeepseekV2PretrainingCriterionFast (line 1786) | class DeepseekV2PretrainingCriterionFast(nn.Layer): method __init__ (line 1792) | def __init__(self, config: DeepseekV2FastConfig): method forward (line 1803) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non... function yarn_find_correction_dim (line 1853) | def yarn_find_correction_dim(num_rotations, dim, base=10000, max_positio... function yarn_find_correction_range (line 1858) | def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_p... function yarn_linear_ramp_mask (line 1864) | def yarn_linear_ramp_mask(min, max, dim): class DeepseekV2YarnRotaryEmbedding (line 1873) | class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding): method __init__ (line 1874) | def __init__( method _set_cos_sin_cache (line 1894) | def _set_cos_sin_cache(self, seq_len): class RmsNormFunction (line 1928) | class RmsNormFunction(paddle.autograd.PyLayer): method forward (line 1930) | def forward(ctx, x, scale, epsilon): method backward (line 1939) | def backward(ctx, grad_output): class DeepseekV2RMSNorm (line 1953) | class DeepseekV2RMSNorm(nn.Layer): method __init__ (line 1954) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps... method forward (line 1978) | def forward(self, hidden_states): method extra_repr (line 1991) | def extra_repr(self): function apply_rotary_pos_emb (line 1995) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion... class FusedNormGateFunc (line 2049) | class FusedNormGateFunc(paddle.autograd.PyLayer): method set_temporary_vars (line 2056) | def set_temporary_vars(cls, norm_output, invar): method clear_temporary_vars (line 2061) | def clear_temporary_vars(cls): method forward (line 2066) | def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps): method backward (line 2076) | def backward(ctx, d_gate_logits, d_norm_output): class TemporaryVarContext (line 2100) | class TemporaryVarContext: method __init__ (line 2101) | def __init__(self, norm_output, invar): method __enter__ (line 2105) | def __enter__(self): method __exit__ (line 2108) | def __exit__(self, exc_type, exc_val, exc_tb): function balance_expert_assignment (line 2112) | def balance_expert_assignment(n, m, k): class FakeGate (line 2123) | class FakeGate(paddle.autograd.PyLayer): method forward (line 2125) | def forward(ctx, hidden_states, weight, fakse_gate_restrict_balance=Fa... method backward (line 2141) | def backward(ctx, grad_output): class AddAuxiliaryLoss (line 2145) | class AddAuxiliaryLoss(paddle.autograd.PyLayer): method forward (line 2152) | def forward(ctx, x, loss): method backward (line 2158) | def backward(ctx, grad_output): function qkv_pre_process_no_fuse (line 2166) | def qkv_pre_process_no_fuse( function rearrange_kv (line 2203) | def rearrange_kv(kv, k_pe, qk_nope_head_dim, num_heads): function enable_to_static (line 2214) | def enable_to_static(value): function qkv_pre_process (line 2223) | def qkv_pre_process( function manul_fwd (line 2266) | def manul_fwd( class MemroyRecomputeAttnFunc (line 2318) | class MemroyRecomputeAttnFunc(paddle.autograd.PyLayer): method forward (line 2320) | def forward( method backward (line 2515) | def backward(ctx, dout): class MemroyRecomputeAttn (line 2828) | class MemroyRecomputeAttn(paddle.nn.Layer): method __init__ (line 2829) | def __init__( method fp8_quant_weight (line 2907) | def fp8_quant_weight(self, quant_transpose=None): method forward (line 2911) | def forward(self, q_init, kv_init, position_ids): class FusedRMSLinearFunc (line 2941) | class FusedRMSLinearFunc(paddle.autograd.PyLayer): method forward (line 2943) | def forward(ctx, x, rms_norm_weight, q_down_weight, kv_down_weight, eps): method backward (line 2964) | def backward(ctx, d_q, d_kv): class FusedRMSLinear (line 3014) | class FusedRMSLinear(paddle.nn.Layer): method __init__ (line 3015) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None: method fp8_quant_weight (line 3039) | def fp8_quant_weight(self, quant_transpose=None): method forward (line 3042) | def forward(self, x): class FusedRMSLinearSingleFunc (line 3047) | class FusedRMSLinearSingleFunc(paddle.autograd.PyLayer): method forward (line 3049) | def forward(ctx, x, rms_norm_weight, linear_weight, eps): method backward (line 3058) | def backward(ctx, d_q, d_kv): class FusedRMSLinearSingle (line 3069) | class FusedRMSLinearSingle(paddle.nn.Layer): method __init__ (line 3070) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None: method forward (line 3087) | def forward(self, x): class FastCrossEntropyFunction (line 3092) | class FastCrossEntropyFunction(paddle.autograd.PyLayer): method forward (line 3094) | def forward(ctx, preds, labels): method backward (line 3102) | def backward(ctx, dout): class DeepseekV2LMHead (line 3112) | class DeepseekV2LMHead(nn.Layer): method __init__ (line 3113) | def __init__(self, config: DeepseekV2FastConfig, embedding_weight=None): method forward (line 3149) | def forward(self, hidden_states, tensor_parallel_output=None): method extra_repr (line 3171) | def extra_repr(self): FILE: examples/experiments/deepseek_v3_pretrain/modeling_pp.py function check_accept_none_grad (line 81) | def check_accept_none_grad(): function parse_args (line 97) | def parse_args(args): function return_args (line 127) | def return_args(hidden_states, attention_mask=None, attn_mask_startend_r... function get_attr (line 142) | def get_attr(layer, name): function calc_stream_wait (line 149) | def calc_stream_wait(group_id): class TensorMeta (line 154) | class TensorMeta: method __init__ (line 157) | def __init__(self, tensor): class PostProcessNode (line 162) | class PostProcessNode(ScheduleNode): method __init__ (line 163) | def __init__( method forward_without_residual (line 187) | def forward_without_residual(self, inputs): method forward (line 231) | def forward(self, inputs): method backward (line 277) | def backward(self, output_grad): class DecoderLayerNode (line 333) | class DecoderLayerNode(ScheduleNode): method __init__ (line 334) | def __init__( method dispatch_forward (line 363) | def dispatch_forward(self, inputs, previous_event=None, allocate_on_co... method combine_forward (line 409) | def combine_forward(self, inputs, previous_event=None): method dispatch_backward (line 425) | def dispatch_backward(self, output_grad): method combine_backward (line 464) | def combine_backward(self, output_grad): method forward (line 491) | def forward(self, inputs): method backward (line 511) | def backward(self, output_grad=None, scaler=None): class OverlapedScheduleChunk (line 534) | class OverlapedScheduleChunk: method __init__ (line 535) | def __init__(self, forward_nodes, backward_nodes, use_fuion=True): method forward_backward (line 546) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa... class DecoderBackwardScheduleChunk (line 559) | class DecoderBackwardScheduleChunk: method __init__ (line 560) | def __init__(self, nodes): method backward (line 563) | def backward(self, output_grad, combine_bw_event_to_wait=None, pp_stre... class OverlapedScheduleNode (line 573) | class OverlapedScheduleNode: method __init__ (line 574) | def __init__(self, forward_node, backward_node, name=""): method forward_backward (line 580) | def forward_backward(self, inputs, output_grad, event_to_wait=None): class FusionFp8DecoderLayerNode (line 608) | class FusionFp8DecoderLayerNode(ScheduleNode): method __init__ (line 609) | def __init__( method attn_forward (line 633) | def attn_forward(self, inputs): method dispatch_forward (line 669) | def dispatch_forward(self, inputs, previous_event=None, async_finish=F... method mlp_forward (line 697) | def mlp_forward(self, inputs): method combine_forward (line 736) | def combine_forward(self, inputs, async_finish=False, previous_event=N... method post_process_forward (line 762) | def post_process_forward(self, inputs, with_residual=True): method post_process_backward (line 785) | def post_process_backward(self, output_grad, event_to_wait=None): method combine_backward (line 819) | def combine_backward(self, output_grad, previous_event=None, async_fin... method mlp_backward (line 877) | def mlp_backward(self, output_grad): method dispatch_backward (line 909) | def dispatch_backward(self, output_grad, async_finish=False, previous_... method attn_backward (line 958) | def attn_backward(self, output_grad): method backward_for_fusion (line 1016) | def backward_for_fusion(self, output_grad, combine_bw_event_to_wait=No... method forward (line 1087) | def forward(self, inputs): method backward (line 1097) | def backward(self, output_grad=None, scaler=None): class DenseDecoderLayerNode (line 1108) | class DenseDecoderLayerNode(ScheduleNode): method __init__ (line 1109) | def __init__( method forward (line 1119) | def forward(self, inputs): method backward (line 1124) | def backward(self, output_grad=None, scaler=None): class OverlapedFUsionScheduleNode (line 1131) | class OverlapedFUsionScheduleNode: method __init__ (line 1132) | def __init__(self, forward_node, backward_node, name=""): method forward_backward (line 1140) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa... class OverlapedDenseFusionScheduleNode (line 1276) | class OverlapedDenseFusionScheduleNode: method __init__ (line 1277) | def __init__(self, forward_node, backward_node, name=""): method forward_backward (line 1286) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa... function build_overlapped_nodes (line 1372) | def build_overlapped_nodes(config: DeepseekV2FastConfig, forward_chunk, ... class EmbeddingFunction (line 1436) | class EmbeddingFunction(paddle.autograd.PyLayer): method forward (line 1438) | def forward(ctx, x, weight): method backward (line 1447) | def backward(ctx, dout): class DeepseekV2EmbeddingPipe (line 1458) | class DeepseekV2EmbeddingPipe(nn.Layer): method __init__ (line 1459) | def __init__(self, config: DeepseekV2FastConfig): method embedding_weight (line 1474) | def embedding_weight(self): method forward (line 1477) | def forward(self, args): method build_schedule_node (line 1557) | def build_schedule_node(self): class DeepseekV2DecoderLayerPipe (line 1561) | class DeepseekV2DecoderLayerPipe(DeepseekV2DecoderLayer): method forward (line 1562) | def forward(self, args): method attn_compute (line 1621) | def attn_compute(self, args): method attn_compute_for_fusion (line 1656) | def attn_compute_for_fusion(self, args): method mlp_compute (line 1694) | def mlp_compute(self, inputs): method post_process_compute (line 1741) | def post_process_compute(self, inputs): method post_process_compute_for_fusion (line 1777) | def post_process_compute_for_fusion(self, inputs): method attn_compute_dense (line 1802) | def attn_compute_dense(self, args): method mlp_compute_dense (line 1820) | def mlp_compute_dense(self, inputs): method build_schedule_node (line 1834) | def build_schedule_node(self): class DeepseekV2MTPLayerPipe (line 1900) | class DeepseekV2MTPLayerPipe(DeepseekV2MTPLayer): method forward (line 1901) | def forward(self, args): method attn_compute_for_fusion (line 1969) | def attn_compute_for_fusion(self, args): method build_schedule_node (line 2016) | def build_schedule_node(self): class DeepseekV2RMSNormPipe (line 2035) | class DeepseekV2RMSNormPipe(nn.Layer): method __init__ (line 2036) | def __init__(self, config): method forward (line 2041) | def forward(self, args): method build_schedule_node (line 2056) | def build_schedule_node(self): class DeepseekV2LMHeadPipe (line 2060) | class DeepseekV2LMHeadPipe(DeepseekV2LMHead): method __init__ (line 2061) | def __init__(self, config, embedding_weight=None): method embedding_weight (line 2065) | def embedding_weight(self): method forward (line 2068) | def forward(self, args: Union[Tuple, paddle.Tensor]): method build_schedule_node (line 2078) | def build_schedule_node(self): class DeepseekV2PretrainingCriterionPipe (line 2082) | class DeepseekV2PretrainingCriterionPipe(DeepseekV2PretrainingCriterionF... method forward (line 2083) | def forward(self, logits, labels): method build_schedule_node (line 2094) | def build_schedule_node(self): class DeepseekV2ForCausalLMPipe (line 2098) | class DeepseekV2ForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): method step_flex_token (line 2117) | def step_flex_token(self, cur_step): method _prepare_pipeline_inputs_func (line 2121) | def _prepare_pipeline_inputs_func(cls, inputs): method __init__ (line 2144) | def __init__(self, config: DeepseekV2FastConfig): method fp8_quant_weight (line 2309) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=True): method get_loss_fn (line 2322) | def get_loss_fn(self, config): method overlapped_forward_backward (line 2325) | def overlapped_forward_backward( FILE: examples/experiments/deepseek_v3_pretrain/moe_gate.py class PretrainedMoEGate (line 29) | class PretrainedMoEGate(nn.Layer, MoEGateMixin): method __init__ (line 30) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs): method _priority (line 69) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle.... method _topk_greedy (line 91) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle.... method _topk_group_limited_greedy (line 106) | def _topk_group_limited_greedy( method _topk_noaux_tc (line 138) | def _topk_noaux_tc( method top1gating (line 175) | def top1gating( method top2gating (line 245) | def top2gating( method _cal_seq_aux_loss (line 321) | def _cal_seq_aux_loss(self, gates, top_k, topk_idx) -> paddle.Tensor: method topkgating (line 359) | def topkgating( method topkgating_nodrop (line 438) | def topkgating_nodrop(self, gates: paddle.Tensor): FILE: examples/experiments/deepseek_v3_pretrain/moe_layer.py function record_stream_for_multi_input (line 58) | def record_stream_for_multi_input(x): function stop_gradient_for_multi_input (line 66) | def stop_gradient_for_multi_input(x): class MoELayer (line 73) | class MoELayer(nn.Layer): method __init__ (line 74) | def __init__( method update_flex_token (line 149) | def update_flex_token(self): method _parse_moe_expert_parallel (line 165) | def _parse_moe_expert_parallel(self, n_routed_experts, expert_model_pa... method _post_init (line 175) | def _post_init(self): method forward (line 186) | def forward( method forward_drop_token (line 207) | def forward_drop_token( method expert_forward (line 326) | def expert_forward(self, dispatched_input): method forward_flex_token (line 337) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,... method get_tokens_per_expert (line 380) | def get_tokens_per_expert(self): method set_tokens_per_expert (line 383) | def set_tokens_per_expert(self, tokens_per_expert_list): method pre_dispatch_compute (line 386) | def pre_dispatch_compute(self, hidden_states): method post_dispatch_compute (line 394) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis... method pre_combine_compute (line 400) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p... method post_combine_compute (line 406) | def post_combine_compute(self, hidden_states): class MoEFlexTokenLayer (line 411) | class MoEFlexTokenLayer(nn.Layer): method __init__ (line 412) | def __init__(self, config, n_routed_experts, expert_class, expert_kwar... method expert_forward (line 428) | def expert_forward(self, dispatched_input, tokens_per_expert): method forward (line 440) | def forward(self, hidden_states: paddle.Tensor): method forward_flex_token (line 451) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,... method get_tokens_per_expert (line 494) | def get_tokens_per_expert(self): method set_tokens_per_expert (line 497) | def set_tokens_per_expert(self, tokens_per_expert_list): method pre_dispatch_compute (line 500) | def pre_dispatch_compute(self, hidden_states): method post_dispatch_compute (line 508) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis... method pre_combine_compute (line 514) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p... method post_combine_compute (line 520) | def post_combine_compute(self, hidden_states): class Fp8DispatchQuantNode (line 525) | class Fp8DispatchQuantNode: method __init__ (line 526) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, name="fp8_... method forward (line 533) | def forward(self, hidden_states, probs, routing_map): method backward (line 561) | def backward(self, hs_grad, token_probs_grad): class Fp8DispatchNode (line 573) | class Fp8DispatchNode: method __init__ (line 574) | def __init__(self, token_dispatcher, name="fp8_dispatch_node"): method forward (line 580) | def forward( method backward (line 609) | def backward( class Fp8CombineNode (line 628) | class Fp8CombineNode: method __init__ (line 629) | def __init__(self, token_dispatcher, name="fp8_combine_node"): method forward (line 635) | def forward(self, hidden_states_out, previous_event=None, async_finish... method backward (line 650) | def backward(self, output_combine_grad, previous_event=None, async_fin... class Fp8CombineQuantNode (line 661) | class Fp8CombineQuantNode: method __init__ (line 662) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, moe_group=... method forward (line 669) | def forward(self, output_combine): method backward (line 678) | def backward(self, output_grad, event_to_wait=None): class FusionMlpNode (line 704) | class FusionMlpNode: method __init__ (line 709) | def __init__( method set_recompute_fwd_gate_up (line 745) | def set_recompute_fwd_gate_up(self, recompute_fwd_gate_up): method reset_statue (line 748) | def reset_statue(self): method prepare_env_subbatch (line 773) | def prepare_env_subbatch(self, unzipped_tokens=None, unzipped_tokens_s... method gemm_forward_subbatch (line 790) | def gemm_forward_subbatch( method gemm_backward_subbatch (line 825) | def gemm_backward_subbatch( method forward (line 886) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro... method backward (line 1024) | def backward(self, hidden_states_out_grad): class FusionMoeNode (line 1127) | class FusionMoeNode: method __init__ (line 1128) | def __init__( method forward (line 1161) | def forward(self, hidden_states, probs, routing_map): method backward (line 1189) | def backward(self, output_grad): class FusionMoe (line 1204) | class FusionMoe(paddle.autograd.PyLayer): method forward (line 1206) | def forward( method backward (line 1225) | def backward(ctx, output_grad): FILE: examples/experiments/deepseek_v3_pretrain/moe_utils.py function _clear_to_zero_allocation (line 29) | def _clear_to_zero_allocation(self): function _holder_size (line 45) | def _holder_size(self): function topk_to_permuted_indices (line 57) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk): function permute_fast (line 69) | def permute_fast( function unpermute_fast (line 90) | def unpermute_fast( class UnZipNode (line 132) | class UnZipNode: method __init__ (line 133) | def __init__(self, name="unzip"): method reset_statue (line 138) | def reset_statue(self): method forward (line 143) | def forward( method backward (line 189) | def backward(self, dx, total_zipped_tokens, probs_grad, dispatched_ind... class ZipNode (line 203) | class ZipNode: method __init__ (line 204) | def __init__(self, name="zip"): method forward (line 208) | def forward( method backward (line 218) | def backward( class PermuteNode (line 264) | class PermuteNode: method __init__ (line 265) | def __init__(self, token_dispatcher, name="permute"): method reset_status (line 269) | def reset_status(self): method forward (line 273) | def forward(self, hidden_states, hidden_states_scale, dispatched_indic... method backward (line 287) | def backward(self, out_grad, dispatched_probs): class UnPermuteNode (line 300) | class UnPermuteNode: method __init__ (line 301) | def __init__(self, token_dispatcher, name="unpermute"): method reset_status (line 305) | def reset_status(self): method forward (line 314) | def forward( method backward (line 352) | def backward(self, out_grad, out_grad_scale): function tokens_zip_unique_add_with_subbatch (line 383) | def tokens_zip_unique_add_with_subbatch(zipped, unzipped, index_unzipped... function merge_subbatch_cast (line 409) | def merge_subbatch_cast(x, dtype): function get_env_device (line 420) | def get_env_device(): FILE: examples/experiments/deepseek_v3_pretrain/run_pretrain.py class PreTrainingArguments (line 65) | class PreTrainingArguments(TrainingArguments): method __post_init__ (line 122) | def __post_init__(self): class DataArguments (line 180) | class DataArguments: class ModelArguments (line 213) | class ModelArguments: function create_pretrained_dataset (line 244) | def create_pretrained_dataset( function get_train_data_file (line 318) | def get_train_data_file(args): class PretrainingTrainer (line 343) | class PretrainingTrainer(Trainer): method __init__ (line 344) | def __init__(self, *args, **kwargs): method evaluate (line 348) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre... method _get_eval_sampler (line 388) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]: method _get_train_sampler (line 398) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: function main (line 409) | def main(): FILE: examples/experiments/deepseek_v3_pretrain/token_dispatcher.py class _DeepepManager (line 30) | class _DeepepManager(_DispatchManager): method __init__ (line 52) | def __init__( method setup_metadata (line 73) | def setup_metadata(self, routing_map: paddle.Tensor, probs: paddle.Ten... method dispatch (line 81) | def dispatch( method _indices_to_multihot (line 93) | def _indices_to_multihot(self, indices, probs): method get_dispatched_metadata (line 118) | def get_dispatched_metadata(self) -> paddle.Tensor: method get_number_of_tokens_per_expert (line 121) | def get_number_of_tokens_per_expert(self) -> paddle.Tensor: method combine (line 127) | def combine(self, hidden_states: paddle.Tensor) -> paddle.Tensor: method get_permuted_hidden_states_by_experts (line 133) | def get_permuted_hidden_states_by_experts(self, hidden_states: paddle.... method get_permuted_hidden_states_by_experts_fast (line 145) | def get_permuted_hidden_states_by_experts_fast( method get_restored_hidden_states_by_experts (line 155) | def get_restored_hidden_states_by_experts(self, hidden_states: paddle.... method get_restored_hidden_states_by_experts_fast (line 167) | def get_restored_hidden_states_by_experts_fast( class MoETokenDispatcher (line 186) | class MoETokenDispatcher: method __init__ (line 191) | def __init__(self, ep_group) -> None: method ep_group (line 198) | def ep_group(self): method ep_size (line 203) | def ep_size(self): method token_permutation (line 208) | def token_permutation(self, tokens: paddle.Tensor, probs: paddle.Tenso... method token_unpermutation (line 222) | def token_unpermutation(self, expert_output: paddle.Tensor, bias: padd... class MoEFlexTokenDispatcher (line 235) | class MoEFlexTokenDispatcher(MoETokenDispatcher): method __init__ (line 240) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m... method token_permutation (line 252) | def token_permutation( method token_unpermutation (line 265) | def token_unpermutation( class MoEFlexTokenDispatcherFast (line 276) | class MoEFlexTokenDispatcherFast: method __init__ (line 281) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m... method ep_group (line 294) | def ep_group(self): method ep_size (line 299) | def ep_size(self): method pre_dispatch (line 303) | def pre_dispatch(self, hidden_states, probs, routing_map): method post_dispatch (line 313) | def post_dispatch(self, hidden_states, dispatched_indices): method pre_combine (line 321) | def pre_combine(self, hidden_states, token_permuted_indices, prob_perm... method post_combine (line 327) | def post_combine(self, hidden_states): method token_permutation (line 331) | def token_permutation( method token_unpermutation (line 349) | def token_unpermutation( class PreDispatchNode (line 367) | class PreDispatchNode: method __init__ (line 368) | def __init__(self, token_dispatcher): method reset_status (line 372) | def reset_status(self): method forward (line 378) | def forward(self, routing_map, probs): method backward (line 393) | def backward(self, token_probs_g): FILE: examples/experiments/ernie_pretrain/ernie/config.py function get_config (line 24) | def get_config(verbose=False): FILE: examples/experiments/ernie_pretrain/ernie/model_config.py class ModelConfig (line 22) | class ModelConfig: FILE: examples/experiments/ernie_pretrain/ernie/pretrain.py function log_trainer_start (line 76) | def log_trainer_start(): function load_huggingface_checkpoint (line 86) | def load_huggingface_checkpoint(model, args): function get_expected_state_dict (line 181) | def get_expected_state_dict(model, **kwargs): function update_model_config_from_args (line 249) | def update_model_config_from_args(config: ErnieMoEConfig, model_args: di... function get_tp_split_ckpt (line 259) | def get_tp_split_ckpt(args, path): class AllArguments (line 271) | class AllArguments(PreTrainingArguments): method __post_init__ (line 272) | def __post_init__(self): class ExpConfig (line 277) | class ExpConfig: function create_pretrained_dataset (line 283) | def create_pretrained_dataset(args): function main (line 329) | def main(): FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/fp8_quant_weight_callback.py function enable_in_dict_config (line 25) | def enable_in_dict_config(config, key): class FP8QuantWeightCallback (line 32) | class FP8QuantWeightCallback(TrainerCallback): method on_step_begin (line 33) | def on_step_begin(self, args, state, control, **kwargs): FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/gc_callback.py class GCCallback (line 20) | class GCCallback(TrainerCallback): method on_train_begin (line 21) | def on_train_begin(self, args, state, control, **kwargs): method on_step_end (line 25) | def on_step_end(self, args, state, control, **kwargs): FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/logging_callback.py class LoggingCallback (line 22) | class LoggingCallback(TrainerCallback): method __init__ (line 23) | def __init__( method on_log (line 28) | def on_log(self, args, state, control, logs=None, **kwargs): FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/moe_correction_bias_adjust_callback.py class MoECorrectionBiasAdjustCallback (line 28) | class MoECorrectionBiasAdjustCallback(TrainerCallback): method __init__ (line 29) | def __init__(self, lr, use_sp): method on_optimizer_end (line 34) | def on_optimizer_end(self, args, state, control, **kwargs): FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/moe_logging_callback.py function tensor_md5 (line 45) | def tensor_md5(tensor): class GlobalRNGCallback (line 51) | class GlobalRNGCallback(TrainerCallback): method on_step_end (line 52) | def on_step_end(self, args, state, control, model, **kwargs): class MoeLoggingCallback (line 57) | class MoeLoggingCallback(TrainerCallback): method __init__ (line 58) | def __init__(self, optimizer): method on_log (line 69) | def on_log(self, args, state, control, logs=None, **kwargs): method on_step_end (line 75) | def on_step_end(self, args, state, control, model, **kwargs): method on_save (line 114) | def on_save(self, args, state, control, model, **kwargs): FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/ortho_loss_callback.py class OrthogonalCallback (line 22) | class OrthogonalCallback(TrainerCallback): method __init__ (line 23) | def __init__(self, ortho_loss_lambda): method on_optimizer_end (line 26) | def on_optimizer_end(self, args, state, control, **kwargs): FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/sp_grad_sync_callback.py class SPGradSyncCallback (line 28) | class SPGradSyncCallback(TrainerCallback): method __init__ (line 29) | def __init__(self, model): method on_optimizer_begin (line 41) | def on_optimizer_begin(self, args, state, control, **kwargs): FILE: examples/experiments/ernie_pretrain/ernie/src/callbacks/tensorboard_callback.py function is_tensorboard_available (line 29) | def is_tensorboard_available(): function rewrite_logs (line 33) | def rewrite_logs(d): class TensorBoardCallback (line 49) | class TensorBoardCallback(TrainerCallback): method __init__ (line 50) | def __init__( method _init_summary_writer (line 91) | def _init_summary_writer(self, args, log_dir=None): method on_train_begin (line 96) | def on_train_begin(self, args, state, control, **kwargs): method on_log (line 120) | def on_log(self, args, state, control, logs=None, **kwargs): method on_train_end (line 183) | def on_train_end(self, args, state, control, **kwargs): FILE: examples/experiments/ernie_pretrain/ernie/src/clip/moe_clip.py class ClipGradForMOEByGlobalNorm (line 28) | class ClipGradForMOEByGlobalNorm(ClipGradBase): method __init__ (line 29) | def __init__( method __str__ (line 49) | def __str__(self): method get_l2_norm_pow (line 53) | def get_l2_norm_pow(params_grads, sum_dtype=None): method _dygraph_clip (line 101) | def _dygraph_clip(self, params_grads): FILE: examples/experiments/ernie_pretrain/ernie/src/lr_schedulers/cosine_lr.py function get_cosine_schedule_with_warmup (line 24) | def get_cosine_schedule_with_warmup( FILE: examples/experiments/ernie_pretrain/ernie/src/lr_schedulers/wsd_lr.py function get_wsd_schedule_with_warmup (line 20) | def get_wsd_schedule_with_warmup( FILE: examples/experiments/ernie_pretrain/ernie/src/tokenizers/tokenization_eb_v2.py class ErnieBotTokenizer (line 29) | class ErnieBotTokenizer(PretrainedTokenizer): method __init__ (line 40) | def __init__( method space_token (line 72) | def space_token(self): method space_token_id (line 76) | def space_token_id(self): method gend_token (line 80) | def gend_token(self): method gend_token_id (line 84) | def gend_token_id(self): method im_start_id (line 88) | def im_start_id(self): method im_end_id (line 92) | def im_end_id(self): method vocab_size (line 96) | def vocab_size(self): method get_vocab (line 99) | def get_vocab(self): method _tokenize (line 104) | def _tokenize(self, text): method _convert_token_to_id (line 107) | def _convert_token_to_id(self, token): method _convert_id_to_token (line 110) | def _convert_id_to_token(self, id): method convert_tokens_to_string (line 113) | def convert_tokens_to_string(self, tokens): method prepare_for_model (line 126) | def prepare_for_model(self, *args, **kwargs): method save_vocabulary (line 131) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st... method tokenize (line 147) | def tokenize(self, text: TextInput, **kwargs) -> List[str]: method _decode (line 169) | def _decode(self, *args, **kwargs): method _pad (line 179) | def _pad( function add_special_tokens (line 239) | def add_special_tokens( FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/data_parallel.py class DataParallel (line 22) | class DataParallel(paddle.DataParallel): method init_reducer (line 23) | def init_reducer(self): function sync_dp_moe_params_across_sharding (line 74) | def sync_dp_moe_params_across_sharding(model: paddle.nn.Layer) -> None: FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/dygraph_optimizer/hybrid_parallel_optimizer.py class HybridParallelClipGrad (line 37) | class HybridParallelClipGrad: method __init__ (line 38) | def __init__(self, clip, hcg, timers=None): method _global_norm (line 53) | def _global_norm( method _dygraph_clip (line 142) | def _dygraph_clip(self, params_grads): method _comm_and_clip (line 277) | def _comm_and_clip( method __getattr__ (line 330) | def __getattr__(self, item): method __call__ (line 333) | def __call__(self, params_grads): class HybridParallelOptimizer (line 337) | class HybridParallelOptimizer(HPBase): method __init__ (line 338) | def __init__(self, optimizer, hcg, strategy): FILE: examples/experiments/ernie_pretrain/ernie/src/trainers/pretraining_trainer.py function distributed_optimizer_maybe_overwrite (line 97) | def distributed_optimizer_maybe_overwrite( class PreTrainingArguments (line 122) | class PreTrainingArguments(TrainingArguments): method use_moe (line 274) | def use_moe(self): # noqa: F811 method use_moe (line 278) | def use_moe(self, value): method need_data (line 283) | def need_data(self): method combine_batch (line 287) | def combine_batch(self): method reeao_dataset_rank (line 291) | def reeao_dataset_rank(self): method reeao_dataset_world_size (line 295) | def reeao_dataset_world_size(self): method __post_init__ (line 298) | def __post_init__(self): class WeightedDistributedSampler (line 412) | class WeightedDistributedSampler(PaddleNLPDistributedBatchSampler): method __init__ (line 413) | def __init__( method set_epoch (line 459) | def set_epoch(self, epoch=0, consumed_samples=0): method gen_data_seq (line 464) | def gen_data_seq(self): method load_data_seq_from_cache (line 477) | def load_data_seq_from_cache(self): method gen_data_seq_weighted (line 490) | def gen_data_seq_weighted(self, num_examples, data_type=None): method roundup_and_shard (line 568) | def roundup_and_shard(self, indices): method __len__ (line 590) | def __len__(self): method __iter__ (line 593) | def __iter__(self): class DummySampler (line 661) | class DummySampler(PaddleNLPDistributedBatchSampler): method __init__ (line 662) | def __init__(self, dataset, batch_size=1, **kwargs): method __len__ (line 665) | def __len__(self): method __iter__ (line 668) | def __iter__(self): class PretrainingTrainer (line 673) | class PretrainingTrainer(Trainer): method __init__ (line 674) | def __init__(self, args=None, model=None, callbacks=[], **kwargs): method autocast_smart_context_manager (line 695) | def autocast_smart_context_manager(self): method _load_optimizer_state (line 727) | def _load_optimizer_state(self, checkpoint): method _save_moe_weights (line 776) | def _save_moe_weights(self, output_dir): method _wrap_model (line 823) | def _wrap_model(self, model, training=True): method _new_gradclip (line 990) | def _new_gradclip(self): method evaluate (line 1036) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre... method prediction_pipeline_step (line 1067) | def prediction_pipeline_step(self, model, inputs, prediction_loss_only... method restore_dataloader_status (line 1073) | def restore_dataloader_status(self): method _get_eval_sampler (line 1118) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]: method _get_train_sampler (line 1128) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: method _maybe_log_save_evaluate (line 1138) | def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_... method create_scheduler (line 1302) | def create_scheduler(self, num_training_steps): method create_optimizer (line 1326) | def create_optimizer(self, lr_scheduler=None): method save_model (line 1388) | def save_model(self, output_dir=None): method _load_rng_state (line 1394) | def _load_rng_state(self, checkpoint): FILE: examples/experiments/ernie_pretrain/ernie/src/utils/logging.py function setup_logger_output_file (line 41) | def setup_logger_output_file(outputpath, local_rank): FILE: examples/experiments/ernie_pretrain/ernie/src/utils/misc.py class SmoothedValue (line 39) | class SmoothedValue: method __init__ (line 40) | def __init__( method update (line 49) | def update(self, value): method global_avg (line 60) | def global_avg(self): method reset (line 63) | def reset(self): class TrainingLogs (line 68) | class TrainingLogs: method __new__ (line 71) | def __new__(cls, *args, **kw): method __init__ (line 76) | def __init__(self): method set_trainer_interval (line 84) | def set_trainer_interval(self, trainer, logging_interval): method global_meters_keys (line 89) | def global_meters_keys(self): method global_meters_keys (line 93) | def global_meters_keys(self, lst): method enable_skip_zero (line 96) | def enable_skip_zero(self, keys=[]): method update (line 104) | def update(self, **kwargs): method is_enabled (line 108) | def is_enabled(self): method __setitem__ (line 111) | def __setitem__(self, k, v): method __getitem__ (line 119) | def __getitem__(self, v): method __getattr__ (line 122) | def __getattr__(self, attr): method dict (line 129) | def dict(self, use_async=False): method reset (line 183) | def reset(self): method take_snapshot (line 188) | def take_snapshot(self): method restore_snapshot (line 191) | def restore_snapshot(self): FILE: examples/experiments/ernie_pretrain/ernie/src/utils/seed_utils.py function set_seed (line 26) | def set_seed(seed): FILE: examples/experiments/ernie_pretrain/ernie/src/utils/training_utils.py function reset_per_device_batch_size (line 20) | def reset_per_device_batch_size(global_batch_size, per_device_train_batc... FILE: examples/experiments/ernie_pretrain/models/comm_utils.py function scatter (line 33) | def scatter(input, group=None, axis=0): function mp_slice (line 51) | def mp_slice(x, indices=None, group=None, axis=0): function all_gather_varlen (line 68) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True): function scatter_varlen (line 90) | def scatter_varlen(x, recv_tensor, indices, src_rank, group, sync_op=True): function all_gather (line 112) | def all_gather(input, group=None, axis=0): function reduce_scatter (line 131) | def reduce_scatter(input, group=None): function subbatch (line 148) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar... function gather_varlen (line 193) | def gather_varlen(input, dst, group, offload_pp_data_chunk_size=0, all_s... function profile (line 293) | def profile(name, use_event=True): FILE: examples/experiments/ernie_pretrain/models/ernie/configuration.py class ErnieMoEConfig (line 60) | class ErnieMoEConfig(PretrainedConfig): method __init__ (line 72) | def __init__( method __setattr__ (line 402) | def __setattr__(self, name: str, value): method register_nonsaveable_keys (line 413) | def register_nonsaveable_keys(self, keys): method use_moe (line 422) | def use_moe(self) -> bool: method to_json_string (line 425) | def to_json_string(self, use_diff: bool = True) -> str: FILE: examples/experiments/ernie_pretrain/models/ernie/modeling.py function get_triangle_upper_mask (line 127) | def get_triangle_upper_mask(x, mask=None): function gqa_qkv_split_func (line 139) | def gqa_qkv_split_func( function gqa_qkv_merge_func (line 169) | def gqa_qkv_merge_func(weight_list, num_attention_heads, num_key_value_h... function parallel_matmul (line 190) | def parallel_matmul( function calc_lm_head_logits (line 231) | def calc_lm_head_logits(config, hidden_states, weight, bias, tensor_para... function finfo (line 261) | def finfo(dtype: paddle.dtype = None): function masked_fill (line 279) | def masked_fill(x, mask, value): function mem_eff_attn (line 284) | def mem_eff_attn(query, key, value, pack_offset, drop_prob=0.0, dtype=pa... function inbatch_pack_offset_to_attn_mask_start_row_indices (line 321) | def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offs... function scaled_dot_product_attention (line 336) | def scaled_dot_product_attention( function _make_causal_mask (line 454) | def _make_causal_mask(input_ids_shape, past_key_values_length, dtype): function _expand_mask (line 468) | def _expand_mask(mask, dtype, tgt_length): class FusedDropoutImpl (line 483) | class FusedDropoutImpl(nn.Layer): method __init__ (line 484) | def __init__(self, prob, mode): method forward (line 491) | def forward(self, x, y): class RMSNorm (line 499) | class RMSNorm(nn.Layer): method __init__ (line 500) | def __init__(self, config): method forward (line 514) | def forward(self, hidden_states): class RotaryEmbedding (line 530) | class RotaryEmbedding(nn.Layer): method __init__ (line 531) | def __init__(self, dim, max_position_embeddings=4096, base=10000): method forward (line 547) | def forward(self, x, seq_len=None): method rotate_half (line 555) | def rotate_half(cls, x): method apply_rotary_pos_emb (line 562) | def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, positio... class RopeEmbeddingLegacy (line 580) | class RopeEmbeddingLegacy(nn.Layer): method __init__ (line 581) | def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_a... method forward (line 588) | def forward(self, seq_length, position_ids=None): method apply_rotary (line 604) | def apply_rotary(self, rp, q, k): method apply_rotary_3d (line 626) | def apply_rotary_3d(self, rp, q, k, position_ids): method forward_single (line 694) | def forward_single(self, position_ids): method apply_rotary_single (line 709) | def apply_rotary_single(x, rope_emb): class ErnieMLP (line 717) | class ErnieMLP(nn.Layer): method __init__ (line 718) | def __init__(self, config): method forward (line 798) | def forward(self, x): class ErnieAttention (line 835) | class ErnieAttention(nn.Layer): method __init__ (line 836) | def __init__(self, config, layer_idx=0): method forward (line 997) | def forward( method rope_attn (line 1095) | def rope_attn( class ErnieDecoderLayer (line 1195) | class ErnieDecoderLayer(nn.Layer): method __init__ (line 1196) | def __init__(self, config, layer_idx=0): method forward (line 1210) | def forward( class ErniePretrainedModel (line 1265) | class ErniePretrainedModel(PretrainedModel): method _get_name_mappings (line 1270) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa... method _get_tensor_parallel_mappings (line 1341) | def _get_tensor_parallel_mappings(cls, config, is_split=True): method _init_weights (line 1426) | def _init_weights(self, layer): class ErnieModel (line 1474) | class ErnieModel(ErniePretrainedModel): method __init__ (line 1475) | def __init__(self, config: ErnieMoEConfig): method get_input_embeddings (line 1502) | def get_input_embeddings(self): method set_input_embeddings (line 1505) | def set_input_embeddings(self, value): method _prepare_decoder_attention_mask (line 1509) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ... method recompute_training (line 1528) | def recompute_training( method forward (line 1558) | def forward( class FusedHeadParallelCrossEntropy (line 1692) | class FusedHeadParallelCrossEntropy(PyLayer): method forward (line 1694) | def forward( method backward (line 1812) | def backward(ctx, loss_all_grad, labels_all_grad): class ErniePretrainingCriterion (line 1930) | class ErniePretrainingCriterion(paddle.nn.Layer): method __init__ (line 1931) | def __init__(self, config, return_tuple=True): method forward (line 1946) | def forward(self, prediction_scores, masked_lm_labels): method forward_impl_with_fused_head_loss_fn (line 2002) | def forward_impl_with_fused_head_loss_fn(self, masked_lm_labels, hidde... method forward_impl_with_calc_logits (line 2037) | def forward_impl_with_calc_logits(self, masked_lm_labels, hidden_state... method loss_impl (line 2049) | def loss_impl(self, prediction_scores, masked_lm_labels): method forward_impl (line 2055) | def forward_impl(self, prediction_scores, masked_lm_labels): class ErnieLMHead (line 2110) | class ErnieLMHead(nn.Layer): method __init__ (line 2111) | def __init__(self, config): method forward (line 2150) | def forward(self, hidden_states, tensor_parallel_output=None): method sharded_state_dict (line 2169) | def sharded_state_dict( class ErnieForCausalLM (line 2178) | class ErnieForCausalLM(ErniePretrainedModel): method __init__ (line 2181) | def __init__(self, config): method _post_init (line 2213) | def _post_init(self, original_init, *args, **kwargs): method get_input_embeddings (line 2222) | def get_input_embeddings(self): method set_input_embeddings (line 2225) | def set_input_embeddings(self, value): method get_output_embeddings (line 2228) | def get_output_embeddings(self): method set_output_embeddings (line 2231) | def set_output_embeddings(self, new_embeddings): method set_decoder (line 2234) | def set_decoder(self, decoder): method get_decoder (line 2237) | def get_decoder(self): method prepare_attention_mask_for_generation (line 2241) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos... method prepare_inputs_for_generation (line 2254) | def prepare_inputs_for_generation( method update_model_kwargs_for_generation (line 2285) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is... method forward (line 2323) | def forward( method sharded_state_dict (line 2382) | def sharded_state_dict(self, *args, **kwargs): FILE: examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py class BaseModelOutputWithPastAndCrossAttentions (line 96) | class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput): class CausalLMOutputWithCrossAttentions (line 103) | class CausalLMOutputWithCrossAttentions(_CausalLMOutput): function get_gate (line 123) | def get_gate( function build_mpdp_group (line 177) | def build_mpdp_group(): function _parse_moe_group (line 198) | def _parse_moe_group( function moe_ep2mp (line 245) | def moe_ep2mp(state_dict: Dict[str, paddle.Tensor], config: ErnieMoEConf... function moe_statedict_cherry_pick (line 294) | def moe_statedict_cherry_pick(state_dict: Dict[str, paddle.Tensor], conf... function moe_statedict_upcycle (line 319) | def moe_statedict_upcycle( class ErnieMoeMLP (line 491) | class ErnieMoeMLP(ErnieMLP): method __init__ (line 492) | def __init__(self, config, is_shared_expert=False): method forward (line 504) | def forward(self, x, use_comm=True): class ErnieMoeDenseExpert (line 566) | class ErnieMoeDenseExpert(nn.Layer): method __init__ (line 567) | def __init__(self, config): method forward (line 615) | def forward(self, x): class BMMLinear (line 642) | class BMMLinear(nn.Layer): method __init__ (line 643) | def __init__(self, experts, d_in, d_out, use_bias=False): method forward (line 651) | def forward(self, x): class ErnieMoeMLPFused (line 657) | class ErnieMoeMLPFused(nn.Layer): method __init__ (line 658) | def __init__(self, config): method __len__ (line 676) | def __len__(self): method __iter__ (line 679) | def __iter__(self): method forward (line 682) | def forward(self, x): class FusedLinearAddNormFunc (line 692) | class FusedLinearAddNormFunc(paddle.autograd.PyLayer): method forward (line 694) | def forward(ctx, x, residual, linear_weight, rms_norm_weight, eps): method backward (line 704) | def backward(ctx, d_rms_norm_out, d_residual_out): class FusedLinearAddNorm (line 723) | class FusedLinearAddNorm(paddle.nn.Layer): method __init__ (line 724) | def __init__(self, hidden_size, eps=1e-6) -> None: method forward (line 742) | def forward(self, x, residual): class FusedRMSLinearFunc (line 746) | class FusedRMSLinearFunc(paddle.autograd.PyLayer): method forward (line 748) | def forward(ctx, x, rms_norm_weight, linear_weight, eps): method backward (line 756) | def backward(ctx, d_qkv): class FusedRMSLinear (line 766) | class FusedRMSLinear(paddle.nn.Layer): method __init__ (line 767) | def __init__(self, hidden_size, eps=1e-6, num_heads=1, num_key_value_h... method forward (line 786) | def forward(self, x): class ErnieMoEAttention (line 790) | class ErnieMoEAttention(ErnieAttention): method __init__ (line 791) | def __init__(self, config, layer_idx): method forward (line 814) | def forward( class FakeMoERouterLoss (line 934) | class FakeMoERouterLoss(PyLayer): method forward (line 936) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss): method backward (line 944) | def backward(ctx, out_grad): class ErnieDecoderLayer (line 953) | class ErnieDecoderLayer(nn.Layer): method __init__ (line 954) | def __init__(self, config, layer_idx): method training (line 1040) | def training(self): method training (line 1044) | def training(self, new): method fp8_quant_weight (line 1050) | def fp8_quant_weight(self): method _init_gate_and_experts (line 1055) | def _init_gate_and_experts(self, layer_idx): method _init_shared_experts (line 1106) | def _init_shared_experts(self): method _init_dense_experts (line 1124) | def _init_dense_experts(self, layer_idx): method forward (line 1147) | def forward( method model_parallel_dropout (line 1231) | def model_parallel_dropout(self): class ErniePretrainedModel (line 1238) | class ErniePretrainedModel(PretrainedModel): method _get_name_mappings (line 1243) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa... method _get_tensor_parallel_mappings (line 1313) | def _get_tensor_parallel_mappings(cls, config, is_split=True): method _init_weights (line 1431) | def _init_weights(self, layer): class ErnieModel (line 1510) | class ErnieModel(ErniePretrainedModel): method __init__ (line 1511) | def __init__(self, config: ErnieMoEConfig): method get_input_embeddings (line 1576) | def get_input_embeddings(self): method set_input_embeddings (line 1579) | def set_input_embeddings(self, value): method _prepare_decoder_attention_mask (line 1583) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ... method recompute_training (line 1602) | def recompute_training( method forward (line 1657) | def forward( class ErniePretrainingCriterion (line 1890) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase): method __init__ (line 1891) | def __init__(self, config, return_tuple=True): method forward (line 1906) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non... class ErnieMoEForCausalLM (line 1959) | class ErnieMoEForCausalLM(ErniePretrainedModel): method __init__ (line 1962) | def __init__(self, config): method _post_init (line 1987) | def _post_init(self, original_init, *args, **kwargs): method set_state_dict (line 2019) | def set_state_dict(self, state_dict, *args, **kwargs): method get_input_embeddings (line 2037) | def get_input_embeddings(self): method set_input_embeddings (line 2040) | def set_input_embeddings(self, value): method get_output_embeddings (line 2043) | def get_output_embeddings(self): method set_output_embeddings (line 2046) | def set_output_embeddings(self, new_embeddings): method set_decoder (line 2049) | def set_decoder(self, decoder): method get_decoder (line 2052) | def get_decoder(self): method prepare_attention_mask_for_generation (line 2056) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos... method prepare_inputs_for_generation (line 2069) | def prepare_inputs_for_generation( method update_model_kwargs_for_generation (line 2101) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is... method forward (line 2140) | def forward( method sharded_state_dict (line 2209) | def sharded_state_dict(self, *args, **kwargs): FILE: examples/experiments/ernie_pretrain/models/ernie/modeling_pp.py class ErnieEmbeddingPipe (line 90) | class ErnieEmbeddingPipe(nn.Layer): method __init__ (line 91) | def __init__(self, config): method embedding_weight (line 107) | def embedding_weight(self): method forward (line 110) | def forward(self, args): class MTPEmbeddingPipe (line 204) | class MTPEmbeddingPipe(ErnieEmbeddingPipe): method __init__ (line 205) | def __init__(self, config): method embedding_weight (line 209) | def embedding_weight(self): method forward (line 212) | def forward(self, args): class EmptyLayer (line 231) | class EmptyLayer(nn.Layer): method __init__ (line 232) | def __init__(self): method forward (line 235) | def forward(self, x): class ErnieDecoderLayerPipe (line 239) | class ErnieDecoderLayerPipe(ErnieDecoderLayer): method __init__ (line 240) | def __init__(self, config, layer_idx, use_full_recompute=False): method forward (line 247) | def forward(self, args): class RMSNormPipe (line 351) | class RMSNormPipe(RMSNorm): method __init__ (line 352) | def __init__(self, config): method forward (line 357) | def forward(self, args): class ErnieMoELMHeadPipe (line 385) | class ErnieMoELMHeadPipe(ErnieMoELMHead): method forward (line 386) | def forward(self, args): class MTPLayer (line 397) | class MTPLayer(nn.Layer): method __init__ (line 398) | def __init__(self, config): method forward (line 432) | def forward(self, args): method forward_impl (line 441) | def forward_impl(self, *args): class ErniePretrainingCriterionPipe (line 518) | class ErniePretrainingCriterionPipe(ErniePretrainingCriterion): method __init__ (line 519) | def __init__(self, config): method forward (line 522) | def forward(self, logits, labels): class PipelinePretrainedModel (line 536) | class PipelinePretrainedModel(PretrainedModel): method __init__ (line 537) | def __init__(self, config, *args, **kwargs): method init (line 541) | def init(self, config, *args, **kwargs): method add_sequential_layer (line 546) | def add_sequential_layer(self, layer_desc, name_prefix=""): method get_sequential_layers (line 549) | def get_sequential_layers(self): method get_sequential_name_prefixs (line 552) | def get_sequential_name_prefixs(self): method get_shardlayer_prefix (line 555) | def get_shardlayer_prefix(self, name_splited): method _set_pipeline_name_mapping (line 566) | def _set_pipeline_name_mapping(self, mappings=None): method _check_shared_model_state (line 626) | def _check_shared_model_state(self): method state_dict (line 647) | def state_dict(self, *args, **kwargs): method _init_weights (line 659) | def _init_weights(self, layer): method sharded_state_dict (line 731) | def sharded_state_dict(self, *args, **kwargs): function get_pp_vp_split_layers (line 765) | def get_pp_vp_split_layers(config): class ErnieMoEForCausalLMPipe (line 797) | class ErnieMoEForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): method _prepare_pipeline_inputs_func (line 810) | def _prepare_pipeline_inputs_func(cls, data): method __init__ (line 844) | def __init__( method get_loss_fn (line 995) | def get_loss_fn(self, config): method rename_model_params (line 998) | def rename_model_params(self, func): method fp8_quant_weight (line 1005) | def fp8_quant_weight(self): method _post_init (line 1011) | def _post_init(self, original_init, *args, **kwargs): method set_state_dict (line 1028) | def set_state_dict(self, state_dict, *args, **kwargs): FILE: examples/experiments/ernie_pretrain/models/fp8_linear.py function fp8_gemm (line 44) | def fp8_gemm( function padding (line 94) | def padding(x, axis): class Fp8FusedMlpFunc (line 118) | class Fp8FusedMlpFunc(paddle.autograd.PyLayer): method forward (line 128) | def forward(ctx, x, w1, w2): method backward (line 208) | def backward(ctx, do3): class MemEfficientFp8FusedMlpFunc (line 320) | class MemEfficientFp8FusedMlpFunc(paddle.autograd.PyLayer): method forward (line 333) | def forward(ctx, x, w1, w2): method backward (line 393) | def backward(ctx, do3): class Fp8FusedMlp (line 515) | class Fp8FusedMlp(paddle.nn.Layer): method __init__ (line 526) | def __init__(self, config): method forward (line 557) | def forward(self, x): FILE: examples/experiments/ernie_pretrain/models/moe/moe_layer.py function set_grad_in_dtype_non_consistent (line 65) | def set_grad_in_dtype_non_consistent(ctx): class Fp8MoeGateDispatchAndQuant (line 71) | class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer): method forward (line 75) | def forward( method backward (line 127) | def backward(ctx, *grads): function recompute_fwd_gate_up_func (line 146) | def recompute_fwd_gate_up_func(config, layer_idx): class MoEStatics (line 156) | class MoEStatics(nn.Layer): method __init__ (line 157) | def __init__(self, config, layer_idx): class GateCombine (line 188) | class GateCombine(PyLayer): method forward (line 190) | def forward(ctx, x, combine_weights, scatter_index): method backward (line 198) | def backward(ctx, grad_y, *_): class FusionFP8Expert (line 207) | class FusionFP8Expert(paddle.autograd.PyLayer): method forward (line 209) | def forward(ctx, hidden_states, custom_map): method backward (line 226) | def backward(ctx, output_grad): class AlltoAll (line 237) | class AlltoAll(PyLayer): method forward (line 239) | def forward(ctx, x, group, sync_op=True): method backward (line 252) | def backward(ctx, *dx): class AlltoAllExpertOverlap (line 256) | class AlltoAllExpertOverlap(PyLayer): method forward (line 258) | def forward(ctx, input, group, num_local_experts, forward_func_dict, i... method backward (line 294) | def backward(ctx, out_grad): class AlltoAllAsync (line 313) | class AlltoAllAsync(PyLayer): method forward (line 315) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): method backward (line 336) | def backward(ctx, dx_out, *fn_out_grads): function dispatching (line 356) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity): function combining_fused (line 383) | def combining_fused(x, combine_weights, scatter_index, hard_gate=False): class ReshapeKeepGradDtype (line 392) | class ReshapeKeepGradDtype(PyLayer): method forward (line 394) | def forward(ctx, x, shape): method backward (line 400) | def backward(ctx, grad): class MOELayer (line 404) | class MOELayer(nn.Layer): method __init__ (line 412) | def __init__( method forward_experts (line 515) | def forward_experts(self, dispatched_input): method fp8_quant_weight (line 551) | def fp8_quant_weight(self): method fused_gate_logits_process (line 582) | def fused_gate_logits_process(self, gate_logits, token_type_ids, offlo... method gate_distpach_and_quant (line 595) | def gate_distpach_and_quant(self, input, token_type_ids): method gate_and_distpach (line 680) | def gate_and_distpach(self, input, token_type_ids): method _calc_router_loss (line 792) | def _calc_router_loss( method calc_router_loss_and_logging (line 823) | def calc_router_loss_and_logging( method combine_expert_output (line 846) | def combine_expert_output(self, expert_output, combine_weights, scatte... method forward_single_stage (line 854) | def forward_single_stage(self, dispatched_input, stage_id): method all2all_expert_overlap (line 858) | def all2all_expert_overlap(self, x, group): method forward (line 885) | def forward( method sharded_state_dict (line 1059) | def sharded_state_dict( class FP8FusedWLCHFunc (line 1071) | class FP8FusedWLCHFunc(paddle.autograd.PyLayer): method forward (line 1073) | def forward( method backward (line 1126) | def backward(ctx, output_grad): class MlpNode (line 1141) | class MlpNode: method __init__ (line 1142) | def __init__(self, custom_map, max_topk, recompute_fwd_gate_up=False, ... method reset_status (line 1157) | def reset_status(self): method release_mem (line 1165) | def release_mem(self): method forward (line 1170) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro... method backward (line 1211) | def backward(self, hidden_states_out_grad): class Fp8FusedMoeFunc (line 1235) | class Fp8FusedMoeFunc(paddle.autograd.PyLayer): method forward (line 1237) | def forward( method backward (line 1261) | def backward(ctx, output_grad): FILE: examples/experiments/ernie_pretrain/models/moe/token_dispatcher/fp8_utils.py function _get_fp8_weight_and_scale (line 43) | def _get_fp8_weight_and_scale(weight, stacked=False, transpose=False): function fused_stack_transpose_quant (line 63) | def fused_stack_transpose_quant(weight_list, transpose=False): function split_group_gemm (line 81) | def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, ... function has_config (line 119) | def has_config(config_map, key): class ExpertsGroupGemmNode (line 123) | class ExpertsGroupGemmNode: method __init__ (line 136) | def __init__(self, experts, custom_map, name="moe_experts_node"): method reset_status (line 160) | def reset_status(self): method fwd_gate_up (line 166) | def fwd_gate_up(self, x_bf16, expert_w1, expert_w_count, tokens_per_ex... method fwd_swiglu (line 223) | def fwd_swiglu(self, o1): method fwd_down (line 239) | def fwd_down(self, o1, unzipped_probs, expert_w_count, tokens_per_expe... method fwd_down_no_probs (line 300) | def fwd_down_no_probs(self, o1, expert_w2, expert_w_count, tokens_per_... method bwd_down_input (line 358) | def bwd_down_input(self, expert_w2, unzipped_grad, tokens_per_expert, ... method bwd_down_input_no_prob (line 428) | def bwd_down_input_no_prob(self, expert_w2, unzipped_grad, tokens_per_... method bwd_swiglu (line 468) | def bwd_swiglu(self, o1, do2): method bwd_gate_up_input (line 485) | def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, expecte... method bwd_down_weight (line 543) | def bwd_down_weight(self, out_grad, o2, expert_w2): method bwd_gate_up_weight (line 627) | def bwd_gate_up_weight(self, do1, input_x, expert_w1): method forward (line 706) | def forward(self, hs_out, unzipped_probs, tokens_per_expert): method backward (line 725) | def backward(self, out_grad, tokens_per_expert, dispatched_indices, ex... method forward_no_prob (line 740) | def forward_no_prob(self, hs_out, tokens_per_expert): method backward_no_prob (line 752) | def backward_no_prob(self, out_grad, tokens_per_expert): class ExpertsGroupGemmContiguousNode (line 774) | class ExpertsGroupGemmContiguousNode: method __init__ (line 787) | def __init__( method reset_status (line 832) | def reset_status(self): method gen_m_indices (line 841) | def gen_m_indices(self, tokens_per_expert): method fwd_gate_up (line 862) | def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, sca... method fwd_swiglu (line 936) | def fwd_swiglu(self, o1): method fwd_down (line 940) | def fwd_down(self, o1, unzipped_probs, expert_w2, num_expert): method bwd_down_input (line 1006) | def bwd_down_input(self, expert_w2, unzipped_grad, o1): method bwd_swiglu (line 1083) | def bwd_swiglu(self, o1, do2): method bwd_gate_up_input (line 1087) | def bwd_gate_up_input(self, do1, expert_w1): method fused_transpose_split_quant (line 1144) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal... method bwd_down_weight (line 1169) | def bwd_down_weight(self, do3, o2, expert_w2): method bwd_gate_up_weight (line 1245) | def bwd_gate_up_weight(self, do1, input_x, expert_w1): method forward (line 1309) | def forward( method backward (line 1333) | def backward(self, out_grad, a2a_async_fn=None): class ExpertsGroupGemmWLCHNode (line 1426) | class ExpertsGroupGemmWLCHNode(ExpertsGroupGemmContiguousNode): method __init__ (line 1442) | def __init__( method gen_m_indices (line 1478) | def gen_m_indices(self, tokens_per_expert): method fused_transpose_split_quant (line 1498) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal... FILE: examples/experiments/ernie_pretrain/models/moe/token_dispatcher/moe_utils.py function inplace_offload (line 24) | def inplace_offload(x): function inplace_offload_if_needed (line 41) | def inplace_offload_if_needed(x, threshold=2 * 1024 * 1024 * 1024): function topk_to_permuted_indices_single (line 61) | def topk_to_permuted_indices_single(x, num_tokens, expert_id, topk): function topk_to_permuted_indices (line 81) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk): function permute (line 105) | def permute( function unpermute (line 128) | def unpermute( class UnZipNode (line 163) | class UnZipNode: method __init__ (line 178) | def __init__(self, token_dispatcher, name="unzip"): method reset_status (line 190) | def reset_status(self): method forward (line 196) | def forward( method backward (line 240) | def backward(self, dx, hidden_states_out_grad, probs_grad, dispatched_... class ZipNode (line 268) | class ZipNode: method __init__ (line 281) | def __init__(self, token_dispatcher, name="zip"): method forward (line 292) | def forward( method backward (line 326) | def backward( FILE: examples/experiments/ernie_pretrain/models/moe/top2_gate.py function cal_aux_loss_func (line 35) | def cal_aux_loss_func( function masked_fill (line 85) | def masked_fill(x, mask, value): class CalAuxLossFunctor (line 90) | class CalAuxLossFunctor(paddle.autograd.PyLayer): method forward (line 92) | def forward( method backward (line 122) | def backward(ctx, out_grad): function cast_if_needed (line 130) | def cast_if_needed(x, dtype): class FusedGateDetachMatmul (line 134) | class FusedGateDetachMatmul(paddle.autograd.PyLayer): method forward (line 136) | def forward(ctx, x, w): method backward (line 142) | def backward(ctx, y_grad): function gate_detach_matmul (line 155) | def gate_detach_matmul(x, weight, use_fuse): function compute_optimal_transport (line 164) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:... class Top2Gate (line 178) | class Top2Gate(nn.Layer): method __init__ (line 191) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->... method _create_gate_parameter (line 259) | def _create_gate_parameter(self): method forward (line 269) | def forward( method get_capacity (line 301) | def get_capacity(self, num_tokens, cap_factor=None): method top2_gating (line 316) | def top2_gating(self, logits, cap=None, correction_bias=None): method _cal_aux_loss (line 388) | def _cal_aux_loss( method _cal_orthogonal_loss (line 433) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None): method _cal_orthogonal_loss_opt_each_weight (line 448) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): function cal_orthogonal_loss_opt_each_weight_func (line 455) | def cal_orthogonal_loss_opt_each_weight_func(weight, moe_k, use_group, e... class TopKGateFused (line 473) | class TopKGateFused(Top2Gate): method forward (line 474) | def forward( FILE: examples/experiments/ernie_pretrain/models/sequence_parallel_utils.py function get_hcg (line 44) | def get_hcg(): function get_async_loader (line 51) | def get_async_loader(): function hack_offload_wait (line 64) | def hack_offload_wait(task): function hack_reload_wait (line 68) | def hack_reload_wait(task): class ScatterOp (line 72) | class ScatterOp(PyLayer): method forward (line 74) | def forward(ctx, input, axis=0, group=None): method backward (line 80) | def backward(ctx, grad): class GatherOp (line 84) | class GatherOp(PyLayer): method forward (line 86) | def forward(ctx, input, axis=0, group=None): method backward (line 92) | def backward(ctx, grad): class AllGatherOp (line 96) | class AllGatherOp(PyLayer): method forward (line 98) | def forward(ctx, input, group=None): method backward (line 103) | def backward(ctx, grad): class ReduceScatterOp (line 107) | class ReduceScatterOp(PyLayer): method forward (line 109) | def forward(ctx, input, group=None): method backward (line 115) | def backward(ctx, grad): class AllGatherVarlenOp (line 119) | class AllGatherVarlenOp(PyLayer): method forward (line 121) | def forward(ctx, input, group=None): method backward (line 160) | def backward(ctx, grad): class GemmReduceScatterOp (line 174) | class GemmReduceScatterOp(PyLayer): method forward (line 176) | def forward(ctx, input, weight, group): method backward (line 183) | def backward(ctx, grad): class AllGatherGemmOp (line 204) | class AllGatherGemmOp(PyLayer): method forward (line 206) | def forward(ctx, input, weight, group): method backward (line 214) | def backward(ctx, grad): function sequence_parallel_sparse_mask_labels (line 231) | def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100): function mark_as_sequence_parallel_parameter (line 247) | def mark_as_sequence_parallel_parameter(parameter): function is_sequence_parallel_parameter (line 251) | def is_sequence_parallel_parameter(parameter): function create_fused_allreduce_gradient_hook (line 255) | def create_fused_allreduce_gradient_hook(parameter_list, accumulation_st... function create_non_fused_allreduce_gradient_hook (line 272) | def create_non_fused_allreduce_gradient_hook(param, model, verbose=False): function register_sequence_parallel_allreduce_hooks (line 295) | def register_sequence_parallel_allreduce_hooks(model, fuse_sequence_para... function is_fused_matmul_bias_supported (line 318) | def is_fused_matmul_bias_supported(): class ColumnSequenceParallelLinear (line 334) | class ColumnSequenceParallelLinear(Layer): method __init__ (line 335) | def __init__( method forward (line 427) | def forward(self, x, use_comm=True): method sharded_state_dict (line 447) | def sharded_state_dict( class MPScale (line 455) | class MPScale(PyLayer): method forward (line 457) | def forward(ctx, x, mp_degree): method backward (line 462) | def backward(ctx, dout): class RowSequenceParallelLinear (line 466) | class RowSequenceParallelLinear(Layer): method __init__ (line 467) | def __init__( method forward (line 563) | def forward(self, x): method sharded_state_dict (line 594) | def sharded_state_dict( FILE: examples/experiments/ernie_pretrain/models/utils.py function get_global_training_logs (line 31) | def get_global_training_logs(): function global_training_logs_enabled (line 47) | def global_training_logs_enabled(): function inplace_offload (line 52) | def inplace_offload(tensor): function detach_and_requires_grad_ (line 57) | def detach_and_requires_grad_(*args): class FakeClone (line 65) | class FakeClone(paddle.autograd.PyLayer): method forward (line 67) | def forward(ctx, input): method backward (line 76) | def backward(ctx, grad_output): function manual_backward (line 80) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]): class FakeGather (line 118) | class FakeGather(paddle.autograd.PyLayer): method forward (line 120) | def forward(ctx, input, indices): method backward (line 130) | def backward(ctx, grad_output): class FusedUnpermutation (line 139) | class FusedUnpermutation(paddle.autograd.PyLayer): method forward (line 141) | def forward( method backward (line 178) | def backward(ctx, output_tokens_grad): FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/convert_sharded_to_uc.py function parse_args (line 35) | def parse_args(): function convert_ckpt (line 53) | def convert_ckpt(args): FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/gather_all_ckpt.py function parse_args (line 20) | def parse_args(): function parse_path (line 39) | def parse_path(args): function get_ip_list (line 50) | def get_ip_list(args): function gather_ckpt (line 63) | def gather_ckpt(org_path, tgt_path, hostnames, local_host): FILE: examples/experiments/ernie_pretrain/tools/sharded_to_uc/merge_sharding_ep.py class Timer (line 53) | class Timer: method __init__ (line 54) | def __init__(self, name="name"): method __enter__ (line 57) | def __enter__(self): method __exit__ (line 61) | def __exit__(self, exc_type, exc_val, exc_tb): function strtobool (line 66) | def strtobool(s): function execute_cmd (line 76) | def execute_cmd(cmd, ignore_error=False): function parse_args (line 85) | def parse_args(): function save_ckpt (line 100) | def save_ckpt(ckpt, save_dir, rank_info, mp_degree, pp_degree=0, ep_degr... class Client (line 127) | class Client: method __init__ (line 128) | def __init__(self, args, base_path, nproc_per_node=8, nnodes=1, node_r... method _get_expert_param_shape (line 174) | def _get_expert_param_shape(self, meta): method _expert_id (line 183) | def _expert_id(self, s_name): method _global_expert_id (line 191) | def _global_expert_id(self, local_id, ep_rank): method _get_num_experts_per_rank (line 194) | def _get_num_experts_per_rank(self): method _gen_node_id_map (line 205) | def _gen_node_id_map(self): method _modify_expert_id (line 233) | def _modify_expert_id(self, s_name, new_id): method merge_and_save (line 243) | def merge_and_save( method _merge_sharding_for_dense_params (line 311) | def _merge_sharding_for_dense_params(self, parallel_2_ckpt_map, ignore... method _replicate_fused_param (line 334) | def _replicate_fused_param(self, local_params, indices_or_sections, co... method _replicate_dense_params (line 346) | def _replicate_dense_params(self, dense_params): method _merge_sharding_for_expert_params (line 359) | def _merge_sharding_for_expert_params(self, parallel_2_ckpt_map, ignor... method _extend_ep_degree_for_expert_params (line 383) | def _extend_ep_degree_for_expert_params(self, expert_params, dst_ep_de... method _get_final_ckpts (line 404) | def _get_final_ckpts( method _read_ckpts (line 467) | def _read_ckpts(self, args): method _read_ckpt (line 480) | def _read_ckpt(self, mp, pp, sd, include_opt_state): method _read_all_ckpts_by_pp_stage (line 483) | def _read_all_ckpts_by_pp_stage(self, pp_stage, include_opt_state=False): method _merge_and_save (line 504) | def _merge_and_save(self, mp_rank, save_dir, include_opt_state, ignore... method _merge_pp_ckpts (line 549) | def _merge_pp_ckpts(self, rank_info, ckpts, is_opt): method _get_param_meta (line 564) | def _get_param_meta(self, mp_rank, ep_rank=None): method _merge_sharding_param_ckpts (line 584) | def _merge_sharding_param_ckpts( method _concat_crop_reshape (line 620) | def _concat_crop_reshape(self, arrs, shape, name, ignore_sharding_padd... method _get_opt_state_key_and_type (line 639) | def _get_opt_state_key_and_type(self, name): method _merge_sharding_opt_ckpts (line 664) | def _merge_sharding_opt_ckpts(self, mp_rank, ckpts, ignore_sharding_pa... method _cal_ep_rank (line 759) | def _cal_ep_rank(self, sd_rank, mp_rank): method load_ckpt (line 764) | def load_ckpt(self, mp_rank, pp_rank, sharding_rank, include_opt_state): method weight_suffix (line 821) | def weight_suffix(self, mp_rank, pp_rank, sharding_rank): method load_model_meta (line 834) | def load_model_meta(self): method move_useful_file (line 841) | def move_useful_file(self, save_dir): function merge_and_save (line 851) | def merge_and_save(args): FILE: examples/experiments/ernie_pretrain/tools/uc_to_sharded/convert_uc_to_sharded.py function parse_args (line 31) | def parse_args(): function find_files (line 40) | def find_files(path, suffixes): class Checkpoint (line 56) | class Checkpoint: method __init__ (line 57) | def __init__(self, args): method map_to_org_model (line 95) | def map_to_org_model(self, layer_name): method load_from_org_model (line 104) | def load_from_org_model(self, layer_name): method process_one_pdparam (line 122) | def process_one_pdparam(self, pdparam_path): method process_pdparams (line 134) | def process_pdparams(self): method load_from_org_model_with_tensor_name (line 138) | def load_from_org_model_with_tensor_name(self, tensor_name, structure_... method process_one_pdopt (line 184) | def process_one_pdopt(self, pdopt_path): method process_pdopts (line 246) | def process_pdopts(self): function convert_ckpt (line 251) | def convert_ckpt(args): FILE: examples/experiments/paddlefleet/glm45_provider.py class GLMMoEModelProvider (line 31) | class GLMMoEModelProvider(GPTModelProvider): class GLM45ModelProvider355B (line 88) | class GLM45ModelProvider355B(GLMMoEModelProvider): class GLM45AirModelProvider106B (line 107) | class GLM45AirModelProvider106B(GLMMoEModelProvider): class GLM45AirModelDebugProvider (line 127) | class GLM45AirModelDebugProvider(GLM45AirModelProvider106B): class GLM45AirModelDebugProviderFP8 (line 148) | class GLM45AirModelDebugProviderFP8(GLM45AirModelDebugProvider): class GLM45AirModelSingleCardDebugProvider (line 154) | class GLM45AirModelSingleCardDebugProvider(GLMMoEModelProvider): FILE: examples/experiments/paddlefleet/qwen_provider.py class Qwen3MoEModelProvider (line 31) | class Qwen3MoEModelProvider(GPTModelProvider): class Qwen3MoEModelProvider30B_A3B (line 75) | class Qwen3MoEModelProvider30B_A3B(Qwen3MoEModelProvider): class Qwen3MoEModelSingleCardProvider (line 89) | class Qwen3MoEModelSingleCardProvider(Qwen3MoEModelProvider): FILE: examples/experiments/paddlefleet/run_pretrain.py class PreTrainingArguments (line 65) | class PreTrainingArguments(TrainingArguments): method __post_init__ (line 118) | def __post_init__(self): class DataArguments (line 175) | class DataArguments: class ModelArguments (line 208) | class ModelArguments: function create_pretrained_dataset (line 246) | def create_pretrained_dataset( function get_train_data_file (line 357) | def get_train_data_file(args): class PretrainingTrainer (line 382) | class PretrainingTrainer(Trainer): method __init__ (line 383) | def __init__(self, *args, **kwargs): method evaluate (line 387) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre... method _get_eval_sampler (line 427) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]: method _get_train_sampler (line 437) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: function _set_random_seed (line 448) | def _set_random_seed( function main (line 474) | def main(): FILE: examples/tools/create_pretraining_data.py function print_datetime (line 40) | def print_datetime(string): function get_args (line 45) | def get_args(): function lexical_analysis_fn (line 100) | def lexical_analysis_fn(): function chinese_segmentation_fn (line 112) | def chinese_segmentation_fn(): function jieba_segmentation_fn (line 124) | def jieba_segmentation_fn(): function get_whole_word_mask_tokens (line 134) | def get_whole_word_mask_tokens(tokens, words, max_word_length=6): class IdentitySplitter (line 199) | class IdentitySplitter(object): method tokenize (line 200) | def tokenize(self, *text): class NewlineSplitter (line 204) | class NewlineSplitter: method tokenize (line 205) | def tokenize(self, text): class Converter (line 209) | class Converter(object): method __init__ (line 210) | def __init__(self, args): method initializer (line 213) | def initializer(self): method remove_repeated_chars (line 269) | def remove_repeated_chars(text, max_repeated_len=100): method encode (line 284) | def encode(self, json_line): function main (line 306) | def main(): FILE: examples/tools/gpt-oss_weight_change/change_weight_dtype.py function find_safetensors_files (line 46) | def find_safetensors_files(directory): function endswith (line 55) | def endswith(key, prefix_list): function save_single_safetenors (line 62) | def save_single_safetenors(save_path, state_dict, rank, total_files_size... function fp4_to_bf16 (line 73) | def fp4_to_bf16(load_path, save_path): function bf16_to_fp4 (line 96) | def bf16_to_fp4(load_path, save_path): FILE: examples/tools/merge.py function print_datetime (line 25) | def print_datetime(string): function merge_sft_datasets (line 30) | def merge_sft_datasets(input_dirs, output_dir): function main (line 120) | def main(args): FILE: examples/tools/trans_paddlenlp2hf.py function parse_arguments (line 28) | def parse_arguments(): function load_safetensors_state_dict (line 44) | def load_safetensors_state_dict(input_dir): function trans_paddlenlp2hf (line 61) | def trans_paddlenlp2hf(): FILE: paddleformers/__init__.py function compare_version (line 32) | def compare_version(v1, v2): function _check_dependency_versions (line 42) | def _check_dependency_versions(): FILE: paddleformers/cli/cli.py function main (line 57) | def main(): FILE: paddleformers/cli/export/export.py function check_download_repo (line 33) | def check_download_repo(model_name_or_path, download_hub=None): function logger_merge_config (line 52) | def logger_merge_config(merge_config, lora_merge): function run_export (line 83) | def run_export(args: Optional[dict[str, Any]] = None) -> None: FILE: paddleformers/cli/hparams/data_args.py class DataArguments (line 19) | class DataArguments: FILE: paddleformers/cli/hparams/export_args.py class ExportArguments (line 19) | class ExportArguments: FILE: paddleformers/cli/hparams/finetuning_args.py class PreTrainingArguments (line 28) | class PreTrainingArguments(TrainingArguments): method need_data (line 102) | def need_data(self): method reeao_dataset_rank (line 120) | def reeao_dataset_rank(self): method reeao_dataset_world_size (line 138) | def reeao_dataset_world_size(self): class VLSFTTrainingArguments (line 148) | class VLSFTTrainingArguments(PreTrainingArguments): class SFTTrainingArguments (line 156) | class SFTTrainingArguments(TrainingArguments): class DPOTrainingArguments (line 169) | class DPOTrainingArguments(TrainingArguments): class FinetuningArguments (line 232) | class FinetuningArguments( method __post_init__ (line 300) | def __post_init__(self): FILE: paddleformers/cli/hparams/generating_args.py class StreamOptions (line 18) | class StreamOptions: method __init__ (line 25) | def __init__(self, max_count: int = 100): class GeneratingArguments (line 35) | class GeneratingArguments: FILE: paddleformers/cli/hparams/model_args.py class VisionArguments (line 20) | class VisionArguments: class FP8MemConfigs (line 38) | class FP8MemConfigs: class FP8FusedOpsConfigs (line 47) | class FP8FusedOpsConfigs: class ErniePretrainArgument (line 56) | class ErniePretrainArgument: class ModelArguments (line 80) | class ModelArguments: method __post_init__ (line 234) | def __post_init__(self): FILE: paddleformers/cli/hparams/parser.py function _load_custom_template (line 84) | def _load_custom_template(custom_path): function read_args (line 94) | def read_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -... function _parse_args (line 115) | def _parse_args( function _parse_train_args (line 152) | def _parse_train_args(args: Optional[Union[dict[str, Any], list[str]]] =... function _parse_eval_args (line 166) | def _parse_eval_args(args: Optional[Union[dict[str, Any], list[str]]] = ... function _parse_server_args (line 180) | def _parse_server_args(args: Optional[Union[dict[str, Any], list[str]]] ... function _parse_export_args (line 194) | def _parse_export_args(args: Optional[Union[dict[str, Any], list[str]]] ... function get_train_args (line 208) | def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = No... function get_eval_args (line 260) | def get_eval_args(args: Optional[Union[dict[str, Any], list[str]]] = Non... function get_server_args (line 273) | def get_server_args(args: Optional[Union[dict[str, Any], list[str]]] = N... function get_export_args (line 286) | def get_export_args(args: Optional[Union[dict[str, Any], list[str]]] = N... FILE: paddleformers/cli/hparams/preprocess_args.py class BasePreprocessArguments (line 25) | class BasePreprocessArguments: method __post_init__ (line 26) | def __post_init__(self): class UtteranceProcessorArguments (line 31) | class UtteranceProcessorArguments(BasePreprocessArguments): method __post_init__ (line 39) | def __post_init__(self): class CoarseProcessorArguments (line 46) | class CoarseProcessorArguments(BasePreprocessArguments): method __post_init__ (line 57) | def __post_init__(self): class InputIdsMassageArguments (line 64) | class InputIdsMassageArguments(BasePreprocessArguments): method __post_init__ (line 92) | def __post_init__(self): class ImageModificationProcessorArguments (line 102) | class ImageModificationProcessorArguments(BasePreprocessArguments): method __post_init__ (line 112) | def __post_init__(self): class End2EndProcessorArgumentsHelper (line 117) | class End2EndProcessorArgumentsHelper(BasePreprocessArguments): method __post_init__ (line 124) | def __post_init__(self): class End2EndProcessorArguments (line 129) | class End2EndProcessorArguments( method __post_init__ (line 136) | def __post_init__(self): FILE: paddleformers/cli/hparams/server_args.py class ServerArguments (line 19) | class ServerArguments: FILE: paddleformers/cli/launcher.py function launch (line 21) | def launch(): FILE: paddleformers/cli/train/auto_parallel/workflow.py function create_pretrained_dataset (line 42) | def create_pretrained_dataset( function get_train_data_file (line 116) | def get_train_data_file(args): class PretrainingTrainer (line 141) | class PretrainingTrainer(Trainer): method __init__ (line 142) | def __init__(self, *args, **kwargs): function run_auto_parallel (line 147) | def run_auto_parallel(model_args, data_args, generating_args, training_a... FILE: paddleformers/cli/train/deepseek_v3_pretrain/configuration.py class DeepseekV2FastConfig (line 22) | class DeepseekV2FastConfig(PretrainedConfig): method __init__ (line 131) | def __init__( FILE: paddleformers/cli/train/deepseek_v3_pretrain/fp8_linear.py function fp8_linear (line 54) | def fp8_linear( function register_scale (line 95) | def register_scale(self): class Linear (line 109) | class Linear(PD_Linear): method __init__ (line 110) | def __init__(self, *args, **kwargs): class ColumnParallelLinear (line 116) | class ColumnParallelLinear(PD_ColumnParallelLinear): method __init__ (line 117) | def __init__(self, *args, **kwargs): class RowParallelLinear (line 123) | class RowParallelLinear(PD_RowParallelLinear): method __init__ (line 124) | def __init__(self, *args, **kwargs): class ColumnSequenceParallelLinear (line 130) | class ColumnSequenceParallelLinear(PD_ColumnSequenceParallelLinear): method __init__ (line 131) | def __init__(self, *args, **kwargs): class RowSequenceParallelLinear (line 137) | class RowSequenceParallelLinear(PD_RowSequenceParallelLinear): method __init__ (line 138) | def __init__(self, *args, **kwargs): FILE: paddleformers/cli/train/deepseek_v3_pretrain/kernel.py function act_quant_kernel (line 30) | def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr): function act_quant (line 51) | def act_quant(x: paddle.Tensor, block_size: int = 128) -> Tuple[paddle.T... function weight_dequant_kernel (line 74) | def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.cons... function weight_dequant (line 100) | def weight_dequant(x: paddle.Tensor, s: paddle.Tensor, block_size: int =... function fp8_gemm_kernel (line 130) | def fp8_gemm_kernel( function fp8_gemm (line 190) | def fp8_gemm(a: paddle.Tensor, a_s: paddle.Tensor, b: paddle.Tensor, b_s... FILE: paddleformers/cli/train/deepseek_v3_pretrain/modeling.py function swiglu (line 110) | def swiglu(x, y=None): function get_use_casual_mask (line 134) | def get_use_casual_mask(): function set_global_step (line 139) | def set_global_step(cur_step): function get_global_step (line 144) | def get_global_step(): function rms_norm_fused (line 149) | def rms_norm_fused(x_in, w, eps, use_fast_ln=False): function cast_if_needed (line 157) | def cast_if_needed(x, dtype): function fusion_rms_norm (line 164) | def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln... class LMHeadFunction (line 187) | class LMHeadFunction(paddle.autograd.PyLayer): method forward (line 189) | def forward(ctx, x, weight, transpose_y): method backward (line 196) | def backward(ctx, dout): function parallel_matmul (line 226) | def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_para... class DeepseekV2MLP (line 256) | class DeepseekV2MLP(nn.Layer): method __init__ (line 257) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, int... method forward (line 303) | def forward(self, x): class MoEGate (line 309) | class MoEGate(PretrainedMoEGate): method __init__ (line 310) | def __init__( method forward (line 356) | def forward(self, hidden_states): class DeepseekV2MoE (line 408) | class DeepseekV2MoE(MoELayer): method __init__ (line 413) | def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, nor... method fp8_quant_weight (line 492) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None): method forward (line 555) | def forward(self, hidden_states): method post_process (line 580) | def post_process(self, hidden_states, final_hidden_states, l_aux): class DeepseekV2RotaryEmbedding (line 591) | class DeepseekV2RotaryEmbedding(nn.Layer): method __init__ (line 592) | def __init__(self, dim, max_position_embeddings=2048, base=10000): method _set_cos_sin_cache (line 607) | def _set_cos_sin_cache(self, seq_len): method forward (line 620) | def forward(self, x, seq_len=None): class DeepseekV2Attention (line 633) | class DeepseekV2Attention(nn.Layer): method __init__ (line 636) | def __init__(self, config: DeepseekV2FastConfig, layerwise_recompute: ... method fp8_quant_weight (line 746) | def fp8_quant_weight(self, quant_transpose=None): method _init_rope (line 753) | def _init_rope(self): method _shape (line 785) | def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): method forward (line 788) | def forward( class DeepseekV2DecoderLayer (line 933) | class DeepseekV2DecoderLayer(nn.Layer): method __init__ (line 934) | def __init__( method fp8_quant_weight (line 975) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=None): method forward (line 984) | def forward( method self_attn_compute (line 1082) | def self_attn_compute(self, hidden_states, **kwargs): method pre_dispatch_compute (line 1132) | def pre_dispatch_compute(self, hidden_states): method expert_forward_compute (line 1139) | def expert_forward_compute(self, intermediate_hidden_states, dispatche... method post_combine_compute (line 1152) | def post_combine_compute(self, residual, hidden_states, final_hidden_s... class DeepseekV2MTPLayer (line 1167) | class DeepseekV2MTPLayer(DeepseekV2DecoderLayer): method __init__ (line 1168) | def __init__( method forward (line 1180) | def forward( class DeepseekV2PretrainedModelFast (line 1217) | class DeepseekV2PretrainedModelFast(PretrainedModel): method _get_model_flops (line 1222) | def _get_model_flops(self, batch_size=1, seq_length=None, **kwargs): method _get_hardware_flops (line 1235) | def _get_hardware_flops(self, *args, **kwargs): method _get_name_mappings (line 1239) | def _get_name_mappings(cls, config: DeepseekV2FastConfig) -> list[Stat... method _get_tensor_parallel_mappings (line 1299) | def _get_tensor_parallel_mappings(cls, config: DeepseekV2FastConfig, i... method _init_weights (line 1399) | def _init_weights(self, layer): method step_flex_token (line 1460) | def step_flex_token(self, cur_step): class DeepseekV2ModelFast (line 1465) | class DeepseekV2ModelFast(DeepseekV2PretrainedModelFast): method __init__ (line 1473) | def __init__(self, config: DeepseekV2FastConfig): method get_input_embeddings (line 1503) | def get_input_embeddings(self): method set_input_embeddings (line 1506) | def set_input_embeddings(self, value): method _prepare_decoder_attention_mask (line 1510) | def _prepare_decoder_attention_mask(attention_mask, input_shape, past_... method recompute_training_full (line 1545) | def recompute_training_full( method forward (line 1576) | def forward( class DeepseekV2PretrainingCriterionFast (line 1787) | class DeepseekV2PretrainingCriterionFast(nn.Layer): method __init__ (line 1793) | def __init__(self, config: DeepseekV2FastConfig): method forward (line 1804) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non... function yarn_find_correction_dim (line 1854) | def yarn_find_correction_dim(num_rotations, dim, base=10000, max_positio... function yarn_find_correction_range (line 1859) | def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_p... function yarn_linear_ramp_mask (line 1865) | def yarn_linear_ramp_mask(min, max, dim): class DeepseekV2YarnRotaryEmbedding (line 1874) | class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding): method __init__ (line 1875) | def __init__( method _set_cos_sin_cache (line 1895) | def _set_cos_sin_cache(self, seq_len): class RmsNormFunction (line 1929) | class RmsNormFunction(paddle.autograd.PyLayer): method forward (line 1931) | def forward(ctx, x, scale, epsilon): method backward (line 1940) | def backward(ctx, grad_output): class DeepseekV2RMSNorm (line 1954) | class DeepseekV2RMSNorm(nn.Layer): method __init__ (line 1955) | def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps... method forward (line 1979) | def forward(self, hidden_states): method extra_repr (line 1992) | def extra_repr(self): function apply_rotary_pos_emb (line 1996) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion... class FusedNormGateFunc (line 2050) | class FusedNormGateFunc(paddle.autograd.PyLayer): method set_temporary_vars (line 2057) | def set_temporary_vars(cls, norm_output, invar): method clear_temporary_vars (line 2062) | def clear_temporary_vars(cls): method forward (line 2067) | def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps): method backward (line 2077) | def backward(ctx, d_gate_logits, d_norm_output): class TemporaryVarContext (line 2101) | class TemporaryVarContext: method __init__ (line 2102) | def __init__(self, norm_output, invar): method __enter__ (line 2106) | def __enter__(self): method __exit__ (line 2109) | def __exit__(self, exc_type, exc_val, exc_tb): function balance_expert_assignment (line 2113) | def balance_expert_assignment(n, m, k): class FakeGate (line 2124) | class FakeGate(paddle.autograd.PyLayer): method forward (line 2126) | def forward(ctx, hidden_states, weight, fakse_gate_restrict_balance=Fa... method backward (line 2142) | def backward(ctx, grad_output): class AddAuxiliaryLoss (line 2146) | class AddAuxiliaryLoss(paddle.autograd.PyLayer): method forward (line 2153) | def forward(ctx, x, loss): method backward (line 2159) | def backward(ctx, grad_output): function qkv_pre_process_no_fuse (line 2167) | def qkv_pre_process_no_fuse( function rearrange_kv (line 2204) | def rearrange_kv(kv, k_pe, qk_nope_head_dim, num_heads): function enable_to_static (line 2215) | def enable_to_static(value): function qkv_pre_process (line 2224) | def qkv_pre_process( function manul_fwd (line 2267) | def manul_fwd( class MemroyRecomputeAttnFunc (line 2319) | class MemroyRecomputeAttnFunc(paddle.autograd.PyLayer): method forward (line 2321) | def forward( method backward (line 2516) | def backward(ctx, dout): class MemroyRecomputeAttn (line 2829) | class MemroyRecomputeAttn(paddle.nn.Layer): method __init__ (line 2830) | def __init__( method fp8_quant_weight (line 2908) | def fp8_quant_weight(self, quant_transpose=None): method forward (line 2912) | def forward(self, q_init, kv_init, position_ids): class FusedRMSLinearFunc (line 2942) | class FusedRMSLinearFunc(paddle.autograd.PyLayer): method forward (line 2944) | def forward(ctx, x, rms_norm_weight, q_down_weight, kv_down_weight, eps): method backward (line 2965) | def backward(ctx, d_q, d_kv): class FusedRMSLinear (line 3015) | class FusedRMSLinear(paddle.nn.Layer): method __init__ (line 3016) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None: method fp8_quant_weight (line 3040) | def fp8_quant_weight(self, quant_transpose=None): method forward (line 3043) | def forward(self, x): class FusedRMSLinearSingleFunc (line 3048) | class FusedRMSLinearSingleFunc(paddle.autograd.PyLayer): method forward (line 3050) | def forward(ctx, x, rms_norm_weight, linear_weight, eps): method backward (line 3059) | def backward(ctx, d_q, d_kv): class FusedRMSLinearSingle (line 3070) | class FusedRMSLinearSingle(paddle.nn.Layer): method __init__ (line 3071) | def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None: method forward (line 3088) | def forward(self, x): class FastCrossEntropyFunction (line 3093) | class FastCrossEntropyFunction(paddle.autograd.PyLayer): method forward (line 3095) | def forward(ctx, preds, labels): method backward (line 3103) | def backward(ctx, dout): class DeepseekV2LMHead (line 3113) | class DeepseekV2LMHead(nn.Layer): method __init__ (line 3114) | def __init__(self, config: DeepseekV2FastConfig, embedding_weight=None): method forward (line 3150) | def forward(self, hidden_states, tensor_parallel_output=None): method extra_repr (line 3172) | def extra_repr(self): FILE: paddleformers/cli/train/deepseek_v3_pretrain/modeling_pp.py function check_accept_none_grad (line 82) | def check_accept_none_grad(): function parse_args (line 98) | def parse_args(args): function return_args (line 128) | def return_args(hidden_states, attention_mask=None, attn_mask_startend_r... function get_attr (line 143) | def get_attr(layer, name): function calc_stream_wait (line 150) | def calc_stream_wait(group_id): class TensorMeta (line 155) | class TensorMeta: method __init__ (line 158) | def __init__(self, tensor): class PostProcessNode (line 163) | class PostProcessNode(ScheduleNode): method __init__ (line 164) | def __init__( method forward_without_residual (line 188) | def forward_without_residual(self, inputs): method forward (line 232) | def forward(self, inputs): method backward (line 278) | def backward(self, output_grad): class DecoderLayerNode (line 334) | class DecoderLayerNode(ScheduleNode): method __init__ (line 335) | def __init__( method dispatch_forward (line 364) | def dispatch_forward(self, inputs, previous_event=None, allocate_on_co... method combine_forward (line 410) | def combine_forward(self, inputs, previous_event=None): method dispatch_backward (line 426) | def dispatch_backward(self, output_grad): method combine_backward (line 465) | def combine_backward(self, output_grad): method forward (line 492) | def forward(self, inputs): method backward (line 512) | def backward(self, output_grad=None, scaler=None): class OverlapedScheduleChunk (line 535) | class OverlapedScheduleChunk: method __init__ (line 536) | def __init__(self, forward_nodes, backward_nodes, use_fuion=True): method forward_backward (line 547) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa... class DecoderBackwardScheduleChunk (line 560) | class DecoderBackwardScheduleChunk: method __init__ (line 561) | def __init__(self, nodes): method backward (line 564) | def backward(self, output_grad, combine_bw_event_to_wait=None, pp_stre... class OverlapedScheduleNode (line 574) | class OverlapedScheduleNode: method __init__ (line 575) | def __init__(self, forward_node, backward_node, name=""): method forward_backward (line 581) | def forward_backward(self, inputs, output_grad, event_to_wait=None): class FusionFp8DecoderLayerNode (line 609) | class FusionFp8DecoderLayerNode(ScheduleNode): method __init__ (line 610) | def __init__( method attn_forward (line 634) | def attn_forward(self, inputs): method dispatch_forward (line 670) | def dispatch_forward(self, inputs, previous_event=None, async_finish=F... method mlp_forward (line 698) | def mlp_forward(self, inputs): method combine_forward (line 737) | def combine_forward(self, inputs, async_finish=False, previous_event=N... method post_process_forward (line 763) | def post_process_forward(self, inputs, with_residual=True): method post_process_backward (line 786) | def post_process_backward(self, output_grad, event_to_wait=None): method combine_backward (line 820) | def combine_backward(self, output_grad, previous_event=None, async_fin... method mlp_backward (line 878) | def mlp_backward(self, output_grad): method dispatch_backward (line 910) | def dispatch_backward(self, output_grad, async_finish=False, previous_... method attn_backward (line 959) | def attn_backward(self, output_grad): method backward_for_fusion (line 1017) | def backward_for_fusion(self, output_grad, combine_bw_event_to_wait=No... method forward (line 1088) | def forward(self, inputs): method backward (line 1098) | def backward(self, output_grad=None, scaler=None): class DenseDecoderLayerNode (line 1109) | class DenseDecoderLayerNode(ScheduleNode): method __init__ (line 1110) | def __init__( method forward (line 1120) | def forward(self, inputs): method backward (line 1125) | def backward(self, output_grad=None, scaler=None): class OverlapedFUsionScheduleNode (line 1132) | class OverlapedFUsionScheduleNode: method __init__ (line 1133) | def __init__(self, forward_node, backward_node, name=""): method forward_backward (line 1141) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa... class OverlapedDenseFusionScheduleNode (line 1277) | class OverlapedDenseFusionScheduleNode: method __init__ (line 1278) | def __init__(self, forward_node, backward_node, name=""): method forward_backward (line 1287) | def forward_backward(self, inputs, output_grad, combine_bw_event_to_wa... function build_overlapped_nodes (line 1373) | def build_overlapped_nodes(config: DeepseekV2FastConfig, forward_chunk, ... class EmbeddingFunction (line 1437) | class EmbeddingFunction(paddle.autograd.PyLayer): method forward (line 1439) | def forward(ctx, x, weight): method backward (line 1448) | def backward(ctx, dout): class DeepseekV2EmbeddingPipe (line 1459) | class DeepseekV2EmbeddingPipe(nn.Layer): method __init__ (line 1460) | def __init__(self, config: DeepseekV2FastConfig): method embedding_weight (line 1475) | def embedding_weight(self): method forward (line 1478) | def forward(self, args): method build_schedule_node (line 1558) | def build_schedule_node(self): class DeepseekV2DecoderLayerPipe (line 1562) | class DeepseekV2DecoderLayerPipe(DeepseekV2DecoderLayer): method forward (line 1563) | def forward(self, args): method attn_compute (line 1622) | def attn_compute(self, args): method attn_compute_for_fusion (line 1657) | def attn_compute_for_fusion(self, args): method mlp_compute (line 1695) | def mlp_compute(self, inputs): method post_process_compute (line 1742) | def post_process_compute(self, inputs): method post_process_compute_for_fusion (line 1778) | def post_process_compute_for_fusion(self, inputs): method attn_compute_dense (line 1803) | def attn_compute_dense(self, args): method mlp_compute_dense (line 1821) | def mlp_compute_dense(self, inputs): method build_schedule_node (line 1835) | def build_schedule_node(self): class DeepseekV2MTPLayerPipe (line 1901) | class DeepseekV2MTPLayerPipe(DeepseekV2MTPLayer): method forward (line 1902) | def forward(self, args): method attn_compute_for_fusion (line 1970) | def attn_compute_for_fusion(self, args): method build_schedule_node (line 2017) | def build_schedule_node(self): class DeepseekV2RMSNormPipe (line 2036) | class DeepseekV2RMSNormPipe(nn.Layer): method __init__ (line 2037) | def __init__(self, config): method forward (line 2042) | def forward(self, args): method build_schedule_node (line 2057) | def build_schedule_node(self): class DeepseekV2LMHeadPipe (line 2061) | class DeepseekV2LMHeadPipe(DeepseekV2LMHead): method __init__ (line 2062) | def __init__(self, config, embedding_weight=None): method embedding_weight (line 2066) | def embedding_weight(self): method forward (line 2069) | def forward(self, args: Union[Tuple, paddle.Tensor]): method build_schedule_node (line 2079) | def build_schedule_node(self): class DeepseekV2PretrainingCriterionPipe (line 2083) | class DeepseekV2PretrainingCriterionPipe(DeepseekV2PretrainingCriterionF... method forward (line 2084) | def forward(self, logits, labels): method build_schedule_node (line 2095) | def build_schedule_node(self): class DeepseekV2ForCausalLMPipe (line 2099) | class DeepseekV2ForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): method step_flex_token (line 2118) | def step_flex_token(self, cur_step): method _prepare_pipeline_inputs_func (line 2122) | def _prepare_pipeline_inputs_func(cls, inputs): method __init__ (line 2145) | def __init__(self, config: DeepseekV2FastConfig): method fp8_quant_weight (line 2310) | def fp8_quant_weight(self, batch_mode=False, quant_transpose=True): method get_loss_fn (line 2323) | def get_loss_fn(self, config): method overlapped_forward_backward (line 2326) | def overlapped_forward_backward( FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_gate.py class PretrainedMoEGate (line 29) | class PretrainedMoEGate(nn.Layer, MoEGateMixin): method __init__ (line 30) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs): method _priority (line 69) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle.... method _topk_greedy (line 91) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle.... method _topk_group_limited_greedy (line 106) | def _topk_group_limited_greedy( method _topk_noaux_tc (line 138) | def _topk_noaux_tc( method top1gating (line 175) | def top1gating( method top2gating (line 245) | def top2gating( method _cal_seq_aux_loss (line 321) | def _cal_seq_aux_loss(self, gates, top_k, topk_idx) -> paddle.Tensor: method topkgating (line 359) | def topkgating( method topkgating_nodrop (line 438) | def topkgating_nodrop(self, gates: paddle.Tensor): FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_layer.py function record_stream_for_multi_input (line 59) | def record_stream_for_multi_input(x): function stop_gradient_for_multi_input (line 67) | def stop_gradient_for_multi_input(x): class MoELayer (line 74) | class MoELayer(nn.Layer): method __init__ (line 75) | def __init__( method update_flex_token (line 150) | def update_flex_token(self): method _parse_moe_expert_parallel (line 166) | def _parse_moe_expert_parallel(self, n_routed_experts, expert_model_pa... method _post_init (line 176) | def _post_init(self): method forward (line 187) | def forward( method forward_drop_token (line 208) | def forward_drop_token( method expert_forward (line 327) | def expert_forward(self, dispatched_input): method forward_flex_token (line 338) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,... method get_tokens_per_expert (line 381) | def get_tokens_per_expert(self): method set_tokens_per_expert (line 384) | def set_tokens_per_expert(self, tokens_per_expert_list): method pre_dispatch_compute (line 387) | def pre_dispatch_compute(self, hidden_states): method post_dispatch_compute (line 395) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis... method pre_combine_compute (line 401) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p... method post_combine_compute (line 407) | def post_combine_compute(self, hidden_states): class MoEFlexTokenLayer (line 412) | class MoEFlexTokenLayer(nn.Layer): method __init__ (line 413) | def __init__(self, config, n_routed_experts, expert_class, expert_kwar... method expert_forward (line 429) | def expert_forward(self, dispatched_input, tokens_per_expert): method forward (line 441) | def forward(self, hidden_states: paddle.Tensor): method forward_flex_token (line 452) | def forward_flex_token(self, hidden_states: paddle.Tensor, probs=None,... method get_tokens_per_expert (line 495) | def get_tokens_per_expert(self): method set_tokens_per_expert (line 498) | def set_tokens_per_expert(self, tokens_per_expert_list): method pre_dispatch_compute (line 501) | def pre_dispatch_compute(self, hidden_states): method post_dispatch_compute (line 509) | def post_dispatch_compute(self, hidden_states, dispatched_indices, dis... method pre_combine_compute (line 515) | def pre_combine_compute(self, hidden_states, token_permuted_indices, p... method post_combine_compute (line 521) | def post_combine_compute(self, hidden_states): class Fp8DispatchQuantNode (line 526) | class Fp8DispatchQuantNode: method __init__ (line 527) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, name="fp8_... method forward (line 534) | def forward(self, hidden_states, probs, routing_map): method backward (line 562) | def backward(self, hs_grad, token_probs_grad): class Fp8DispatchNode (line 574) | class Fp8DispatchNode: method __init__ (line 575) | def __init__(self, token_dispatcher, name="fp8_dispatch_node"): method forward (line 581) | def forward( method backward (line 610) | def backward( class Fp8CombineNode (line 629) | class Fp8CombineNode: method __init__ (line 630) | def __init__(self, token_dispatcher, name="fp8_combine_node"): method forward (line 636) | def forward(self, hidden_states_out, previous_event=None, async_finish... method backward (line 651) | def backward(self, output_combine_grad, previous_event=None, async_fin... class Fp8CombineQuantNode (line 662) | class Fp8CombineQuantNode: method __init__ (line 663) | def __init__(self, token_dispatcher, dsv3_use_fp8_dispatch, moe_group=... method forward (line 670) | def forward(self, output_combine): method backward (line 679) | def backward(self, output_grad, event_to_wait=None): class FusionMlpNode (line 705) | class FusionMlpNode: method __init__ (line 710) | def __init__( method set_recompute_fwd_gate_up (line 746) | def set_recompute_fwd_gate_up(self, recompute_fwd_gate_up): method reset_statue (line 749) | def reset_statue(self): method prepare_env_subbatch (line 774) | def prepare_env_subbatch(self, unzipped_tokens=None, unzipped_tokens_s... method gemm_forward_subbatch (line 791) | def gemm_forward_subbatch( method gemm_backward_subbatch (line 826) | def gemm_backward_subbatch( method forward (line 887) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro... method backward (line 1025) | def backward(self, hidden_states_out_grad): class FusionMoeNode (line 1128) | class FusionMoeNode: method __init__ (line 1129) | def __init__( method forward (line 1162) | def forward(self, hidden_states, probs, routing_map): method backward (line 1190) | def backward(self, output_grad): class FusionMoe (line 1205) | class FusionMoe(paddle.autograd.PyLayer): method forward (line 1207) | def forward( method backward (line 1226) | def backward(ctx, output_grad): FILE: paddleformers/cli/train/deepseek_v3_pretrain/moe_utils.py function _clear_to_zero_allocation (line 29) | def _clear_to_zero_allocation(self): function _holder_size (line 45) | def _holder_size(self): function topk_to_permuted_indices (line 57) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk): function permute_fast (line 69) | def permute_fast( function unpermute_fast (line 90) | def unpermute_fast( class UnZipNode (line 132) | class UnZipNode: method __init__ (line 133) | def __init__(self, name="unzip"): method reset_statue (line 138) | def reset_statue(self): method forward (line 143) | def forward( method backward (line 189) | def backward(self, dx, total_zipped_tokens, probs_grad, dispatched_ind... class ZipNode (line 203) | class ZipNode: method __init__ (line 204) | def __init__(self, name="zip"): method forward (line 208) | def forward( method backward (line 218) | def backward( class PermuteNode (line 264) | class PermuteNode: method __init__ (line 265) | def __init__(self, token_dispatcher, name="permute"): method reset_status (line 269) | def reset_status(self): method forward (line 273) | def forward(self, hidden_states, hidden_states_scale, dispatched_indic... method backward (line 287) | def backward(self, out_grad, dispatched_probs): class UnPermuteNode (line 300) | class UnPermuteNode: method __init__ (line 301) | def __init__(self, token_dispatcher, name="unpermute"): method reset_status (line 305) | def reset_status(self): method forward (line 314) | def forward( method backward (line 352) | def backward(self, out_grad, out_grad_scale): function tokens_zip_unique_add_with_subbatch (line 383) | def tokens_zip_unique_add_with_subbatch(zipped, unzipped, index_unzipped... function merge_subbatch_cast (line 409) | def merge_subbatch_cast(x, dtype): function get_env_device (line 420) | def get_env_device(): FILE: paddleformers/cli/train/deepseek_v3_pretrain/token_dispatcher.py class _DeepepManager (line 31) | class _DeepepManager(_DispatchManager): method __init__ (line 53) | def __init__( method setup_metadata (line 74) | def setup_metadata(self, routing_map: paddle.Tensor, probs: paddle.Ten... method dispatch (line 82) | def dispatch( method _indices_to_multihot (line 94) | def _indices_to_multihot(self, indices, probs): method get_dispatched_metadata (line 119) | def get_dispatched_metadata(self) -> paddle.Tensor: method get_number_of_tokens_per_expert (line 122) | def get_number_of_tokens_per_expert(self) -> paddle.Tensor: method combine (line 128) | def combine(self, hidden_states: paddle.Tensor) -> paddle.Tensor: method get_permuted_hidden_states_by_experts (line 134) | def get_permuted_hidden_states_by_experts(self, hidden_states: paddle.... method get_permuted_hidden_states_by_experts_fast (line 146) | def get_permuted_hidden_states_by_experts_fast( method get_restored_hidden_states_by_experts (line 156) | def get_restored_hidden_states_by_experts(self, hidden_states: paddle.... method get_restored_hidden_states_by_experts_fast (line 168) | def get_restored_hidden_states_by_experts_fast( class MoETokenDispatcher (line 187) | class MoETokenDispatcher: method __init__ (line 192) | def __init__(self, ep_group) -> None: method ep_group (line 199) | def ep_group(self): method ep_size (line 204) | def ep_size(self): method token_permutation (line 209) | def token_permutation(self, tokens: paddle.Tensor, probs: paddle.Tenso... method token_unpermutation (line 223) | def token_unpermutation(self, expert_output: paddle.Tensor, bias: padd... class MoEFlexTokenDispatcher (line 236) | class MoEFlexTokenDispatcher(MoETokenDispatcher): method __init__ (line 241) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m... method token_permutation (line 253) | def token_permutation( method token_unpermutation (line 266) | def token_unpermutation( class MoEFlexTokenDispatcherFast (line 277) | class MoEFlexTokenDispatcherFast: method __init__ (line 282) | def __init__(self, num_local_experts: int, moe_router_topk: int, num_m... method ep_group (line 295) | def ep_group(self): method ep_size (line 300) | def ep_size(self): method pre_dispatch (line 304) | def pre_dispatch(self, hidden_states, probs, routing_map): method post_dispatch (line 314) | def post_dispatch(self, hidden_states, dispatched_indices): method pre_combine (line 322) | def pre_combine(self, hidden_states, token_permuted_indices, prob_perm... method post_combine (line 328) | def post_combine(self, hidden_states): method token_permutation (line 332) | def token_permutation( method token_unpermutation (line 350) | def token_unpermutation( class PreDispatchNode (line 368) | class PreDispatchNode: method __init__ (line 369) | def __init__(self, token_dispatcher): method reset_status (line 373) | def reset_status(self): method forward (line 379) | def forward(self, routing_map, probs): method backward (line 394) | def backward(self, token_probs_g): FILE: paddleformers/cli/train/deepseek_v3_pretrain/utils/convert_ckpt_to_sft.py function paddle_name_to_hf_names (line 45) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]: function _handle_expert_weights (line 134) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s... function _handle_shared_expert_weights (line 149) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional... function _handle_mlp_weights (line 162) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]: function _is_need_transpose (line 172) | def _is_need_transpose(key): function prepare_tensor (line 191) | def prepare_tensor(key, value): function load_pretrained_ckpt (line 218) | def load_pretrained_ckpt(ckpt_path, output_path): FILE: paddleformers/cli/train/deepseek_v3_pretrain/utils/load_hf_ckpt.py function paddle_name_to_hf_names_ds_v2 (line 57) | def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]: function paddle_name_to_hf_names (line 132) | def paddle_name_to_hf_names(paddle_name: str) -> List[str]: function _get_hf_prefix (line 200) | def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str: function _handle_expert_weights (line 210) | def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[s... function _handle_shared_expert_weights (line 225) | def _handle_shared_expert_weights(hf_prefix: str, rest: str) -> Optional... function _handle_mlp_weights (line 238) | def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]: function prepare_tensor (line 248) | def prepare_tensor(tensor, dst_shape, *, force_transpose=False): function load_huggingface_ckpt (line 278) | def load_huggingface_ckpt(model, huggingface_ckpt_path): FILE: paddleformers/cli/train/deepseek_v3_pretrain/workflow.py class PreTrainingArguments (line 64) | class PreTrainingArguments(TrainingArguments): method __post_init__ (line 91) | def __post_init__(self): class DataArguments (line 111) | class DataArguments: class ModelArguments (line 144) | class ModelArguments: function create_pretrained_dataset (line 175) | def create_pretrained_dataset( function get_train_data_file (line 249) | def get_train_data_file(args): class PretrainingTrainer (line 274) | class PretrainingTrainer(Trainer): method __init__ (line 275) | def __init__(self, *args, **kwargs): method evaluate (line 279) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre... method _get_eval_sampler (line 319) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]: method _get_train_sampler (line 329) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: function run_dsv3_pretrain (line 340) | def run_dsv3_pretrain(model_args, data_args, generating_args, training_a... FILE: paddleformers/cli/train/dpo/data_config.py class DataConfig (line 21) | class DataConfig: FILE: paddleformers/cli/train/dpo/dpo_argument.py class DPOTrainingArguments (line 30) | class DPOTrainingArguments(TrainingArguments): method __post_init__ (line 59) | def __post_init__(self): class DPOConfig (line 93) | class DPOConfig: class DPODataArgument (line 112) | class DPODataArgument(DataConfig): class DPOModelArgument (line 122) | class DPOModelArgument: FILE: paddleformers/cli/train/dpo/dpo_estimate_training.py function calculate_acc_steps (line 31) | def calculate_acc_steps(num_samples, train_batch, dataset_world_size, pe... function dpo_estimate_training (line 59) | def dpo_estimate_training(tokenizer, data_args, training_args, dataset_c... FILE: paddleformers/cli/train/dpo/dpo_trainer.py function disable_dropout_in_model (line 44) | def disable_dropout_in_model(model: paddle.nn.Layer) -> None: class DPOTrainer (line 51) | class DPOTrainer(Trainer): method __init__ (line 56) | def __init__( method get_batch_metrics (line 124) | def get_batch_metrics(self, ref_model, model, batch, train_eval="train"): method compute_loss (line 193) | def compute_loss(self, model, inputs): method _wrap_ref_model (line 206) | def _wrap_ref_model(self, model): method _wrap_model (line 227) | def _wrap_model(self, model, training=True): method evaluate (line 245) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre... method prediction_step (line 252) | def prediction_step(self, model, inputs, prediction_loss_only=False, i... method store_metrics (line 278) | def store_metrics(self, metrics, train_eval="train"): method log (line 283) | def log(self, logs, **kwargs): method fleet_prediction_pipeline_step (line 301) | def fleet_prediction_pipeline_step( method prediction_pipeline_step (line 407) | def prediction_pipeline_step( method log_metric (line 492) | def log_metric( method training_pipeline_step (line 542) | def training_pipeline_step(self, model, inputs): method disable_lora (line 624) | def disable_lora(self, model): method enable_lora (line 630) | def enable_lora(self, model): method reset_dpo_infohub (line 636) | def reset_dpo_infohub(self): method broadcast_last_stage_infohub_tensor (line 641) | def broadcast_last_stage_infohub_tensor(self): function prepare_pipeline_dpo_inputs_func (line 688) | def prepare_pipeline_dpo_inputs_func(inputs): function _prepare_pipeline_dpo_inputs_func_fleet (line 732) | def _prepare_pipeline_dpo_inputs_func_fleet(inputs): function fleet_merge_dpo_labels (line 771) | def fleet_merge_dpo_labels(labels, logprobs): FILE: paddleformers/cli/train/dpo/workflow.py function run_dpo (line 65) | def run_dpo( FILE: paddleformers/cli/train/ernie_pretrain/model_config.py class ModelConfig (line 22) | class ModelConfig: FILE: paddleformers/cli/train/ernie_pretrain/models/comm_utils.py function scatter (line 33) | def scatter(input, group=None, axis=0): function mp_slice (line 51) | def mp_slice(x, indices=None, group=None, axis=0): function all_gather_varlen (line 68) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True): function scatter_varlen (line 90) | def scatter_varlen(x, recv_tensor, indices, src_rank, group, sync_op=True): function all_gather (line 112) | def all_gather(input, group=None, axis=0): function reduce_scatter (line 131) | def reduce_scatter(input, group=None): function subbatch (line 148) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar... function gather_varlen (line 193) | def gather_varlen(input, dst, group, offload_pp_data_chunk_size=0, all_s... function profile (line 293) | def profile(name, use_event=True): FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py class ErnieMoEConfig (line 60) | class ErnieMoEConfig(PretrainedConfig): method __init__ (line 72) | def __init__( method __setattr__ (line 398) | def __setattr__(self, name: str, value): method register_nonsaveable_keys (line 409) | def register_nonsaveable_keys(self, keys): method use_moe (line 418) | def use_moe(self) -> bool: method to_json_string (line 421) | def to_json_string(self, use_diff: bool = True) -> str: FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling.py function get_triangle_upper_mask (line 131) | def get_triangle_upper_mask(x, mask=None): function gqa_qkv_split_func (line 143) | def gqa_qkv_split_func( function gqa_qkv_merge_func (line 173) | def gqa_qkv_merge_func(weight_list, num_attention_heads, num_key_value_h... function parallel_matmul (line 194) | def parallel_matmul( function calc_lm_head_logits (line 235) | def calc_lm_head_logits(config, hidden_states, weight, bias, tensor_para... function finfo (line 265) | def finfo(dtype: paddle.dtype = None): function masked_fill (line 283) | def masked_fill(x, mask, value): function mem_eff_attn (line 288) | def mem_eff_attn(query, key, value, pack_offset, drop_prob=0.0, dtype=pa... function inbatch_pack_offset_to_attn_mask_start_row_indices (line 325) | def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offs... function scaled_dot_product_attention (line 340) | def scaled_dot_product_attention( function _make_causal_mask (line 458) | def _make_causal_mask(input_ids_shape, past_key_values_length, dtype): function _expand_mask (line 472) | def _expand_mask(mask, dtype, tgt_length): class FusedDropoutImpl (line 487) | class FusedDropoutImpl(nn.Layer): method __init__ (line 488) | def __init__(self, prob, mode): method forward (line 495) | def forward(self, x, y): class RMSNorm (line 503) | class RMSNorm(nn.Layer): method __init__ (line 504) | def __init__(self, config): method forward (line 518) | def forward(self, hidden_states): class RotaryEmbedding (line 534) | class RotaryEmbedding(nn.Layer): method __init__ (line 535) | def __init__(self, dim, max_position_embeddings=4096, base=10000): method forward (line 551) | def forward(self, x, seq_len=None): method rotate_half (line 559) | def rotate_half(cls, x): method apply_rotary_pos_emb (line 566) | def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, positio... class RopeEmbeddingLegacy (line 584) | class RopeEmbeddingLegacy(nn.Layer): method __init__ (line 585) | def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_a... method forward (line 592) | def forward(self, seq_length, position_ids=None): method apply_rotary (line 608) | def apply_rotary(self, rp, q, k): method apply_rotary_3d (line 630) | def apply_rotary_3d(self, rp, q, k, position_ids): method forward_single (line 698) | def forward_single(self, position_ids): method apply_rotary_single (line 713) | def apply_rotary_single(x, rope_emb): class ErnieMLP (line 721) | class ErnieMLP(nn.Layer): method __init__ (line 722) | def __init__(self, config): method forward (line 802) | def forward(self, x): class ErnieAttention (line 839) | class ErnieAttention(nn.Layer): method __init__ (line 840) | def __init__(self, config, layer_idx=0): method forward (line 1001) | def forward( method rope_attn (line 1099) | def rope_attn( class ErnieDecoderLayer (line 1199) | class ErnieDecoderLayer(nn.Layer): method __init__ (line 1200) | def __init__(self, config, layer_idx=0): method forward (line 1214) | def forward( class ErniePretrainedModel (line 1269) | class ErniePretrainedModel(PretrainedModel): method _get_name_mappings (line 1274) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa... method _get_tensor_parallel_mappings (line 1345) | def _get_tensor_parallel_mappings(cls, config, is_split=True): method _init_weights (line 1430) | def _init_weights(self, layer): class ErnieModel (line 1478) | class ErnieModel(ErniePretrainedModel): method __init__ (line 1479) | def __init__(self, config: ErnieMoEConfig): method get_input_embeddings (line 1506) | def get_input_embeddings(self): method set_input_embeddings (line 1509) | def set_input_embeddings(self, value): method _prepare_decoder_attention_mask (line 1513) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ... method recompute_training (line 1532) | def recompute_training( method forward (line 1562) | def forward( class FusedHeadParallelCrossEntropy (line 1696) | class FusedHeadParallelCrossEntropy(PyLayer): method forward (line 1698) | def forward( method backward (line 1816) | def backward(ctx, loss_all_grad, labels_all_grad): class ErniePretrainingCriterion (line 1934) | class ErniePretrainingCriterion(paddle.nn.Layer): method __init__ (line 1935) | def __init__(self, config, return_tuple=True): method forward (line 1950) | def forward(self, prediction_scores, masked_lm_labels): method forward_impl_with_fused_head_loss_fn (line 2006) | def forward_impl_with_fused_head_loss_fn(self, masked_lm_labels, hidde... method forward_impl_with_calc_logits (line 2041) | def forward_impl_with_calc_logits(self, masked_lm_labels, hidden_state... method loss_impl (line 2053) | def loss_impl(self, prediction_scores, masked_lm_labels): method forward_impl (line 2059) | def forward_impl(self, prediction_scores, masked_lm_labels): class ErnieLMHead (line 2114) | class ErnieLMHead(nn.Layer): method __init__ (line 2115) | def __init__(self, config): method forward (line 2154) | def forward(self, hidden_states, tensor_parallel_output=None): method sharded_state_dict (line 2173) | def sharded_state_dict( class ErnieForCausalLM (line 2182) | class ErnieForCausalLM(ErniePretrainedModel): method __init__ (line 2185) | def __init__(self, config): method _post_init (line 2217) | def _post_init(self, original_init, *args, **kwargs): method get_input_embeddings (line 2226) | def get_input_embeddings(self): method set_input_embeddings (line 2229) | def set_input_embeddings(self, value): method get_output_embeddings (line 2232) | def get_output_embeddings(self): method set_output_embeddings (line 2235) | def set_output_embeddings(self, new_embeddings): method set_decoder (line 2238) | def set_decoder(self, decoder): method get_decoder (line 2241) | def get_decoder(self): method prepare_attention_mask_for_generation (line 2245) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos... method prepare_inputs_for_generation (line 2258) | def prepare_inputs_for_generation( method update_model_kwargs_for_generation (line 2289) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is... method forward (line 2327) | def forward( method sharded_state_dict (line 2386) | def sharded_state_dict(self, *args, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling_moe.py class BaseModelOutputWithPastAndCrossAttentions (line 109) | class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput): class CausalLMOutputWithCrossAttentions (line 116) | class CausalLMOutputWithCrossAttentions(_CausalLMOutput): function get_gate (line 136) | def get_gate( function build_mpdp_group (line 190) | def build_mpdp_group(): function _parse_moe_group (line 211) | def _parse_moe_group( function moe_ep2mp (line 258) | def moe_ep2mp(state_dict: Dict[str, paddle.Tensor], config: ErnieMoEConf... function moe_statedict_cherry_pick (line 307) | def moe_statedict_cherry_pick(state_dict: Dict[str, paddle.Tensor], conf... function moe_statedict_upcycle (line 332) | def moe_statedict_upcycle( class ErnieMoeMLP (line 504) | class ErnieMoeMLP(ErnieMLP): method __init__ (line 505) | def __init__(self, config, is_shared_expert=False): method forward (line 517) | def forward(self, x, use_comm=True): class ErnieMoeDenseExpert (line 579) | class ErnieMoeDenseExpert(nn.Layer): method __init__ (line 580) | def __init__(self, config): method forward (line 628) | def forward(self, x): class BMMLinear (line 655) | class BMMLinear(nn.Layer): method __init__ (line 656) | def __init__(self, experts, d_in, d_out, use_bias=False): method forward (line 664) | def forward(self, x): class ErnieMoeMLPFused (line 670) | class ErnieMoeMLPFused(nn.Layer): method __init__ (line 671) | def __init__(self, config): method __len__ (line 689) | def __len__(self): method __iter__ (line 692) | def __iter__(self): method forward (line 695) | def forward(self, x): class FusedLinearAddNormFunc (line 705) | class FusedLinearAddNormFunc(paddle.autograd.PyLayer): method forward (line 707) | def forward(ctx, x, residual, linear_weight, rms_norm_weight, eps): method backward (line 717) | def backward(ctx, d_rms_norm_out, d_residual_out): class FusedLinearAddNorm (line 736) | class FusedLinearAddNorm(paddle.nn.Layer): method __init__ (line 737) | def __init__(self, hidden_size, eps=1e-6) -> None: method forward (line 755) | def forward(self, x, residual): class FusedRMSLinearFunc (line 759) | class FusedRMSLinearFunc(paddle.autograd.PyLayer): method forward (line 761) | def forward(ctx, x, rms_norm_weight, linear_weight, eps): method backward (line 769) | def backward(ctx, d_qkv): class FusedRMSLinear (line 779) | class FusedRMSLinear(paddle.nn.Layer): method __init__ (line 780) | def __init__(self, hidden_size, eps=1e-6, num_heads=1, num_key_value_h... method forward (line 799) | def forward(self, x): class ErnieMoEAttention (line 803) | class ErnieMoEAttention(ErnieAttention): method __init__ (line 804) | def __init__(self, config, layer_idx): method forward (line 827) | def forward( class FakeMoERouterLoss (line 947) | class FakeMoERouterLoss(PyLayer): method forward (line 949) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss): method backward (line 957) | def backward(ctx, out_grad): class ErnieDecoderLayer (line 966) | class ErnieDecoderLayer(nn.Layer): method __init__ (line 967) | def __init__(self, config, layer_idx): method training (line 1053) | def training(self): method training (line 1057) | def training(self, new): method fp8_quant_weight (line 1063) | def fp8_quant_weight(self): method _init_gate_and_experts (line 1068) | def _init_gate_and_experts(self, layer_idx): method _init_shared_experts (line 1119) | def _init_shared_experts(self): method _init_dense_experts (line 1137) | def _init_dense_experts(self, layer_idx): method forward (line 1160) | def forward( method model_parallel_dropout (line 1244) | def model_parallel_dropout(self): class ErniePretrainedModel (line 1251) | class ErniePretrainedModel(PretrainedModel): method _get_name_mappings (line 1256) | def _get_name_mappings(cls, config: ErnieMoEConfig) -> StateDictNameMa... method _get_tensor_parallel_mappings (line 1326) | def _get_tensor_parallel_mappings(cls, config, is_split=True): method _init_weights (line 1446) | def _init_weights(self, layer): class ErnieModel (line 1525) | class ErnieModel(ErniePretrainedModel): method __init__ (line 1526) | def __init__(self, config: ErnieMoEConfig): method get_input_embeddings (line 1591) | def get_input_embeddings(self): method set_input_embeddings (line 1594) | def set_input_embeddings(self, value): method _prepare_decoder_attention_mask (line 1598) | def _prepare_decoder_attention_mask(cls, attention_mask, input_shape, ... method recompute_training (line 1617) | def recompute_training( method forward (line 1672) | def forward( class ErniePretrainingCriterion (line 1905) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase): method __init__ (line 1906) | def __init__(self, config, return_tuple=True): method forward (line 1921) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non... class ErnieMoEForCausalLM (line 1974) | class ErnieMoEForCausalLM(ErniePretrainedModel): method __init__ (line 1977) | def __init__(self, config): method _post_init (line 2002) | def _post_init(self, original_init, *args, **kwargs): method set_state_dict (line 2034) | def set_state_dict(self, state_dict, *args, **kwargs): method get_input_embeddings (line 2052) | def get_input_embeddings(self): method set_input_embeddings (line 2055) | def set_input_embeddings(self, value): method get_output_embeddings (line 2058) | def get_output_embeddings(self): method set_output_embeddings (line 2061) | def set_output_embeddings(self, new_embeddings): method set_decoder (line 2064) | def set_decoder(self, decoder): method get_decoder (line 2067) | def get_decoder(self): method prepare_attention_mask_for_generation (line 2071) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos... method prepare_inputs_for_generation (line 2084) | def prepare_inputs_for_generation( method update_model_kwargs_for_generation (line 2116) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is... method forward (line 2155) | def forward( method sharded_state_dict (line 2224) | def sharded_state_dict(self, *args, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/models/ernie/modeling_pp.py class ErnieEmbeddingPipe (line 95) | class ErnieEmbeddingPipe(nn.Layer): method __init__ (line 96) | def __init__(self, config): method embedding_weight (line 112) | def embedding_weight(self): method forward (line 115) | def forward(self, args): class MTPEmbeddingPipe (line 209) | class MTPEmbeddingPipe(ErnieEmbeddingPipe): method __init__ (line 210) | def __init__(self, config): method embedding_weight (line 214) | def embedding_weight(self): method forward (line 217) | def forward(self, args): class EmptyLayer (line 236) | class EmptyLayer(nn.Layer): method __init__ (line 237) | def __init__(self): method forward (line 240) | def forward(self, x): class ErnieDecoderLayerPipe (line 244) | class ErnieDecoderLayerPipe(ErnieDecoderLayer): method __init__ (line 245) | def __init__(self, config, layer_idx, use_full_recompute=False): method forward (line 252) | def forward(self, args): class RMSNormPipe (line 356) | class RMSNormPipe(RMSNorm): method __init__ (line 357) | def __init__(self, config): method forward (line 362) | def forward(self, args): class ErnieMoELMHeadPipe (line 390) | class ErnieMoELMHeadPipe(ErnieMoELMHead): method forward (line 391) | def forward(self, args): class MTPLayer (line 402) | class MTPLayer(nn.Layer): method __init__ (line 403) | def __init__(self, config): method forward (line 437) | def forward(self, args): method forward_impl (line 446) | def forward_impl(self, *args): class ErniePretrainingCriterionPipe (line 523) | class ErniePretrainingCriterionPipe(ErniePretrainingCriterion): method __init__ (line 524) | def __init__(self, config): method forward (line 527) | def forward(self, logits, labels): class PipelinePretrainedModel (line 541) | class PipelinePretrainedModel(PretrainedModel): method __init__ (line 542) | def __init__(self, config, *args, **kwargs): method init (line 546) | def init(self, config, *args, **kwargs): method add_sequential_layer (line 551) | def add_sequential_layer(self, layer_desc, name_prefix=""): method get_sequential_layers (line 554) | def get_sequential_layers(self): method get_sequential_name_prefixs (line 557) | def get_sequential_name_prefixs(self): method get_shardlayer_prefix (line 560) | def get_shardlayer_prefix(self, name_splited): method _set_pipeline_name_mapping (line 571) | def _set_pipeline_name_mapping(self, mappings=None): method _check_shared_model_state (line 631) | def _check_shared_model_state(self): method state_dict (line 652) | def state_dict(self, *args, **kwargs): method _init_weights (line 664) | def _init_weights(self, layer): method sharded_state_dict (line 736) | def sharded_state_dict(self, *args, **kwargs): function get_pp_vp_split_layers (line 770) | def get_pp_vp_split_layers(config): class ErnieMoEForCausalLMPipe (line 802) | class ErnieMoEForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): method _prepare_pipeline_inputs_func (line 815) | def _prepare_pipeline_inputs_func(cls, data): method __init__ (line 849) | def __init__( method get_loss_fn (line 1000) | def get_loss_fn(self, config): method rename_model_params (line 1003) | def rename_model_params(self, func): method fp8_quant_weight (line 1010) | def fp8_quant_weight(self): method _post_init (line 1016) | def _post_init(self, original_init, *args, **kwargs): method set_state_dict (line 1033) | def set_state_dict(self, state_dict, *args, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/models/fp8_linear.py function fp8_gemm (line 44) | def fp8_gemm( function padding (line 94) | def padding(x, axis): class Fp8FusedMlpFunc (line 118) | class Fp8FusedMlpFunc(paddle.autograd.PyLayer): method forward (line 128) | def forward(ctx, x, w1, w2): method backward (line 208) | def backward(ctx, do3): class MemEfficientFp8FusedMlpFunc (line 320) | class MemEfficientFp8FusedMlpFunc(paddle.autograd.PyLayer): method forward (line 333) | def forward(ctx, x, w1, w2): method backward (line 393) | def backward(ctx, do3): class Fp8FusedMlp (line 515) | class Fp8FusedMlp(paddle.nn.Layer): method __init__ (line 526) | def __init__(self, config): method forward (line 557) | def forward(self, x): FILE: paddleformers/cli/train/ernie_pretrain/models/moe/moe_layer.py function set_grad_in_dtype_non_consistent (line 73) | def set_grad_in_dtype_non_consistent(ctx): class Fp8MoeGateDispatchAndQuant (line 79) | class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer): method forward (line 83) | def forward( method backward (line 135) | def backward(ctx, *grads): function recompute_fwd_gate_up_func (line 154) | def recompute_fwd_gate_up_func(config, layer_idx): class MoEStatics (line 164) | class MoEStatics(nn.Layer): method __init__ (line 165) | def __init__(self, config, layer_idx): class GateCombine (line 196) | class GateCombine(PyLayer): method forward (line 198) | def forward(ctx, x, combine_weights, scatter_index): method backward (line 206) | def backward(ctx, grad_y, *_): class FusionFP8Expert (line 215) | class FusionFP8Expert(paddle.autograd.PyLayer): method forward (line 217) | def forward(ctx, hidden_states, custom_map): method backward (line 234) | def backward(ctx, output_grad): class AlltoAll (line 245) | class AlltoAll(PyLayer): method forward (line 247) | def forward(ctx, x, group, sync_op=True): method backward (line 260) | def backward(ctx, *dx): class AlltoAllExpertOverlap (line 264) | class AlltoAllExpertOverlap(PyLayer): method forward (line 266) | def forward(ctx, input, group, num_local_experts, forward_func_dict, i... method backward (line 302) | def backward(ctx, out_grad): class AlltoAllAsync (line 321) | class AlltoAllAsync(PyLayer): method forward (line 323) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): method backward (line 344) | def backward(ctx, dx_out, *fn_out_grads): function dispatching (line 364) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity): function combining_fused (line 391) | def combining_fused(x, combine_weights, scatter_index, hard_gate=False): class ReshapeKeepGradDtype (line 400) | class ReshapeKeepGradDtype(PyLayer): method forward (line 402) | def forward(ctx, x, shape): method backward (line 408) | def backward(ctx, grad): class MOELayer (line 412) | class MOELayer(nn.Layer): method __init__ (line 420) | def __init__( method forward_experts (line 523) | def forward_experts(self, dispatched_input): method fp8_quant_weight (line 559) | def fp8_quant_weight(self): method fused_gate_logits_process (line 590) | def fused_gate_logits_process(self, gate_logits, token_type_ids, offlo... method gate_distpach_and_quant (line 603) | def gate_distpach_and_quant(self, input, token_type_ids): method gate_and_distpach (line 688) | def gate_and_distpach(self, input, token_type_ids): method _calc_router_loss (line 800) | def _calc_router_loss( method calc_router_loss_and_logging (line 831) | def calc_router_loss_and_logging( method combine_expert_output (line 854) | def combine_expert_output(self, expert_output, combine_weights, scatte... method forward_single_stage (line 862) | def forward_single_stage(self, dispatched_input, stage_id): method all2all_expert_overlap (line 866) | def all2all_expert_overlap(self, x, group): method forward (line 893) | def forward( method sharded_state_dict (line 1067) | def sharded_state_dict( class FP8FusedWLCHFunc (line 1079) | class FP8FusedWLCHFunc(paddle.autograd.PyLayer): method forward (line 1081) | def forward( method backward (line 1134) | def backward(ctx, output_grad): class MlpNode (line 1149) | class MlpNode: method __init__ (line 1150) | def __init__(self, custom_map, max_topk, recompute_fwd_gate_up=False, ... method reset_status (line 1165) | def reset_status(self): method release_mem (line 1173) | def release_mem(self): method forward (line 1178) | def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_pro... method backward (line 1219) | def backward(self, hidden_states_out_grad): class Fp8FusedMoeFunc (line 1243) | class Fp8FusedMoeFunc(paddle.autograd.PyLayer): method forward (line 1245) | def forward( method backward (line 1269) | def backward(ctx, output_grad): FILE: paddleformers/cli/train/ernie_pretrain/models/moe/token_dispatcher/fp8_utils.py function _get_fp8_weight_and_scale (line 44) | def _get_fp8_weight_and_scale(weight, stacked=False, transpose=False): function fused_stack_transpose_quant (line 64) | def fused_stack_transpose_quant(weight_list, transpose=False): function split_group_gemm (line 82) | def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, ... function has_config (line 120) | def has_config(config_map, key): class ExpertsGroupGemmNode (line 124) | class ExpertsGroupGemmNode: method __init__ (line 137) | def __init__(self, experts, custom_map, name="moe_experts_node"): method reset_status (line 161) | def reset_status(self): method fwd_gate_up (line 167) | def fwd_gate_up(self, x_bf16, expert_w1, expert_w_count, tokens_per_ex... method fwd_swiglu (line 224) | def fwd_swiglu(self, o1): method fwd_down (line 240) | def fwd_down(self, o1, unzipped_probs, expert_w_count, tokens_per_expe... method fwd_down_no_probs (line 301) | def fwd_down_no_probs(self, o1, expert_w2, expert_w_count, tokens_per_... method bwd_down_input (line 359) | def bwd_down_input(self, expert_w2, unzipped_grad, tokens_per_expert, ... method bwd_down_input_no_prob (line 429) | def bwd_down_input_no_prob(self, expert_w2, unzipped_grad, tokens_per_... method bwd_swiglu (line 469) | def bwd_swiglu(self, o1, do2): method bwd_gate_up_input (line 486) | def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, expecte... method bwd_down_weight (line 544) | def bwd_down_weight(self, out_grad, o2, expert_w2): method bwd_gate_up_weight (line 628) | def bwd_gate_up_weight(self, do1, input_x, expert_w1): method forward (line 707) | def forward(self, hs_out, unzipped_probs, tokens_per_expert): method backward (line 726) | def backward(self, out_grad, tokens_per_expert, dispatched_indices, ex... method forward_no_prob (line 741) | def forward_no_prob(self, hs_out, tokens_per_expert): method backward_no_prob (line 753) | def backward_no_prob(self, out_grad, tokens_per_expert): class ExpertsGroupGemmContiguousNode (line 775) | class ExpertsGroupGemmContiguousNode: method __init__ (line 788) | def __init__( method reset_status (line 833) | def reset_status(self): method gen_m_indices (line 842) | def gen_m_indices(self, tokens_per_expert): method fwd_gate_up (line 863) | def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, sca... method fwd_swiglu (line 937) | def fwd_swiglu(self, o1): method fwd_down (line 941) | def fwd_down(self, o1, unzipped_probs, expert_w2, num_expert): method bwd_down_input (line 1007) | def bwd_down_input(self, expert_w2, unzipped_grad, o1): method bwd_swiglu (line 1084) | def bwd_swiglu(self, o1, do2): method bwd_gate_up_input (line 1088) | def bwd_gate_up_input(self, do1, expert_w1): method fused_transpose_split_quant (line 1145) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal... method bwd_down_weight (line 1170) | def bwd_down_weight(self, do3, o2, expert_w2): method bwd_gate_up_weight (line 1246) | def bwd_gate_up_weight(self, do1, input_x, expert_w1): method forward (line 1310) | def forward( method backward (line 1334) | def backward(self, out_grad, a2a_async_fn=None): class ExpertsGroupGemmWLCHNode (line 1427) | class ExpertsGroupGemmWLCHNode(ExpertsGroupGemmContiguousNode): method __init__ (line 1443) | def __init__( method gen_m_indices (line 1479) | def gen_m_indices(self, tokens_per_expert): method fused_transpose_split_quant (line 1499) | def fused_transpose_split_quant(self, x, tokens_per_expert, pow_2_scal... FILE: paddleformers/cli/train/ernie_pretrain/models/moe/token_dispatcher/moe_utils.py function inplace_offload (line 24) | def inplace_offload(x): function inplace_offload_if_needed (line 41) | def inplace_offload_if_needed(x, threshold=2 * 1024 * 1024 * 1024): function topk_to_permuted_indices_single (line 61) | def topk_to_permuted_indices_single(x, num_tokens, expert_id, topk): function topk_to_permuted_indices (line 81) | def topk_to_permuted_indices(x, num_tokens_per_expert_list, topk): function permute (line 105) | def permute( function unpermute (line 128) | def unpermute( class UnZipNode (line 163) | class UnZipNode: method __init__ (line 178) | def __init__(self, token_dispatcher, name="unzip"): method reset_status (line 190) | def reset_status(self): method forward (line 196) | def forward( method backward (line 240) | def backward(self, dx, hidden_states_out_grad, probs_grad, dispatched_... class ZipNode (line 268) | class ZipNode: method __init__ (line 281) | def __init__(self, token_dispatcher, name="zip"): method forward (line 292) | def forward( method backward (line 326) | def backward( FILE: paddleformers/cli/train/ernie_pretrain/models/moe/top2_gate.py function cal_aux_loss_func (line 37) | def cal_aux_loss_func( function masked_fill (line 87) | def masked_fill(x, mask, value): class CalAuxLossFunctor (line 92) | class CalAuxLossFunctor(paddle.autograd.PyLayer): method forward (line 94) | def forward( method backward (line 124) | def backward(ctx, out_grad): function cast_if_needed (line 132) | def cast_if_needed(x, dtype): class FusedGateDetachMatmul (line 136) | class FusedGateDetachMatmul(paddle.autograd.PyLayer): method forward (line 138) | def forward(ctx, x, w): method backward (line 144) | def backward(ctx, y_grad): function gate_detach_matmul (line 157) | def gate_detach_matmul(x, weight, use_fuse): function compute_optimal_transport (line 166) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:... class Top2Gate (line 180) | class Top2Gate(nn.Layer): method __init__ (line 193) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->... method _create_gate_parameter (line 261) | def _create_gate_parameter(self): method forward (line 271) | def forward( method get_capacity (line 303) | def get_capacity(self, num_tokens, cap_factor=None): method top2_gating (line 318) | def top2_gating(self, logits, cap=None, correction_bias=None): method _cal_aux_loss (line 390) | def _cal_aux_loss( method _cal_orthogonal_loss (line 435) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None): method _cal_orthogonal_loss_opt_each_weight (line 450) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): function cal_orthogonal_loss_opt_each_weight_func (line 457) | def cal_orthogonal_loss_opt_each_weight_func(weight, moe_k, use_group, e... class TopKGateFused (line 475) | class TopKGateFused(Top2Gate): method forward (line 476) | def forward( FILE: paddleformers/cli/train/ernie_pretrain/models/sequence_parallel_utils.py function get_hcg (line 49) | def get_hcg(): function get_async_loader (line 56) | def get_async_loader(): function hack_offload_wait (line 69) | def hack_offload_wait(task): function hack_reload_wait (line 73) | def hack_reload_wait(task): class ScatterOp (line 77) | class ScatterOp(PyLayer): method forward (line 79) | def forward(ctx, input, axis=0, group=None): method backward (line 85) | def backward(ctx, grad): class GatherOp (line 89) | class GatherOp(PyLayer): method forward (line 91) | def forward(ctx, input, axis=0, group=None): method backward (line 97) | def backward(ctx, grad): class AllGatherOp (line 101) | class AllGatherOp(PyLayer): method forward (line 103) | def forward(ctx, input, group=None): method backward (line 108) | def backward(ctx, grad): class ReduceScatterOp (line 112) | class ReduceScatterOp(PyLayer): method forward (line 114) | def forward(ctx, input, group=None): method backward (line 120) | def backward(ctx, grad): class AllGatherVarlenOp (line 124) | class AllGatherVarlenOp(PyLayer): method forward (line 126) | def forward(ctx, input, group=None): method backward (line 165) | def backward(ctx, grad): class GemmReduceScatterOp (line 179) | class GemmReduceScatterOp(PyLayer): method forward (line 181) | def forward(ctx, input, weight, group): method backward (line 188) | def backward(ctx, grad): class AllGatherGemmOp (line 209) | class AllGatherGemmOp(PyLayer): method forward (line 211) | def forward(ctx, input, weight, group): method backward (line 219) | def backward(ctx, grad): function sequence_parallel_sparse_mask_labels (line 236) | def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100): function mark_as_sequence_parallel_parameter (line 252) | def mark_as_sequence_parallel_parameter(parameter): function is_sequence_parallel_parameter (line 256) | def is_sequence_parallel_parameter(parameter): function create_fused_allreduce_gradient_hook (line 260) | def create_fused_allreduce_gradient_hook(parameter_list, accumulation_st... function create_non_fused_allreduce_gradient_hook (line 277) | def create_non_fused_allreduce_gradient_hook(param, model, verbose=False): function register_sequence_parallel_allreduce_hooks (line 300) | def register_sequence_parallel_allreduce_hooks(model, fuse_sequence_para... function is_fused_matmul_bias_supported (line 323) | def is_fused_matmul_bias_supported(): class ColumnSequenceParallelLinear (line 339) | class ColumnSequenceParallelLinear(Layer): method __init__ (line 340) | def __init__( method forward (line 432) | def forward(self, x, use_comm=True): method sharded_state_dict (line 452) | def sharded_state_dict( class MPScale (line 460) | class MPScale(PyLayer): method forward (line 462) | def forward(ctx, x, mp_degree): method backward (line 467) | def backward(ctx, dout): class RowSequenceParallelLinear (line 471) | class RowSequenceParallelLinear(Layer): method __init__ (line 472) | def __init__( method forward (line 568) | def forward(self, x): method sharded_state_dict (line 599) | def sharded_state_dict( FILE: paddleformers/cli/train/ernie_pretrain/models/utils.py function get_global_training_logs (line 31) | def get_global_training_logs(): function global_training_logs_enabled (line 49) | def global_training_logs_enabled(): function inplace_offload (line 54) | def inplace_offload(tensor): function detach_and_requires_grad_ (line 59) | def detach_and_requires_grad_(*args): class FakeClone (line 67) | class FakeClone(paddle.autograd.PyLayer): method forward (line 69) | def forward(ctx, input): method backward (line 78) | def backward(ctx, grad_output): function manual_backward (line 82) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]): class FakeGather (line 120) | class FakeGather(paddle.autograd.PyLayer): method forward (line 122) | def forward(ctx, input, indices): method backward (line 132) | def backward(ctx, grad_output): class FusedUnpermutation (line 141) | class FusedUnpermutation(paddle.autograd.PyLayer): method forward (line 143) | def forward( method backward (line 180) | def backward(ctx, output_tokens_grad): FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/fp8_quant_weight_callback.py function enable_in_dict_config (line 25) | def enable_in_dict_config(config, key): class FP8QuantWeightCallback (line 32) | class FP8QuantWeightCallback(TrainerCallback): method on_step_begin (line 33) | def on_step_begin(self, args, state, control, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/gc_callback.py class GCCallback (line 20) | class GCCallback(TrainerCallback): method on_train_begin (line 21) | def on_train_begin(self, args, state, control, **kwargs): method on_step_end (line 25) | def on_step_end(self, args, state, control, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/logging_callback.py class LoggingCallback (line 22) | class LoggingCallback(TrainerCallback): method __init__ (line 23) | def __init__( method on_log (line 28) | def on_log(self, args, state, control, logs=None, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/moe_correction_bias_adjust_callback.py class MoECorrectionBiasAdjustCallback (line 30) | class MoECorrectionBiasAdjustCallback(TrainerCallback): method __init__ (line 31) | def __init__(self, lr, use_sp): method on_optimizer_end (line 36) | def on_optimizer_end(self, args, state, control, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/moe_logging_callback.py function tensor_md5 (line 46) | def tensor_md5(tensor): class GlobalRNGCallback (line 52) | class GlobalRNGCallback(TrainerCallback): method on_step_end (line 53) | def on_step_end(self, args, state, control, model, **kwargs): class MoeLoggingCallback (line 58) | class MoeLoggingCallback(TrainerCallback): method __init__ (line 59) | def __init__(self, optimizer): method on_log (line 70) | def on_log(self, args, state, control, logs=None, **kwargs): method on_step_end (line 76) | def on_step_end(self, args, state, control, model, **kwargs): method on_save (line 115) | def on_save(self, args, state, control, model, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/ortho_loss_callback.py class OrthogonalCallback (line 24) | class OrthogonalCallback(TrainerCallback): method __init__ (line 25) | def __init__(self, ortho_loss_lambda): method on_optimizer_end (line 28) | def on_optimizer_end(self, args, state, control, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/sp_grad_sync_callback.py class SPGradSyncCallback (line 30) | class SPGradSyncCallback(TrainerCallback): method __init__ (line 31) | def __init__(self, model): method on_optimizer_begin (line 43) | def on_optimizer_begin(self, args, state, control, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/src/callbacks/tensorboard_callback.py function is_tensorboard_available (line 29) | def is_tensorboard_available(): function rewrite_logs (line 33) | def rewrite_logs(d): class TensorBoardCallback (line 49) | class TensorBoardCallback(TrainerCallback): method __init__ (line 50) | def __init__( method _init_summary_writer (line 91) | def _init_summary_writer(self, args, log_dir=None): method on_train_begin (line 96) | def on_train_begin(self, args, state, control, **kwargs): method on_log (line 120) | def on_log(self, args, state, control, logs=None, **kwargs): method on_train_end (line 183) | def on_train_end(self, args, state, control, **kwargs): FILE: paddleformers/cli/train/ernie_pretrain/src/clip/moe_clip.py class ClipGradForMOEByGlobalNorm (line 28) | class ClipGradForMOEByGlobalNorm(ClipGradBase): method __init__ (line 29) | def __init__( method __str__ (line 49) | def __str__(self): method get_l2_norm_pow (line 53) | def get_l2_norm_pow(params_grads, sum_dtype=None): method _dygraph_clip (line 101) | def _dygraph_clip(self, params_grads): FILE: paddleformers/cli/train/ernie_pretrain/src/lr_schedulers/cosine_lr.py function get_cosine_schedule_with_warmup (line 24) | def get_cosine_schedule_with_warmup( FILE: paddleformers/cli/train/ernie_pretrain/src/lr_schedulers/wsd_lr.py function get_wsd_schedule_with_warmup (line 20) | def get_wsd_schedule_with_warmup( FILE: paddleformers/cli/train/ernie_pretrain/src/tokenizers/tokenization_eb_v2.py class ErnieBotTokenizer (line 29) | class ErnieBotTokenizer(PretrainedTokenizer): method __init__ (line 40) | def __init__( method space_token (line 72) | def space_token(self): method space_token_id (line 76) | def space_token_id(self): method gend_token (line 80) | def gend_token(self): method gend_token_id (line 84) | def gend_token_id(self): method im_start_id (line 88) | def im_start_id(self): method im_end_id (line 92) | def im_end_id(self): method vocab_size (line 96) | def vocab_size(self): method get_vocab (line 99) | def get_vocab(self): method _tokenize (line 104) | def _tokenize(self, text): method _convert_token_to_id (line 107) | def _convert_token_to_id(self, token): method _convert_id_to_token (line 110) | def _convert_id_to_token(self, id): method convert_tokens_to_string (line 113) | def convert_tokens_to_string(self, tokens): method prepare_for_model (line 126) | def prepare_for_model(self, *args, **kwargs): method save_vocabulary (line 131) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st... method tokenize (line 147) | def tokenize(self, text: TextInput, **kwargs) -> List[str]: method _decode (line 169) | def _decode(self, *args, **kwargs): method _pad (line 179) | def _pad( function add_special_tokens (line 239) | def add_special_tokens( FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/data_parallel.py class DataParallel (line 22) | class DataParallel(paddle.DataParallel): method init_reducer (line 23) | def init_reducer(self): function sync_dp_moe_params_across_sharding (line 74) | def sync_dp_moe_params_across_sharding(model: paddle.nn.Layer) -> None: FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/dygraph_optimizer/hybrid_parallel_optimizer.py class HybridParallelClipGrad (line 37) | class HybridParallelClipGrad: method __init__ (line 38) | def __init__(self, clip, hcg, timers=None): method _global_norm (line 53) | def _global_norm( method _dygraph_clip (line 142) | def _dygraph_clip(self, params_grads): method _comm_and_clip (line 277) | def _comm_and_clip( method __getattr__ (line 330) | def __getattr__(self, item): method __call__ (line 333) | def __call__(self, params_grads): class HybridParallelOptimizer (line 337) | class HybridParallelOptimizer(HPBase): method __init__ (line 338) | def __init__(self, optimizer, hcg, strategy): FILE: paddleformers/cli/train/ernie_pretrain/src/trainers/pretraining_trainer.py function distributed_optimizer_maybe_overwrite (line 109) | def distributed_optimizer_maybe_overwrite( class PreTrainingArguments (line 134) | class PreTrainingArguments(TrainingArguments): method use_moe (line 286) | def use_moe(self): # noqa: F811 method use_moe (line 290) | def use_moe(self, value): method need_data (line 295) | def need_data(self): method combine_batch (line 299) | def combine_batch(self): method reeao_dataset_rank (line 303) | def reeao_dataset_rank(self): method reeao_dataset_world_size (line 307) | def reeao_dataset_world_size(self): method __post_init__ (line 310) | def __post_init__(self): class WeightedDistributedSampler (line 424) | class WeightedDistributedSampler(PaddleNLPDistributedBatchSampler): method __init__ (line 425) | def __init__( method set_epoch (line 471) | def set_epoch(self, epoch=0, consumed_samples=0): method gen_data_seq (line 476) | def gen_data_seq(self): method load_data_seq_from_cache (line 489) | def load_data_seq_from_cache(self): method gen_data_seq_weighted (line 502) | def gen_data_seq_weighted(self, num_examples, data_type=None): method roundup_and_shard (line 580) | def roundup_and_shard(self, indices): method __len__ (line 602) | def __len__(self): method __iter__ (line 605) | def __iter__(self): class DummySampler (line 673) | class DummySampler(PaddleNLPDistributedBatchSampler): method __init__ (line 674) | def __init__(self, dataset, batch_size=1, **kwargs): method __len__ (line 677) | def __len__(self): method __iter__ (line 680) | def __iter__(self): class PretrainingTrainer (line 685) | class PretrainingTrainer(Trainer): method __init__ (line 686) | def __init__(self, args=None, model=None, callbacks=[], **kwargs): method autocast_smart_context_manager (line 707) | def autocast_smart_context_manager(self): method _load_optimizer_state (line 739) | def _load_optimizer_state(self, checkpoint): method _save_moe_weights (line 788) | def _save_moe_weights(self, output_dir): method _wrap_model (line 835) | def _wrap_model(self, model, training=True): method _new_gradclip (line 1004) | def _new_gradclip(self): method evaluate (line 1050) | def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_pre... method prediction_pipeline_step (line 1081) | def prediction_pipeline_step(self, model, inputs, prediction_loss_only... method restore_dataloader_status (line 1087) | def restore_dataloader_status(self): method _get_eval_sampler (line 1132) | def _get_eval_sampler(self, eval_dataset) -> Optional[paddle.io.Sampler]: method _get_train_sampler (line 1142) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: method _maybe_log_save_evaluate (line 1152) | def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_... method create_scheduler (line 1316) | def create_scheduler(self, num_training_steps): method create_optimizer (line 1340) | def create_optimizer(self, lr_scheduler=None): method save_model (line 1404) | def save_model(self, output_dir=None): method _load_rng_state (line 1410) | def _load_rng_state(self, checkpoint): FILE: paddleformers/cli/train/ernie_pretrain/src/utils/logging.py function setup_logger_output_file (line 41) | def setup_logger_output_file(outputpath, local_rank): FILE: paddleformers/cli/train/ernie_pretrain/src/utils/misc.py class SmoothedValue (line 42) | class SmoothedValue: method __init__ (line 43) | def __init__( method update (line 52) | def update(self, value): method global_avg (line 63) | def global_avg(self): method reset (line 66) | def reset(self): class TrainingLogs (line 71) | class TrainingLogs: method __new__ (line 74) | def __new__(cls, *args, **kw): method __init__ (line 79) | def __init__(self): method set_trainer_interval (line 87) | def set_trainer_interval(self, trainer, logging_interval): method global_meters_keys (line 92) | def global_meters_keys(self): method global_meters_keys (line 96) | def global_meters_keys(self, lst): method enable_skip_zero (line 99) | def enable_skip_zero(self, keys=[]): method update (line 107) | def update(self, **kwargs): method is_enabled (line 111) | def is_enabled(self): method __setitem__ (line 114) | def __setitem__(self, k, v): method __getitem__ (line 122) | def __getitem__(self, v): method __getattr__ (line 125) | def __getattr__(self, attr): method dict (line 132) | def dict(self, use_async=False): method reset (line 186) | def reset(self): method take_snapshot (line 191) | def take_snapshot(self): method restore_snapshot (line 194) | def restore_snapshot(self): FILE: paddleformers/cli/train/ernie_pretrain/src/utils/seed_utils.py function set_seed (line 26) | def set_seed(seed): FILE: paddleformers/cli/train/ernie_pretrain/src/utils/training_utils.py function reset_per_device_batch_size (line 20) | def reset_per_device_batch_size(global_batch_size, per_device_train_batc... FILE: paddleformers/cli/train/ernie_pretrain/workflow.py function log_trainer_start (line 84) | def log_trainer_start(): function load_huggingface_checkpoint (line 94) | def load_huggingface_checkpoint(model, args): function get_expected_state_dict (line 189) | def get_expected_state_dict(model, **kwargs): function update_model_config_from_args (line 257) | def update_model_config_from_args(config: ErnieMoEConfig, model_args: di... function get_tp_split_ckpt (line 267) | def get_tp_split_ckpt(args, path): class AllArguments (line 279) | class AllArguments(PreTrainingArguments): method __post_init__ (line 280) | def __post_init__(self): class ExpConfig (line 285) | class ExpConfig: function create_pretrained_dataset (line 291) | def create_pretrained_dataset(args): function run_ernie_pretrain (line 337) | def run_ernie_pretrain(model_args, data_args, generating_args, training_... FILE: paddleformers/cli/train/sft/dataset_formatting.py function conversations_formatting_function (line 31) | def conversations_formatting_function(tokenizer: AutoTokenizer, messages... function instructions_formatting_function (line 49) | def instructions_formatting_function(tokenizer: AutoTokenizer): function paddleformers_instructions_formatting_function (line 75) | def paddleformers_instructions_formatting_function(tokenizer: AutoTokeni... function get_formatting_func_from_dataset (line 101) | def get_formatting_func_from_dataset(dataset: Union[Dataset], tokenizer:... FILE: paddleformers/cli/train/sft/make_data_utils.py class DataGenerator (line 18) | class DataGenerator: method __init__ (line 21) | def __init__(self, data_source): method __iter__ (line 34) | def __iter__(self): method __next__ (line 41) | def __next__(self): FILE: paddleformers/cli/train/sft/sft_config.py class SFTConfig (line 30) | class SFTConfig(TrainingArguments): method __post_init__ (line 87) | def __post_init__(self): FILE: paddleformers/cli/train/sft/sft_trainer.py class SFTTrainer (line 58) | class SFTTrainer(Trainer): method __init__ (line 59) | def __init__( method _prepare_dataset (line 197) | def _prepare_dataset( method _prepare_non_packed_dataloader (line 248) | def _prepare_non_packed_dataloader( method prediction_step (line 319) | def prediction_step( method log (line 378) | def log(self, logs: Dict[str, float], **kwargs) -> None: method get_ptq_dataloader (line 386) | def get_ptq_dataloader(self, ptq_ds): method ptq_loop (line 411) | def ptq_loop( FILE: paddleformers/cli/train/sft/workflow.py function create_pretrained_dataset (line 89) | def create_pretrained_dataset(training_args, data_args, model_args): function run_sft (line 167) | def run_sft( function create_peft_model (line 735) | def create_peft_model(model_args, training_args, dtype, model): FILE: paddleformers/cli/train/tuner.py function check_path (line 25) | def check_path(path): function _training_function (line 33) | def _training_function(config: dict[str, Any]) -> None: function run_tuner (line 71) | def run_tuner(args: Optional[dict[str, Any]] = None) -> None: FILE: paddleformers/cli/utils/llm_utils.py function compute_metrics (line 44) | def compute_metrics(eval_preds): function get_lora_target_modules (line 55) | def get_lora_target_modules(model): function get_infer_model_path (line 424) | def get_infer_model_path(input_dir, model_prefix): function deserialize_from_file (line 432) | def deserialize_from_file(fp): function get_alibi_slopes (line 462) | def get_alibi_slopes(num_heads): function pad_batch_data (line 477) | def pad_batch_data(insts, masks=None, pad_id=0, return_seq_len=False, pa... function dybatch_preprocess (line 505) | def dybatch_preprocess( function load_real_time_tokens (line 735) | def load_real_time_tokens(): function init_chat_template (line 752) | def init_chat_template( function get_model_max_position_embeddings (line 799) | def get_model_max_position_embeddings(config: PretrainedConfig) -> Optio... function read_res (line 812) | def read_res( function read_res_dynamic_insert (line 850) | def read_res_dynamic_insert( function speculate_read_res (line 899) | def speculate_read_res( function get_rotary_position_embedding (line 950) | def get_rotary_position_embedding(position_ids, head_dim, rope_theta=100... function init_dist_env (line 1000) | def init_dist_env(): function get_eos_token_id (line 1044) | def get_eos_token_id( function set_triton_cache (line 1066) | def set_triton_cache(model_name_or_path, mode): FILE: paddleformers/cli/utils/mllm_utils.py class MLLMModelMapping (line 27) | class MLLMModelMapping: class ModelKeys (line 37) | class ModelKeys: class MultiModelKeys (line 52) | class MultiModelKeys(ModelKeys): method __post_init__ (line 57) | def __post_init__(self): function register_multimodel_keys (line 66) | def register_multimodel_keys(multimodel_key: ModelKeys, *, exist_ok: boo... function get_multimodel_target_modules (line 73) | def get_multimodel_target_modules(model_type: Optional[str]) -> Optional... function get_multimodel_lora_target_modules (line 79) | def get_multimodel_lora_target_modules(model, target_modules, freeze_con... function freeze_model_parameters (line 131) | def freeze_model_parameters(model, freeze_config): FILE: paddleformers/cli/utils/process.py function terminate_process_tree (line 26) | def terminate_process_tree(pid: int) -> None: function is_env_enabled (line 61) | def is_env_enabled(env_var: str, default: str = "0") -> bool: function is_valid_model_dir (line 66) | def is_valid_model_dir(directory: str) -> bool: function detect_device (line 75) | def detect_device() -> str: function set_ascend_environment (line 98) | def set_ascend_environment(): function remove_paddle_shm_files (line 163) | def remove_paddle_shm_files(): function set_cuda_environment (line 174) | def set_cuda_environment(): function set_env_if_empty (line 197) | def set_env_if_empty(key, value): function add_new_special_tokens (line 212) | def add_new_special_tokens(tokenizer, path): FILE: paddleformers/data/blendable_dataset.py function print_rank_0 (line 26) | def print_rank_0(*args, **kwargs): class BlendableDataset (line 31) | class BlendableDataset(paddle.io.Dataset): method __init__ (line 32) | def __init__(self, datasets, weights, size, share_folder, *, data_cach... method __len__ (line 175) | def __len__(self): method __getitem__ (line 178) | def __getitem__(self, idx): FILE: paddleformers/data/causal_dataset.py function get_logits (line 35) | def get_logits(batch_ids, max_retries=1, timeout=1200, retry_delay=1, pr... function check_data_split (line 82) | def check_data_split(splits_string, do_train, do_eval, do_predict): function get_train_valid_test_split_ (line 102) | def get_train_valid_test_split_(splits_string, size): function get_datasets_weights_and_num_samples (line 129) | def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num... function print_rank_0 (line 160) | def print_rank_0(*args, **kwargs): function build_train_valid_test_datasets (line 165) | def build_train_valid_test_datasets( function _build_train_valid_test_datasets (line 261) | def _build_train_valid_test_datasets( function get_indexed_dataset_ (line 338) | def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): class GPTDataset (line 350) | class GPTDataset(paddle.io.Dataset): method __init__ (line 351) | def __init__( method __len__ (line 425) | def __len__(self): method __getitem__ (line 430) | def __getitem__(self, idx): function _build_index_mappings (line 523) | def _build_index_mappings( function _num_tokens (line 713) | def _num_tokens(documents, sizes): function _num_epochs (line 718) | def _num_epochs(tokens_per_epoch, seq_length, num_samples): function _build_doc_idx (line 733) | def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): function _build_sample_idx (line 749) | def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per... function _build_shuffle_idx (line 797) | def _build_shuffle_idx(num_samples, total_size, np_rng): FILE: paddleformers/data/collate.py class Stack (line 26) | class Stack(object): method __init__ (line 38) | def __init__(self, axis=0, dtype=None): method __call__ (line 42) | def __call__(self, data): class Pad (line 72) | class Pad(object): method __init__ (line 95) | def __init__(self, pad_val=0, axis=0, ret_length=None, dtype=None, pad... method __call__ (line 102) | def __call__(self, data): class Tuple (line 169) | class Tuple(object): method __init__ (line 187) | def __init__(self, fn, *args): method __call__ (line 200) | def __call__(self, data): class Dict (line 247) | class Dict(object): method __init__ (line 266) | def __init__(self, fn): method __call__ (line 280) | def __call__(self, data): FILE: paddleformers/data/data_collator.py class DataCollatorMixin (line 61) | class DataCollatorMixin: method __call__ (line 62) | def __call__(self, features, return_tensors=None): function default_data_collator (line 73) | def default_data_collator(features: List[InputDataClass], return_tensors... function paddle_default_data_collator (line 96) | def paddle_default_data_collator(features: List[InputDataClass]) -> Dict... function numpy_default_data_collator (line 128) | def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[... class DefaultDataCollator (line 162) | class DefaultDataCollator(DataCollatorMixin): method __call__ (line 179) | def __call__(self, features: List[Dict[str, Any]], return_tensors=None... class DataCollatorWithPadding (line 186) | class DataCollatorWithPadding: method __call__ (line 202) | def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: class DataCollatorForTokenClassification (line 225) | class DataCollatorForTokenClassification(DataCollatorMixin): method paddle_call (line 262) | def paddle_call(self, features): method numpy_call (line 299) | def numpy_call(self, features): class DataCollatorForSeq2Seq (line 330) | class DataCollatorForSeq2Seq: method __call__ (line 376) | def __call__(self, features, return_tensors=None): class DataCollatorForEmbedding (line 434) | class DataCollatorForEmbedding: method __call__ (line 448) | def __call__(self, batch, return_tensors=None) -> Any: method process_data (line 507) | def process_data(self, data, pad_idx, max_len): method pad_batch_data (line 516) | def pad_batch_data(insts, pad_id=0, max_seq_len=None, return_seq_len=F... method gen_self_attn_mask (line 531) | def gen_self_attn_mask(batch_token_ids: List[List[int]], max_seq_len: ... method gen_attn_mask_start_row_indices (line 543) | def gen_attn_mask_start_row_indices(batch_token_ids: List[List[int]], ... function _paddle_collate_batch (line 561) | def _paddle_collate_batch(examples, tokenizer, pad_to_multiple_of: Optio... function _numpy_collate_batch (line 599) | def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Option... function tolist (line 633) | def tolist(x): class DataCollatorForLanguageModeling (line 642) | class DataCollatorForLanguageModeling(DataCollatorMixin): method paddle_call (line 671) | def paddle_call(self, examples: List[Union[List[int], Any, Dict[str, A... method paddle_mask_tokens (line 695) | def paddle_mask_tokens(self, inputs: Any, special_tokens_mask: Optiona... method numpy_call (line 736) | def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, An... method numpy_mask_tokens (line 758) | def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional... FILE: paddleformers/data/dist_dataloader.py class DummyDataset (line 27) | class DummyDataset(paddle.io.Dataset): method __len__ (line 32) | def __len__(self): class IterableDummyDataset (line 36) | class IterableDummyDataset(paddle.io.IterableDataset): method __iter__ (line 37) | def __iter__(self): class DistDataLoader (line 41) | class DistDataLoader(paddle.io.DataLoader): method __init__ (line 46) | def __init__( method _dataloader_iter (line 132) | def _dataloader_iter(self): method __len__ (line 137) | def __len__(self): method __iter__ (line 143) | def __iter__(self): method _broadcast_data (line 146) | def _broadcast_data(self, data): method __next__ (line 201) | def __next__(self): function init_dataloader_comm_group (line 213) | def init_dataloader_comm_group(): FILE: paddleformers/data/indexed_dataset.py function print_rank_0 (line 40) | def print_rank_0(*args, **kwargs): function __best_fitting_dtype (line 45) | def __best_fitting_dtype(vocab_size=None): function get_available_dataset_impl (line 52) | def get_available_dataset_impl(): function make_dataset (line 56) | def make_dataset(path, impl, skip_warmup=False): function make_sft_dataset (line 72) | def make_sft_dataset(path, dataclass, skip_warmup=False, impl="mmap"): function dataset_exists (line 85) | def dataset_exists(path, impl): function read_longs (line 92) | def read_longs(f, n): function write_longs (line 98) | def write_longs(f, a): function read_shorts (line 102) | def read_shorts(f, n): function write_shorts (line 108) | def write_shorts(f, a): function code (line 126) | def code(dtype): function index_file_path (line 133) | def index_file_path(prefix_path): function sft_index_file_path (line 137) | def sft_index_file_path(prefix_path): function sft_data_file_path (line 141) | def sft_data_file_path(prefix_path, dataclass): function data_file_path (line 149) | def data_file_path(prefix_path): function loss_mask_file_path (line 153) | def loss_mask_file_path(prefix_path): function create_doc_idx (line 157) | def create_doc_idx(sizes): class IndexedDataset (line 165) | class IndexedDataset(paddle.io.Dataset): method __init__ (line 170) | def __init__(self, path): method read_index (line 176) | def read_index(self, path): method read_data (line 193) | def read_data(self, path): method check_index (line 196) | def check_index(self, i): method __del__ (line 200) | def __del__(self): method __getitem__ (line 205) | def __getitem__(self, idx): method get (line 229) | def get(self, idx, offset=0, length=None): method __len__ (line 247) | def __len__(self): method num_tokens (line 250) | def num_tokens(self, index): method size (line 253) | def size(self, index): method exists (line 257) | def exists(path): method supports_prefetch (line 261) | def supports_prefetch(self): method doc_idx (line 265) | def doc_idx(self): method get_doc_idx (line 268) | def get_doc_idx(self): method set_doc_idx (line 271) | def set_doc_idx(self, doc_idx_): class IndexedDatasetBuilder (line 275) | class IndexedDatasetBuilder(object): method __init__ (line 287) | def __init__(self, out_file, dtype=np.int32): method add_item (line 296) | def add_item(self, tensor): method end_document (line 305) | def end_document(self): method merge_file_ (line 308) | def merge_file_(self, another_file): method finalize (line 333) | def finalize(self, index_file): function _warmup_mmap_file (line 354) | def _warmup_mmap_file(path): class MMapIndexedDataset (line 360) | class MMapIndexedDataset(paddle.io.Dataset): class Index (line 361) | class Index(object): method writer (line 365) | def writer(cls, path, dtype): method __init__ (line 410) | def __init__(self, path, skip_warmup=False): method __del__ (line 448) | def __del__(self): method dtype (line 453) | def dtype(self): method sizes (line 457) | def sizes(self): method doc_idx (line 461) | def doc_idx(self): method __getitem__ (line 465) | def __getitem__(self, i): method __len__ (line 468) | def __len__(self): method __init__ (line 471) | def __init__(self, path, skip_warmup=False): method __getstate__ (line 481) | def __getstate__(self): method __setstate__ (line 484) | def __setstate__(self, state): method _do_init (line 487) | def _do_init(self, path, skip_warmup): method __del__ (line 506) | def __del__(self): method __len__ (line 514) | def __len__(self): method __getitem__ (line 518) | def __getitem__(self, idx): method get (line 537) | def get(self, idx, offset=0, length=None): method sizes (line 555) | def sizes(self): method doc_idx (line 559) | def doc_idx(self): method get_doc_idx (line 562) | def get_doc_idx(self): method set_doc_idx (line 565) | def set_doc_idx(self, doc_idx_): method supports_prefetch (line 569) | def supports_prefetch(self): method exists (line 573) | def exists(path): class SFTMMapIndexedDataset (line 577) | class SFTMMapIndexedDataset(paddle.io.Dataset): class Index (line 578) | class Index(object): method writer (line 582) | def writer(cls, path, dtype): method __init__ (line 624) | def __init__(self, path, skip_warmup=False): method __del__ (line 662) | def __del__(self): method dtype (line 667) | def dtype(self): method sizes (line 671) | def sizes(self): method doc_idx (line 675) | def doc_idx(self): method __getitem__ (line 679) | def __getitem__(self, i): method __len__ (line 682) | def __len__(self): method __init__ (line 685) | def __init__(self, path, dataclass, skip_warmup=False): method __getstate__ (line 694) | def __getstate__(self): method __setstate__ (line 697) | def __setstate__(self, state): method _do_init (line 700) | def _do_init(self, path, skip_warmup): method __del__ (line 719) | def __del__(self): method __len__ (line 726) | def __len__(self): method __getitem__ (line 729) | def __getitem__(self, idx): method sizes (line 767) | def sizes(self): method doc_idx (line 771) | def doc_idx(self): method get_doc_idx (line 774) | def get_doc_idx(self): method set_doc_idx (line 777) | def set_doc_idx(self, doc_idx_): method supports_prefetch (line 781) | def supports_prefetch(self): method exists (line 785) | def exists(path, dataclass): function make_builder (line 794) | def make_builder(out_file, impl, save_dtype, loss_mask_file=None): class SFTMMapIndexedDatasetBuilder (line 801) | class SFTMMapIndexedDatasetBuilder(object): method __init__ (line 802) | def __init__(self, output_file_dict, dtype, index_file=None): method add_item (line 818) | def add_item(self, sequence): method add_item_bytes (line 827) | def add_item_bytes(self, serialized): method end_document (line 835) | def end_document(self): method finalize (line 842) | def finalize(self, index_file): class MMapIndexedDatasetBuilder (line 849) | class MMapIndexedDatasetBuilder(object): method __init__ (line 850) | def __init__(self, out_file, dtype, loss_mask_file=None): method flush_loss_mask_item (line 859) | def flush_loss_mask_item(self, loss_mask_lst): method add_item (line 864) | def add_item(self, tensor): method add_doc (line 869) | def add_doc(self, tensor, sizes): method end_document (line 875) | def end_document(self): method merge_file_ (line 878) | def merge_file_(self, another_file): method finalize (line 891) | def finalize(self, index_file): function get_indexed_dataset_ (line 903) | def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): class CompatibleIndexedDataset (line 919) | class CompatibleIndexedDataset(paddle.io.Dataset): method __init__ (line 920) | def __init__(self, path): method __getstate__ (line 934) | def __getstate__(self): method __len__ (line 937) | def __len__(self): method __getitem__ (line 941) | def __getitem__(self, idx): method get (line 960) | def get(self, idx, offset=0, length=None): method sizes (line 976) | def sizes(self): method doc_idx (line 980) | def doc_idx(self): method get_doc_idx (line 983) | def get_doc_idx(self): method set_doc_idx (line 986) | def set_doc_idx(self, doc_idx_): method exists (line 990) | def exists(path): FILE: paddleformers/data/sampler.py class SamplerHelper (line 22) | class SamplerHelper(object): method __init__ (line 46) | def __init__(self, dataset, iterable=None): method __iter__ (line 53) | def __iter__(self): method __len__ (line 63) | def __len__(self): method length (line 72) | def length(self): method length (line 86) | def length(self, length): method apply (line 89) | def apply(self, fn): method shuffle (line 105) | def shuffle(self, buffer_size=-1, seed=None): method sort (line 171) | def sort(self, cmp=None, key=None, reverse=False, buffer_size=-1): method batch (line 247) | def batch(self, batch_size, drop_last=False, batch_size_fn=None, key=N... method shard (line 335) | def shard(self, num_replicas=None, rank=None): method list (line 406) | def list(self): FILE: paddleformers/data/tokenizer.py function get_idx_from_word (line 16) | def get_idx_from_word(word, word_to_idx, unk_word): class BaseTokenizer (line 22) | class BaseTokenizer(object): method __init__ (line 23) | def __init__(self, vocab): method get_tokenizer (line 26) | def get_tokenizer(self): method cut (line 29) | def cut(self, sentence): method encode (line 32) | def encode(self, sentence): FILE: paddleformers/data/vocab.py class Vocab (line 24) | class Vocab(object): method __init__ (line 57) | def __init__( method _index_counter_keys (line 132) | def _index_counter_keys(self, counter, special_tokens, max_size, min_f... method _sort_index_according_to_user_specification (line 147) | def _sort_index_according_to_user_specification(self, token_to_idx): method to_tokens (line 172) | def to_tokens(self, indices): method to_indices (line 228) | def to_indices(self, tokens): method __getitem__ (line 259) | def __getitem__(self, tokens): method __len__ (line 268) | def __len__(self): method __contains__ (line 271) | def __contains__(self, token): method __call__ (line 274) | def __call__(self, tokens): method idx_to_token (line 284) | def idx_to_token(self): method token_to_idx (line 289) | def token_to_idx(self): method to_json (line 293) | def to_json(self, path=None): method from_json (line 333) | def from_json(cls, json_str): method from_dict (line 378) | def from_dict(cls, token_to_idx, unk_token=None, pad_token=None, bos_t... method build_vocab (line 431) | def build_vocab( method load_vocabulary (line 509) | def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_toke... method save_vocabulary (line 558) | def save_vocabulary(self, filepath): method get_unk_token_id (line 569) | def get_unk_token_id(self): method get_bos_token_id (line 572) | def get_bos_token_id(self): method get_eos_token_id (line 575) | def get_eos_token_id(self): method get_pad_token_id (line 578) | def get_pad_token_id(self): FILE: paddleformers/datasets/DPODataset.py class Sequence (line 37) | class Sequence: class BaseDPODataSet (line 54) | class BaseDPODataSet: method __init__ (line 55) | def __init__(self, **dataset_config): method __len__ (line 90) | def __len__(self): method _generate_sequences (line 93) | def _generate_sequences(self): method _generate_greedy_packs (line 160) | def _generate_greedy_packs(self, sequences): method _preprocess_dpo_example (line 186) | def _preprocess_dpo_example(self, example): method __postprocess_before_concat (line 237) | def __postprocess_before_concat(self, example): method _postprocess_sequence (line 375) | def _postprocess_sequence(self, example): class IteratorDPODataset (line 495) | class IteratorDPODataset(BaseDPODataSet, IterableDataset): method __init__ (line 496) | def __init__(self, **dataset_config): method __iter__ (line 499) | def __iter__(self): class MapDPODataset (line 507) | class MapDPODataset(BaseDPODataSet, Dataset): method __init__ (line 508) | def __init__(self, **dataset_config): method __len__ (line 515) | def __len__(self): method __getitem__ (line 518) | def __getitem__(self, idx): FILE: paddleformers/datasets/SFTDataset.py class TextSequence (line 39) | class TextSequence: class Sequence (line 49) | class Sequence: class BaseSFTDataset (line 62) | class BaseSFTDataset: method __init__ (line 63) | def __init__(self, **dataset_config): method __len__ (line 194) | def __len__(self): method _worker_loop (line 197) | def _worker_loop(self): method _get_processed_data_iterator (line 212) | def _get_processed_data_iterator(self, dataset_iterator, actual_exampl... method _process_sequence (line 317) | def _process_sequence(self, example, actual_example_num): method _process_pretraining_tokens (line 324) | def _process_pretraining_tokens(self, example, actual_example_num): method _generate_greedy_packs_from_sequences (line 328) | def _generate_greedy_packs_from_sequences(self, sequences): method _generate_sequences (line 357) | def _generate_sequences(self): method __iter__ (line 574) | def __iter__(self): method _encode_pretraining_messages (line 585) | def _encode_pretraining_messages(self, messages, actual_example_num): method _postprocess_pretraining_sequence (line 593) | def _postprocess_pretraining_sequence(self, example, actual_example_num): method _postprocess_sequence (line 679) | def _postprocess_sequence(self, example, actual_example_num): method print_max_steps_estimate_progress (line 858) | def print_max_steps_estimate_progress(self): method _add_dynamic_eos (line 869) | def _add_dynamic_eos(input_ids, labels, suffix_tokens_id): method _binpacking_process_batch (line 885) | def _binpacking_process_batch(self, iterator, batch_size): class IteratorSFTDataset (line 901) | class IteratorSFTDataset(BaseSFTDataset, IterableDataset): method __init__ (line 902) | def __init__(self, **dataset_config): method __iter__ (line 905) | def __iter__(self): class MapSFTDataset (line 913) | class MapSFTDataset(BaseSFTDataset, Dataset): method __init__ (line 914) | def __init__(self, **dataset_config): method __len__ (line 933) | def __len__(self): method __getitem__ (line 936) | def __getitem__(self, idx): FILE: paddleformers/datasets/collate.py function calc_padding_size (line 28) | def calc_padding_size(seq_len: int, training_args) -> int: function dpo_collate_fn (line 48) | def dpo_collate_fn( function mm_dpo_collate_fn (line 193) | def mm_dpo_collate_fn( function collate_fn (line 442) | def collate_fn( function mm_collate_fn (line 523) | def mm_collate_fn( function pad_batch_data (line 724) | def pad_batch_data( function gen_self_attn_mask (line 777) | def gen_self_attn_mask(batch_token_ids: List[List[int]], max_seq_len: in... function gen_attn_mask_startend_row_indices (line 804) | def gen_attn_mask_startend_row_indices( FILE: paddleformers/datasets/data_utils.py function round_up_to_multiple_of_8 (line 32) | def round_up_to_multiple_of_8(n): function print_debug_info (line 37) | def print_debug_info(tokenizer, data, label): function convert_to_tokens_for_pt (line 46) | def convert_to_tokens_for_pt( function convert_to_tokens_for_sft (line 68) | def convert_to_tokens_for_sft( function convert_to_input_ids (line 113) | def convert_to_input_ids( function function_call_chat_template (line 151) | def function_call_chat_template(tokenizer, messages, tools): function postprocess_fc_sequence (line 181) | def postprocess_fc_sequence(tokenizer, example): function estimate_training (line 188) | def estimate_training(train_dataset, data_args, training_args, model_args): function get_worker_sliced_iterator (line 308) | def get_worker_sliced_iterator(dataset): function calculate_matched_group (line 344) | def calculate_matched_group(sequences, packing_length: int, is_finished:... FILE: paddleformers/datasets/dataset.py function load_from_ppnlp (line 56) | def load_from_ppnlp(path, *args, **kwargs): class DatasetTuple (line 69) | class DatasetTuple: method __init__ (line 70) | def __init__(self, splits): method __getitem__ (line 75) | def __getitem__(self, key): method __setitem__ (line 81) | def __setitem__(self, key, value): method _gen_identifier_map (line 84) | def _gen_identifier_map(self, splits): method __len__ (line 92) | def __len__(self): function import_main_class (line 96) | def import_main_class(module_path): function load_from_hf (line 117) | def load_from_hf(path, name=None, splits=None, **kwargs): function load_dataset (line 158) | def load_dataset(path_or_read_func, name=None, data_files=None, splits=N... class MapDataset (line 233) | class MapDataset(Dataset): method __init__ (line 246) | def __init__(self, data, **kwargs): method _transform (line 254) | def _transform(self, data): method __getitem__ (line 259) | def __getitem__(self, idx): method __len__ (line 266) | def __len__(self): method filter (line 272) | def filter(self, fn, num_workers=0): method _filter (line 303) | def _filter(self, fn): method shard (line 307) | def shard(self, num_shards=None, index=None, contiguous=False): method _shard (line 311) | def _shard(self, num_shards=None, index=None, contiguous=False): method map (line 345) | def map(self, fn, lazy=True, batched=False, num_workers=0): method _map (line 384) | def _map(self, fn, lazy=True, batched=False): class IterDataset (line 394) | class IterDataset(IterableDataset): method __init__ (line 407) | def __init__(self, data, **kwargs): method _transform (line 415) | def _transform(self, data): method _shard_filter (line 420) | def _shard_filter(self, num_samples): method _filter (line 423) | def _filter(self, data): method __iter__ (line 429) | def __iter__(self): method skip (line 451) | def skip(self, n): method filter (line 457) | def filter(self, fn): method shard (line 471) | def shard(self, num_shards=None, index=None): method map (line 498) | def map(self, fn): class DatasetBuilder (line 512) | class DatasetBuilder: method __init__ (line 523) | def __init__(self, lazy=None, name=None, **config): method read_datasets (line 529) | def read_datasets(self, splits=None, data_files=None): method read (line 614) | def read(self, filename, split="train"): method _read (line 727) | def _read(self, filename: str, *args): method _get_data (line 736) | def _get_data(self, mode: str): method get_labels (line 745) | def get_labels(self): method get_vocab (line 751) | def get_vocab(self): class SimpleBuilder (line 758) | class SimpleBuilder(DatasetBuilder): method __init__ (line 759) | def __init__(self, lazy, read_func): method read (line 763) | def read(self, **kwargs): FILE: paddleformers/datasets/loader.py function create_dataset (line 21) | def create_dataset(**dataset_config: Dict[str, Any]): function create_indexed_dataset (line 47) | def create_indexed_dataset(data_file_prefix): FILE: paddleformers/datasets/reader/convertor.py function convert_dpo_txt_data (line 18) | def convert_dpo_txt_data(data): function convert_txt_data (line 94) | def convert_txt_data(item): function convert_mm_data (line 144) | def convert_mm_data(item): function convert_pretraining_data (line 242) | def convert_pretraining_data(data): function erniekit_convertor (line 256) | def erniekit_convertor(item): function messages_convertor (line 272) | def messages_convertor(item): FILE: paddleformers/datasets/reader/download_manager.py function HuggingFaceDownload (line 22) | def HuggingFaceDownload(repo_id, download_path, resume_download=True, ma... FILE: paddleformers/datasets/reader/file_reader.py class BaseReader (line 32) | class BaseReader(IterableDataset): method __init__ (line 35) | def __init__( class FileReader (line 63) | class FileReader(BaseReader): method __init__ (line 64) | def __init__( method __iter__ (line 82) | def __iter__(self): method _get_extension (line 143) | def _get_extension(self): method _data_check (line 147) | def _data_check(self, data): class FileListReader (line 225) | class FileListReader(BaseReader): method __init__ (line 226) | def __init__( method __iter__ (line 246) | def __iter__(self): method _get_files (line 259) | def _get_files(self): function get_hf_dataset_config (line 268) | def get_hf_dataset_config(file_path): class HuggingFaceReader (line 275) | class HuggingFaceReader(BaseReader): method __init__ (line 276) | def __init__( method __iter__ (line 316) | def __iter__(self): FILE: paddleformers/datasets/reader/io.py function load_json (line 23) | def load_json(file_path): function load_txt (line 47) | def load_txt(file_path): function load_csv (line 57) | def load_csv(file_path): function load_parquet (line 67) | def load_parquet(file_path): FILE: paddleformers/datasets/reader/mix_datasets.py class BaseMixDataset (line 24) | class BaseMixDataset(IterableDataset): method __init__ (line 29) | def __init__( method __iter__ (line 60) | def __iter__(self): method __len__ (line 67) | def __len__(self): class RandomDataset (line 74) | class RandomDataset(BaseMixDataset): method __init__ (line 79) | def __init__(self, *args, **kwargs): method __iter__ (line 90) | def __iter__(self): method __len__ (line 116) | def __len__(self): class ConcatDataset (line 120) | class ConcatDataset(BaseMixDataset): method __init__ (line 129) | def __init__(self, *args, **kwargs): method __iter__ (line 140) | def __iter__(self): method __len__ (line 153) | def __len__(self): class InterLeaveDataset (line 158) | class InterLeaveDataset(BaseMixDataset): method __init__ (line 167) | def __init__(self, *args, **kwargs): method _build_dataset (line 186) | def _build_dataset(self): method __iter__ (line 239) | def __iter__(self): method __len__ (line 252) | def __len__(self): function create_dataset_instance (line 265) | def create_dataset_instance(class_name, *args, **kwargs): FILE: paddleformers/datasets/reader/multi_source_datasets.py class InfiniteDataset (line 28) | class InfiniteDataset(IterableDataset): method __init__ (line 34) | def __init__(self, dataset, rng=None, random_shuffle=True): method __iter__ (line 49) | def __iter__(self): class MultiSourceDataset (line 62) | class MultiSourceDataset(IterableDataset): method __init__ (line 65) | def __init__(self, **dataset_config): method __iter__ (line 164) | def __iter__(self): FILE: paddleformers/datasets/rlhf_datasets/protocol.py class TensorDict (line 35) | class TensorDict: method __init__ (line 36) | def __init__(self, source: dict, batch_size=None, num_batch_dims=1): method __setitem__ (line 44) | def __setitem__(self, key: str, tensor: paddle.Tensor): method __getitem__ (line 51) | def __getitem__(self, key): method keys (line 54) | def keys(self): method items (line 57) | def items(self): method to (line 60) | def to(self, device: str): function union_two_dict (line 66) | def union_two_dict(dict1: Dict, dict2: Dict): function pad_dataproto_to_divisor (line 84) | def pad_dataproto_to_divisor(data: "DataProto", size_divisor: int): function unpad_dataproto (line 110) | def unpad_dataproto(data: "DataProto", pad_size): function union_tensor_dict (line 116) | def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict... function union_numpy_dict (line 132) | def union_numpy_dict(tensor_dict1: dict[np.ndarray], tensor_dict2: dict[... function list_of_dict_to_dict_of_list (line 146) | def list_of_dict_to_dict_of_list(list_of_dict: list[dict]): function fold_batch_dim (line 158) | def fold_batch_dim(data: "DataProto", new_batch_size): function unfold_batch_dim (line 178) | def unfold_batch_dim(data: "DataProto", batch_dims=2): function collate_fn (line 197) | def collate_fn(x: list["DataProtoItem"]): class DataProtoItem (line 211) | class DataProtoItem: class DataProto (line 219) | class DataProto: method __post_init__ (line 228) | def __post_init__(self): method __len__ (line 232) | def __len__(self): method __getitem__ (line 241) | def __getitem__(self, item): method print_size (line 246) | def print_size(self, prefix=""): method check_consistency (line 263) | def check_consistency(self): method from_single_dict (line 288) | def from_single_dict(cls, data: Dict[str, Union[paddle.Tensor, np.ndar... method from_dict (line 303) | def from_dict(cls, tensors: Dict[str, paddle.Tensor], non_tensors=None... method to (line 339) | def to(self, device) -> "DataProto": method select (line 353) | def select(self, batch_keys=None, non_tensor_batch_keys=None, meta_inf... method pop (line 388) | def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_k... method rename (line 420) | def rename(self, old_keys=None, new_keys=None) -> "DataProto": method union (line 447) | def union(self, other: "DataProto") -> "DataProto": method make_iterator (line 465) | def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader... method chunk (line 498) | def chunk(self, chunks: int) -> List["DataProto"]: method concat (line 533) | def concat(data: List["DataProto"]) -> "DataProto": method reorder (line 557) | def reorder(self, indices): method repeat (line 565) | def repeat(self, repeat_times=2, interleave=True): FILE: paddleformers/datasets/rlhf_datasets/rl_dataset.py function left_padding (line 33) | def left_padding(sequences, padding_value=0, max_length=None): function padding_batch_data (line 43) | def padding_batch_data( function collate_fn (line 63) | def collate_fn(data_list: list[dict], pad_token_id: int, requires_label:... class RLHFDataset (line 83) | class RLHFDataset(Dataset): method __init__ (line 91) | def __init__( method _read_files (line 124) | def _read_files(self): method tokenize (line 133) | def tokenize( method __len__ (line 155) | def __len__(self): method __getitem__ (line 158) | def __getitem__(self, index: int) -> dict[str, paddle.Tensor]: FILE: paddleformers/datasets/template/augment_utils.py class RandomApply (line 24) | class RandomApply: method __init__ (line 25) | def __init__(self, transforms, p=0.5): method __call__ (line 29) | def __call__(self, x): class RandomDiscreteRotation (line 36) | class RandomDiscreteRotation: method __init__ (line 37) | def __init__(self, degrees, interpolation="nearest", expand=True): method __call__ (line 42) | def __call__(self, img): class JpegCompression (line 47) | class JpegCompression: method __init__ (line 48) | def __init__(self, quality_range=(20, 80)): method __call__ (line 51) | def __call__(self, img): class RandomScale (line 59) | class RandomScale: method __init__ (line 60) | def __init__(self, scale_range=(0.7, 1.3), interpolation="bicubic"): method __call__ (line 64) | def __call__(self, img): class RandomSingleSidePadding (line 75) | class RandomSingleSidePadding: method __init__ (line 76) | def __init__(self, padding_range=(0, 20), fill="white"): method __call__ (line 83) | def __call__(self, img): FILE: paddleformers/datasets/template/formatter.py class Formatter (line 33) | class Formatter(ABC): method apply (line 38) | def apply(self, **kwargs) -> SLOTS: method extract (line 42) | def extract(self, content: str) -> Union[str, list["FunctionCall"]]: class EmptyFormatter (line 51) | class EmptyFormatter(Formatter): method __post_init__ (line 52) | def __post_init__(self): method apply (line 62) | def apply(self, **kwargs) -> SLOTS: class StringFormatter (line 67) | class StringFormatter(Formatter): method __post_init__ (line 68) | def __post_init__(self): method apply (line 78) | def apply(self, **kwargs) -> SLOTS: class FunctionFormatter (line 97) | class FunctionFormatter(StringFormatter): method __post_init__ (line 98) | def __post_init__(self): method apply (line 103) | def apply(self, **kwargs) -> SLOTS: class ToolFormatter (line 138) | class ToolFormatter(Formatter): method __post_init__ (line 139) | def __post_init__(self): method apply (line 143) | def apply(self, **kwargs) -> SLOTS: class ThinkingFormatter (line 153) | class ThinkingFormatter(StringFormatter): method __post_init__ (line 154) | def __post_init__(self): method apply (line 158) | def apply(self, **kwargs) -> SLOTS: FILE: paddleformers/datasets/template/grounding_plugin.py class BaseGroundingPlugin (line 20) | class BaseGroundingPlugin: method normalize_bbox (line 21) | def normalize_bbox(self, bbox: List[float]) -> List[int]: method format_ref_object (line 24) | def format_ref_object(self, obj_name: str) -> str: method format_bbox (line 27) | def format_bbox(self, bbox: List[float]) -> str: method process_messages (line 31) | def process_messages(self, messages, objects): function register_grounding_plugin (line 62) | def register_grounding_plugin(name, plugin_class): function get_grounding_plugin (line 69) | def get_grounding_plugin( FILE: paddleformers/datasets/template/mm_plugin.py function _make_batched_images (line 61) | def _make_batched_images(images, imglens: list[int]): function _check_video_is_nested_images (line 71) | def _check_video_is_nested_images(video) -> bool: class MMPluginMixin (line 77) | class MMPluginMixin: method _validate_input (line 83) | def _validate_input( method _validate_messages (line 121) | def _validate_messages( method _file_download (line 150) | def _file_download(self, url: str) -> bytes: method _img_download (line 163) | def _img_download(self, url: str) -> Image.Image: method _video_download (line 169) | def _video_download(self, url: str) -> VideoReader: method _preprocess_image (line 175) | def _preprocess_image(self, image, image_max_pixels, image_min_pixels,... method _get_video_sample_indices (line 192) | def _get_video_sample_indices(self, video_reader, video_fps, video_max... method _regularize_images (line 205) | def _regularize_images(self, images, **kwargs): method _regularize_videos (line 214) | def _regularize_videos(self, videos, **kwargs): method _regularize_audios (line 240) | def _regularize_audios(self, audios, sampling_rate: float, **kwargs): method _get_mm_inputs (line 251) | def _get_mm_inputs( class BasePlugin (line 308) | class BasePlugin(MMPluginMixin): method process_messages (line 309) | def process_messages( method process_tokens (line 322) | def process_tokens(self, tokens, processor): method get_mm_inputs (line 345) | def get_mm_inputs( class PaddleOCRVLPlugin (line 364) | class PaddleOCRVLPlugin(BasePlugin): method __init__ (line 368) | def __init__(self, image_token, video_token, audio_token, **kwargs): method get_ocr_augmentations (line 382) | def get_ocr_augmentations( method _preprocess_image (line 420) | def _preprocess_image(self, image, **kwargs): method _get_mm_inputs (line 443) | def _get_mm_inputs( method process_messages (line 465) | def process_messages( class ErnieVLPlugin (line 507) | class ErnieVLPlugin(BasePlugin): method convert_to_rgb (line 513) | def convert_to_rgb(self, image: Image.Image) -> Image.Image: method _preprocess_image (line 554) | def _preprocess_image(self, image, **kwargs): method _get_video_sample_indices (line 559) | def _get_video_sample_indices(self, video_reader, video_fps, video_max... method _regularize_videos (line 593) | def _regularize_videos(self, videos, **kwargs): method _get_mm_inputs (line 635) | def _get_mm_inputs( method process_messages (line 668) | def process_messages( class Qwen2VLPlugin (line 724) | class Qwen2VLPlugin(BasePlugin): method _preprocess_image (line 729) | def _preprocess_image(self, image, **kwargs): method _regularize_videos (line 746) | def _regularize_videos(self, videos, **kwargs): method _get_mm_inputs (line 785) | def _get_mm_inputs( method process_messages (line 819) | def process_messages( class Qwen2OmniPlugin (line 872) | class Qwen2OmniPlugin(Qwen2VLPlugin): method _get_mm_inputs (line 877) | def _get_mm_inputs( method _to_float_dtype (line 953) | def _to_float_dtype(data: Any, dtype: str) -> Any: method process_messages (line 976) | def process_messages( class Qwen3VLPlugin (line 1052) | class Qwen3VLPlugin(Qwen2VLPlugin): method _get_mm_inputs (line 1054) | def _get_mm_inputs( method process_messages (line 1095) | def process_messages( class GLM4VPlugin (line 1179) | class GLM4VPlugin(Qwen2VLPlugin): method _get_mm_inputs (line 1181) | def _get_mm_inputs( method process_messages (line 1217) | def process_messages( method get_mm_inputs (line 1296) | def get_mm_inputs( class Gemma3Plugin (line 1311) | class Gemma3Plugin(BasePlugin): method process_messages (line 1313) | def process_messages( method get_mm_inputs (line 1351) | def get_mm_inputs( class GlmOcrPlugin (line 1366) | class GlmOcrPlugin(BasePlugin): method process_messages (line 1379) | def process_messages( function register_mm_plugin (line 1470) | def register_mm_plugin(name: str, plugin_class: type["BasePlugin"]) -> N... function get_mm_plugin (line 1478) | def get_mm_plugin( FILE: paddleformers/datasets/template/template.py class Role (line 49) | class Role(str, Enum): class Template (line 58) | class Template: method encode_oneturn (line 77) | def encode_oneturn( method encode_multiturn (line 93) | def encode_multiturn( method add_thought (line 104) | def add_thought(self, content: str = "") -> str: method remove_thought (line 108) | def remove_thought(self, content: str) -> str: method get_thought_word_ids (line 113) | def get_thought_word_ids(self, tokenizer: "PreTrainedTokenizer") -> li... method _convert_elements_to_ids (line 117) | def _convert_elements_to_ids(self, tokenizer: "PreTrainedTokenizer", e... method _encode (line 136) | def _encode( method _add_or_replace_eos_token (line 185) | def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_to... method fix_special_tokens (line 201) | def fix_special_tokens(self, tokenizer: "PreTrainedTokenizer") -> None: class ReasoningTemplate (line 222) | class ReasoningTemplate(Template): method encode_oneturn (line 226) | def encode_oneturn( method encode_multiturn (line 253) | def encode_multiturn( class Llama2Template (line 280) | class Llama2Template(Template): method _encode (line 284) | def _encode( class ErnieThinkingTemplate (line 328) | class ErnieThinkingTemplate(ReasoningTemplate): method _encode (line 332) | def _encode( function register_template (line 383) | def register_template( function parse_template (line 458) | def parse_template(tokenizer: "PreTrainedTokenizer") -> "Template": function get_template_and_fix_tokenizer (line 521) | def get_template_and_fix_tokenizer(dataset_config) -> "Template": function _get_gpt_oss_prefix (line 841) | def _get_gpt_oss_prefix(): FILE: paddleformers/datasets/template/tool_utils.py class FunctionCall (line 29) | class FunctionCall(NamedTuple): class ToolUtils (line 82) | class ToolUtils(ABC): method tool_formatter (line 87) | def tool_formatter(tools: list[dict[str, Any]]) -> str: method function_formatter (line 93) | def function_formatter(functions: list["FunctionCall"]) -> str: class DefaultToolUtils (line 98) | class DefaultToolUtils(ToolUtils): method tool_formatter (line 103) | def tool_formatter(tools: list[dict[str, Any]]) -> str: method function_formatter (line 138) | def function_formatter(functions: list["FunctionCall"]) -> str: class QwenToolUtils (line 142) | class QwenToolUtils(ToolUtils): method tool_formatter (line 147) | def tool_formatter(tools: list[dict[str, Any]]) -> str: method function_formatter (line 157) | def function_formatter(functions: list["FunctionCall"]) -> str: class GLM4ToolUtils (line 165) | class GLM4ToolUtils(ToolUtils): method tool_formatter (line 170) | def tool_formatter(tools: list[dict[str, Any]]) -> str: method function_formatter (line 182) | def function_formatter(functions: list["FunctionCall"]) -> str: class GLM4MOEToolUtils (line 189) | class GLM4MOEToolUtils(QwenToolUtils): method tool_formatter (line 194) | def tool_formatter(tools: list[dict[str, Any]]) -> str: method function_formatter (line 204) | def function_formatter(functions: list["FunctionCall"]) -> str: class Llama3ToolUtils (line 221) | class Llama3ToolUtils(ToolUtils): method tool_formatter (line 229) | def tool_formatter(tools: list[dict[str, Any]]) -> str: method function_formatter (line 240) | def function_formatter(functions: list["FunctionCall"]) -> str: class ERNIEToolUtils (line 245) | class ERNIEToolUtils(ToolUtils): method tool_formatter (line 250) | def tool_formatter(tools: list[dict[str, Any]]) -> str: method function_formatter (line 261) | def function_formatter(functions: list["FunctionCall"]) -> str: class ERNIEVLToolUtils (line 269) | class ERNIEVLToolUtils(ToolUtils): method tool_formatter (line 274) | def tool_formatter(tools: list[dict[str, Any]]) -> str: method function_formatter (line 285) | def function_formatter(functions: list["FunctionCall"]) -> str: function get_tool_utils (line 304) | def get_tool_utils(name: str) -> "ToolUtils": FILE: paddleformers/generation/configuration_utils.py function resolve_hf_generation_config_path (line 36) | def resolve_hf_generation_config_path(repo_id: str, cache_dir: str, subf... class GenerationConfig (line 62) | class GenerationConfig: method _get_generation_mode (line 131) | def _get_generation_mode(self): method __init__ (line 142) | def __init__(self, **kwargs): method __eq__ (line 206) | def __eq__(self, other): method __repr__ (line 218) | def __repr__(self): method validate (line 221) | def validate(self, is_init=False): method save_pretrained (line 292) | def save_pretrained( method from_pretrained (line 337) | def from_pretrained( method _dict_from_json_file (line 434) | def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): method dict_paddle_dtype_to_str (line 439) | def dict_paddle_dtype_to_str(self, d: Dict[str, Any]) -> None: method from_dict (line 452) | def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "Generati... method to_diff_dict (line 476) | def to_diff_dict(self) -> Dict[str, Any]: method to_dict (line 499) | def to_dict(self) -> Dict[str, Any]: method to_json_string (line 514) | def to_json_string(self, use_diff: bool = True) -> str: method to_json_file (line 532) | def to_json_file(self, json_file_path: Union[str, os.PathLike], use_di... method from_model_config (line 547) | def from_model_config(cls, model_config: PretrainedConfig) -> "Generat... method update (line 575) | def update(self, **kwargs): FILE: paddleformers/generation/logits_process.py class LogitsProcessor (line 26) | class LogitsProcessor(ABC): method __call__ (line 32) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor): class LogitsProcessorList (line 38) | class LogitsProcessorList: method __init__ (line 41) | def __init__(self, processors: List[LogitsProcessor] = None) -> None: method __call__ (line 47) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor, **... method append (line 59) | def append(self, processor: LogitsProcessor): class MinLengthLogitsProcessor (line 63) | class MinLengthLogitsProcessor(LogitsProcessor): method __init__ (line 72) | def __init__(self, min_length: int, eos_token_id: Union[int, List[int]]): method __call__ (line 82) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor): class RepetitionPenaltyLogitsProcessor (line 89) | class RepetitionPenaltyLogitsProcessor(LogitsProcessor): method __init__ (line 99) | def __init__(self, penalty: float): method __call__ (line 105) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor): function _get_ngrams (line 113) | def _get_ngrams(ngram_size: int, prev_input_ids: paddle.Tensor, num_hypo... function _get_generated_ngrams (line 140) | def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur... function _calc_banned_ngram_tokens (line 162) | def _calc_banned_ngram_tokens(ngram_size: int, prev_input_ids: paddle.Te... class NoRepeatNGramLogitsProcessor (line 177) | class NoRepeatNGramLogitsProcessor(LogitsProcessor): method __init__ (line 186) | def __init__(self, ngram_size: int): method __call__ (line 191) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor): class HammingDiversityLogitsProcessor (line 204) | class HammingDiversityLogitsProcessor(LogitsProcessor): method __init__ (line 219) | def __init__(self, diversity_rate: float, num_beams: int, num_beam_gro... method __call__ (line 230) | def __call__( class ForcedBOSTokenLogitsProcessor (line 252) | class ForcedBOSTokenLogitsProcessor(LogitsProcessor): method __init__ (line 261) | def __init__(self, forced_bos_token_id: int): method __call__ (line 264) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor): class ForcedEOSTokenLogitsProcessor (line 272) | class ForcedEOSTokenLogitsProcessor(LogitsProcessor): method __init__ (line 281) | def __init__(self, max_length: int, forced_eos_token_id: Union[int, Li... method __call__ (line 285) | def __call__(self, input_ids, scores): function TopKProcess (line 293) | def TopKProcess(probs: paddle.Tensor, top_k: int, min_tokens_to_keep: int): function TopPProcess (line 311) | def TopPProcess(probs: paddle.Tensor, top_p: float, min_tokens_to_keep: ... class LogitsWarper (line 347) | class LogitsWarper: method __call__ (line 350) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor): class TemperatureLogitsWarper (line 356) | class TemperatureLogitsWarper(LogitsWarper): method __init__ (line 364) | def __init__(self, temperature: float): method __call__ (line 370) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor): class SequenceBiasLogitsProcessor (line 375) | class SequenceBiasLogitsProcessor(LogitsProcessor): method __init__ (line 437) | def __init__(self, sequence_bias: Dict[Tuple[int], float]): method __call__ (line 446) | def __call__(self, input_ids, scores): method _prepare_bias_variables (line 484) | def _prepare_bias_variables(self, scores): method _validate_arguments (line 508) | def _validate_arguments(self): class NoBadWordsLogitsProcessor (line 527) | class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor): method __init__ (line 591) | def __init__(self, bad_words_ids: List[List[int]], eos_token_id: Union... method _validate_arguments (line 608) | def _validate_arguments(self): class PrefixConstrainedLogitsProcessor (line 623) | class PrefixConstrainedLogitsProcessor(LogitsProcessor): method __init__ (line 636) | def __init__(self, prefix_allowed_tokens_fn: Callable[[int, paddle.Ten... method __call__ (line 640) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor) ->... FILE: paddleformers/generation/stopping_criteria.py class StoppingCriteria (line 24) | class StoppingCriteria(ABC): method __call__ (line 30) | def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor, **... class MaxTimeCriteria (line 34) | class MaxTimeCriteria(StoppingCriteria): method __init__ (line 47) | def __init__(self, max_time: float, initial_timestamp: Optional[float]... method __call__ (line 51) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **... class MaxLengthCriteria (line 55) | class MaxLengthCriteria(StoppingCriteria): method __init__ (line 65) | def __init__(self, max_length: int): method __call__ (line 68) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **... class StoppingCriteriaList (line 72) | class StoppingCriteriaList(list): method __call__ (line 73) | def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **... method max_length (line 77) | def max_length(self): function validate_stopping_criteria (line 84) | def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, ... FILE: paddleformers/generation/streamers.py class BaseStreamer (line 26) | class BaseStreamer: method put (line 31) | def put(self, value): method end (line 35) | def end(self): class TextStreamer (line 40) | class TextStreamer(BaseStreamer): method __init__ (line 67) | def __init__(self, tokenizer: PreTrainedTokenizer, skip_prompt: bool =... method put (line 77) | def put(self, value): method end (line 111) | def end(self): method on_finalized_text (line 125) | def on_finalized_text(self, text: str, stream_end: bool = False): method _is_chinese_char (line 129) | def _is_chinese_char(self, cp): class TextIteratorStreamer (line 154) | class TextIteratorStreamer(TextStreamer): method __init__ (line 195) | def __init__( method on_finalized_text (line 207) | def on_finalized_text(self, text: str, stream_end: bool = False): method __iter__ (line 213) | def __iter__(self): method __next__ (line 216) | def __next__(self): FILE: paddleformers/generation/utils.py function _make_sliding_window_mask (line 66) | def _make_sliding_window_mask(input_shape, past_key_values_length=0, win... function get_unfinished_flag (line 95) | def get_unfinished_flag( class BeamHypotheses (line 126) | class BeamHypotheses: method __init__ (line 127) | def __init__(self, num_beams, length_penalty, early_stopping): method __len__ (line 137) | def __len__(self): method add (line 143) | def add(self, hyp, sum_logprobs, origin_len=0): method is_done (line 157) | def is_done(self, best_sum_logprobs, cur_len, origin_len=0): class BeamSearchScorer (line 173) | class BeamSearchScorer(object): method __init__ (line 178) | def __init__( method is_done (line 221) | def is_done(self): method process (line 224) | def process( method finalize (line 292) | def finalize( class GenerationMixin (line 347) | class GenerationMixin(object): method prepare_input_ids_for_generation (line 357) | def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): method prepare_attention_mask_for_generation (line 366) | def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos... method _prepare_decoder_attention_mask (line 386) | def _prepare_decoder_attention_mask( method prepare_seq_len_for_generation (line 467) | def prepare_seq_len_for_generation(input_ids, pad_token_id, eos_token_... method get_logits_processor (line 478) | def get_logits_processor( method expand_inputs_for_generation (line 527) | def expand_inputs_for_generation(input_ids, expand_size, attention_mas... method update_model_kwargs_for_generation (line 561) | def update_model_kwargs_for_generation(outputs, model_kwargs, is_encod... method update_scores_for_generation (line 614) | def update_scores_for_generation(scores, next_scores, length, unfinish... method prepare_encoder_decoder_kwargs_for_generation (line 623) | def prepare_encoder_decoder_kwargs_for_generation(self, input_ids, mod... method prepare_decoder_input_ids_for_generation (line 641) | def prepare_decoder_input_ids_for_generation(self, input_ids, decoder_... method get_decoder_start_token_id (line 651) | def get_decoder_start_token_id(self, decoder_start_token_id=None, bos_... method prepare_inputs_for_generation (line 669) | def prepare_inputs_for_generation( method adjust_logits_during_generation (line 781) | def adjust_logits_during_generation(self, logits): method prepare_fast_entry (line 787) | def prepare_fast_entry(self, kwargs): method _convert_to_fast (line 790) | def _convert_to_fast(self, kwargs): method _build_fast (line 794) | def _build_fast(self, kwargs): method set_pad_token_id (line 806) | def set_pad_token_id(self, pad_token_id, eos_token_id): method generate (line 818) | def generate( method greedy_search (line 1248) | def greedy_search( method sample (line 1349) | def sample( method _get_model_inputs_spec (line 1493) | def _get_model_inputs_spec(self, dtype: str): method to_static (line 1502) | def to_static(self, path: str, config: dict): method sample_d2s (line 1543) | def sample_d2s( method reorder_cache (line 1677) | def reorder_cache(self, cache, beam_idx): method beam_search (line 1684) | def beam_search( method group_beam_search (line 1849) | def group_beam_search( FILE: paddleformers/mergekit/merge_config.py class MergeConfig (line 25) | class MergeConfig: method __post_init__ (line 81) | def __post_init__(self): method config_check (line 84) | def config_check(self): method __dict__ (line 128) | def __dict__(self): method to_dict (line 131) | def to_dict(self): method save_pretrained (line 134) | def save_pretrained(self, save_directory): method from_pretrained (line 155) | def from_pretrained(cls, pretrained_model_path, **kwargs): method from_json_file (line 180) | def from_json_file(cls, path_json_file): FILE: paddleformers/mergekit/merge_method.py class MergeMethod (line 19) | class MergeMethod: method __init__ (line 20) | def __init__(self, merge_config, sparsify_method=None): method merge (line 24) | def merge(self, tensor_list): method linear (line 36) | def linear(self, tensor_list): method slerp (line 58) | def slerp(self, tensor_list): method ties (line 127) | def ties(self, tensor_list): method normalize (line 192) | def normalize(self, t): FILE: paddleformers/mergekit/merge_model.py class MergeModel (line 63) | class MergeModel: method __init__ (line 64) | def __init__(self, merge_config): method reset_merge_model (line 70) | def reset_merge_model(self, merge_config=None, merge_param_dict=None): method merge_model (line 94) | def merge_model(self): method copy_file (line 107) | def copy_file(self): method mergekit (line 121) | def mergekit(self): method merge_mix_model (line 135) | def merge_mix_model(self, file_type_list): method get_model_state_dict (line 249) | def get_model_state_dict(self, model_path, file_type, key_list=None, f... method get_safetensor_index (line 304) | def get_safetensor_index(self, model_path, file_type): method merge_safetensor_model (line 319) | def merge_safetensor_model(self, file_type_list): method shard_merge_np (line 406) | def shard_merge_np( method shard_merge_pd (line 451) | def shard_merge_pd( method check_model_path (line 523) | def check_model_path(self, model_path, lora_merge=False): method check_lora_model_path (line 536) | def check_lora_model_path(self, model_path): method weight_name (line 547) | def weight_name(self): method safe_weight_name (line 553) | def safe_weight_name(self): method safe_index_name (line 559) | def safe_index_name(self): method merge_lora_model (line 565) | def merge_lora_model(self): method get_split_qkv_hidden_size (line 576) | def get_split_qkv_hidden_size(self, base_state_dict): method split_fuse_lora_state_dict (line 589) | def split_fuse_lora_state_dict(self, base_state_dict, lora_state_dict): method shard_lora_merge (line 618) | def shard_lora_merge(self, base_index, shard_file, lora_config, file_t... method merge_safetensor_lora_model (line 691) | def merge_safetensor_lora_model(self, file_type_list): method merge_pdparams_lora_model (line 792) | def merge_pdparams_lora_model(self, file_type_list): FILE: paddleformers/mergekit/merge_utils.py function divide_positions (line 17) | def divide_positions(m, n): function divide_lora_key_list (line 35) | def divide_lora_key_list(key_list, n, lora_config): function divide_safetensor_key_list (line 58) | def divide_safetensor_key_list(weight_map, n): FILE: paddleformers/mergekit/sparsify_method.py class SparsifyMethod (line 18) | class SparsifyMethod: method __init__ (line 19) | def __init__(self, merge_config): method sparsify (line 22) | def sparsify(self, tensor): method dare (line 34) | def dare(self, tensor): method magprune (line 47) | def magprune(self, tensor): method trim (line 91) | def trim(self, tensor): FILE: paddleformers/nn/activation.py class ClassInstantier (line 20) | class ClassInstantier(OrderedDict): method __getitem__ (line 21) | def __getitem__(self, key): FILE: paddleformers/nn/attention/eager_attention.py function repeat_kv (line 23) | def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: function eager_attention_forward (line 31) | def eager_attention_forward( FILE: paddleformers/nn/attention/flashmask_attention.py function flashmask_attention_forward (line 24) | def flashmask_attention_forward( FILE: paddleformers/nn/attention/interface.py class AttentionInterface (line 22) | class AttentionInterface(GeneralInterface): FILE: paddleformers/nn/attention/sdpa_attention.py function sdpa_attention_forward (line 24) | def sdpa_attention_forward( FILE: paddleformers/nn/attention/sink_impl.py function _get_fa_version (line 26) | def _get_fa_version(): function _flash_attention_forward_dispatch (line 33) | def _flash_attention_forward_dispatch( function _flash_attention_backward_dispatch (line 87) | def _flash_attention_backward_dispatch( function _flashmask_attention_forward_dispatch (line 129) | def _flashmask_attention_forward_dispatch( function _flashmask_attention_backward_dispatch (line 180) | def _flashmask_attention_backward_dispatch( class FlashMaskSinkPyLayer (line 222) | class FlashMaskSinkPyLayer(PyLayer): method forward (line 232) | def forward( method backward (line 387) | def backward(ctx, grad_output): function sink_attention_forward (line 550) | def sink_attention_forward( FILE: paddleformers/nn/attention/utils.py function repeat_kv (line 18) | def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: FILE: paddleformers/nn/criterion/dpo_loss.py function dpo_preprocess_inputs (line 33) | def dpo_preprocess_inputs(self, logits, labels): function loss_impl (line 50) | def loss_impl(self, logits, labels): function dpo_logps (line 56) | def dpo_logps( function cal_dpo_loss (line 248) | def cal_dpo_loss( function dpo_loss_forward (line 336) | def dpo_loss_forward( FILE: paddleformers/nn/criterion/interface.py class LossInterface (line 27) | class LossInterface(GeneralInterface): class CriterionLayer (line 40) | class CriterionLayer(nn.Layer): method __init__ (line 41) | def __init__(self, config, return_tuple=True, use_infohub=False, **kwa... method forward (line 89) | def forward(self, logits, labels, loss_mask=None, **kwargs): FILE: paddleformers/nn/criterion/kto_loss.py function kto_preprocess_inputs (line 36) | def kto_preprocess_inputs(self, logits, labels): function _nested_gather (line 51) | def _nested_gather(self, tensors): function kto_logps (line 69) | def kto_logps( function kto_loss (line 198) | def kto_loss( function kto_loss_forward (line 231) | def kto_loss_forward( FILE: paddleformers/nn/criterion/loss_utils.py function calc_lm_head_logits (line 23) | def calc_lm_head_logits( function subbatch (line 67) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar... FILE: paddleformers/nn/criterion/sft_loss.py function sft_preprocess_inputs (line 29) | def sft_preprocess_inputs(self, logits, labels): function sft_postprocess_loss (line 44) | def sft_postprocess_loss(self, masked_lm_loss, labels, loss_mask, **kwar... function loss_impl (line 60) | def loss_impl(self, logits, labels): function sft_calculate_loss (line 66) | def sft_calculate_loss(self, logits, hidden_states, lm_head_weight, lm_h... function sft_loss_forward (line 128) | def sft_loss_forward( function mtp_sft_loss_forward (line 188) | def mtp_sft_loss_forward( FILE: paddleformers/nn/embedding.py class Embedding (line 24) | class Embedding(GeneralInterface): method create (line 31) | def create( method process_kwargs (line 57) | def process_kwargs(self, embedding_type, **kwargs): method get_embedding_type (line 67) | def get_embedding_type(self, config: PretrainedConfig): FILE: paddleformers/nn/general.py class GeneralInterface (line 19) | class GeneralInterface(MutableMapping): method __init__ (line 29) | def __init__(self): method __getitem__ (line 32) | def __getitem__(self, key): method __setitem__ (line 38) | def __setitem__(self, key, value): method __delitem__ (line 42) | def __delitem__(self, key): method __iter__ (line 45) | def __iter__(self): method __len__ (line 49) | def __len__(self): method register (line 53) | def register(cls, key: str, value: Callable): method valid_keys (line 56) | def valid_keys(self) -> list[str]: FILE: paddleformers/nn/linear.py class Linear (line 29) | class Linear(GeneralInterface): method create (line 39) | def create( method get_linear_type (line 62) | def get_linear_type(self, config: PretrainedConfig, tp_plan: str = None): method get_linear_kwargs (line 72) | def get_linear_kwargs(self, linear_type, has_bias=False, gather_output... FILE: paddleformers/nn/lm_head.py class LMHead (line 27) | class LMHead(nn.Layer): method __init__ (line 28) | def __init__(self, config: PretrainedConfig): method _set_distributed_attr (line 68) | def _set_distributed_attr(self, param): method forward (line 73) | def forward(self, hidden_states, tensor_parallel_output=None): method extra_repr (line 111) | def extra_repr(self): method sharded_state_dict (line 114) | def sharded_state_dict( FILE: paddleformers/nn/mlp.py class MLP (line 25) | class MLP(nn.Layer): method __init__ (line 26) | def __init__( method forward (line 105) | def forward(self, x): FILE: paddleformers/nn/moe/abstract.py class MOELayerBase (line 18) | class MOELayerBase(nn.Layer): FILE: paddleformers/nn/moe/all_gather.py function allgather_async (line 26) | def allgather_async(input, group=None): function reduce_scatter_async (line 51) | def reduce_scatter_async(input, group=None): class AllGatherAsync (line 86) | class AllGatherAsync(PyLayer): method forward (line 92) | def forward(ctx, input, *fn_args, group=None, fn=None, is_first_fwd=Fa... method backward (line 116) | def backward(ctx, grad, *fn_out_grads): class AlltoAllSmart (line 137) | class AlltoAllSmart(paddle.autograd.PyLayer): method forward (line 143) | def forward( method backward (line 334) | def backward( class AlltoAllSmartXPU (line 415) | class AlltoAllSmartXPU(paddle.autograd.PyLayer): method forward (line 421) | def forward( method backward (line 611) | def backward( FILE: paddleformers/nn/moe/all_to_all.py class AlltoAll (line 23) | class AlltoAll(PyLayer): method forward (line 29) | def forward(ctx, x, group, sync_op=True): method backward (line 53) | def backward(ctx, *dx): class AlltoAllAsync (line 66) | class AlltoAllAsync(PyLayer): method forward (line 72) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): method backward (line 106) | def backward(ctx, dx_out, *fn_out_grads): FILE: paddleformers/nn/moe/moe_allgather_layer.py class ReshardCombineWeight (line 52) | class ReshardCombineWeight(PyLayer): method forward (line 58) | def forward(ctx, input, group=None): method backward (line 78) | def backward(ctx, grad): class MOEAllGatherLayerV2 (line 95) | class MOEAllGatherLayerV2(MOEAlltoAllLayer): method __init__ (line 100) | def __init__( method forward (line 145) | def forward( method fused_gate_logits_process_fused (line 371) | def fused_gate_logits_process_fused(self, gate_logits_lm, gate_logits_... method fused_gate_and_dispatch (line 456) | def fused_gate_and_dispatch(self, input, token_type_ids=None, global_d... method forward_experts (line 666) | def forward_experts(self, *dispatched_input): method calc_router_loss_and_logging (line 748) | def calc_router_loss_and_logging( FILE: paddleformers/nn/moe/moe_alltoall_layer.py class GateCombine (line 42) | class GateCombine(PyLayer): method forward (line 48) | def forward(ctx, x, combine_weights, scatter_index): method backward (line 67) | def backward(ctx, grad_y, *_): function combining (line 87) | def combining(x, combine_weights, scatter_index, hard_gate=False): class MOEAlltoAllLayer (line 113) | class MOEAlltoAllLayer(MOELayerBase): method __init__ (line 118) | def __init__( method forward_experts (line 199) | def forward_experts(self, dispatched_input): method fused_gate_logits_process (line 240) | def fused_gate_logits_process(self, gate_logits, token_type_ids=None, ... method gate_and_dispatch (line 303) | def gate_and_dispatch(self, input, token_type_ids=None): method _calc_router_loss (line 392) | def _calc_router_loss( method calc_router_loss_and_logging (line 444) | def calc_router_loss_and_logging( method combine_expert_output (line 540) | def combine_expert_output(self, expert_output, combine_weights, scatte... method forward_single_stage (line 560) | def forward_single_stage(self, dispatched_input, stage_id): method all2all_expert_overlap (line 574) | def all2all_expert_overlap(self, x, group): method forward (line 602) | def forward( FILE: paddleformers/nn/moe/moe_block.py function create_moe_block (line 31) | def create_moe_block( class MoEStatics (line 87) | class MoEStatics(nn.Layer): method __init__ (line 93) | def __init__(self, config, layer_idx): FILE: paddleformers/nn/moe/topk_gate.py function masked_fill (line 40) | def masked_fill(x, mask, value): function compute_optimal_transport (line 57) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:... function cast_if_needed (line 86) | def cast_if_needed(x, dtype): class FusedGateDetachMatmul (line 100) | class FusedGateDetachMatmul(paddle.autograd.PyLayer): method forward (line 107) | def forward(ctx, x, w): method backward (line 124) | def backward(ctx, y_grad): function gate_detach_matmul (line 145) | def gate_detach_matmul(x, weight, use_fuse): class TopKGate (line 164) | class TopKGate(nn.Layer): method __init__ (line 169) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->... method _create_gate_parameter (line 279) | def _create_gate_parameter(self): method get_gate_weight (line 318) | def get_gate_weight(self, transform_weight): method forward (line 346) | def forward( method get_capacity (line 380) | def get_capacity(self, num_tokens, cap_factor=None): method _cal_aux_loss (line 406) | def _cal_aux_loss( method _cal_z_loss (line 503) | def _cal_z_loss(self, logits, loss_mask=None): method _cal_orthogonal_loss_opt_each_weight (line 523) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): method _cal_orthogonal_loss (line 553) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None): FILE: paddleformers/nn/moe/utils.py function get_hcg (line 32) | def get_hcg(): function scatter_axis (line 39) | def scatter_axis(input, group=None, axis=0): class ReduceScatterGroupOp (line 71) | class ReduceScatterGroupOp(PyLayer): method forward (line 77) | def forward(ctx, input, group=None): method backward (line 92) | def backward(ctx, grad): class AllGatherGroupOp (line 103) | class AllGatherGroupOp(PyLayer): method forward (line 109) | def forward(ctx, input, group=None): method backward (line 124) | def backward(ctx, grad): function get_async_loader (line 135) | def get_async_loader(): function hack_offload_wait (line 149) | def hack_offload_wait(task): function all_gather_group (line 154) | def all_gather_group(input, group=None, axis=0): function reduce_scatter_group (line 190) | def reduce_scatter_group(input, group=None): class ScatterOp (line 221) | class ScatterOp(PyLayer): method forward (line 234) | def forward(ctx, input, axis=0, group=None): method backward (line 241) | def backward(ctx, grad): function detach_and_requires_grad_ (line 246) | def detach_and_requires_grad_(*args): class FakeClone (line 263) | class FakeClone(paddle.autograd.PyLayer): method forward (line 269) | def forward(ctx, input): method backward (line 287) | def backward(ctx, grad_output): function manual_backward (line 300) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]): function _parse_moe_group (line 351) | def _parse_moe_group( FILE: paddleformers/nn/moe_deepep/modular_moe_layer.py class ModularMoELayer (line 40) | class ModularMoELayer(nn.Layer): method __init__ (line 41) | def __init__( method _init_expert_parallel (line 191) | def _init_expert_parallel(self): method forward (line 236) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: method _forward_traditional_moe (line 287) | def _forward_traditional_moe( method _forward_with_ep_parallel (line 327) | def _forward_with_ep_parallel( method get_auxiliary_loss (line 367) | def get_auxiliary_loss(self) -> paddle.Tensor: method get_z_loss (line 370) | def get_z_loss(self) -> paddle.Tensor: method get_all_losses (line 373) | def get_all_losses(self) -> Dict[str, paddle.Tensor]: method get_total_loss (line 379) | def get_total_loss(self) -> paddle.Tensor: method remove_loss_function (line 385) | def remove_loss_function(self, name: str): method update_loss_weights (line 395) | def update_loss_weights(self, weights: Dict[str, float]): method set_loss_combiner (line 405) | def set_loss_combiner(self, combiner_name: str): method get_expert_info (line 415) | def get_expert_info(self) -> Dict[str, Any]: FILE: paddleformers/nn/moe_deepep/moe_communication.py class MoECommunicationInterface (line 26) | class MoECommunicationInterface(ABC): method forward (line 28) | def forward( class AllToAllMoECommunication (line 70) | class AllToAllMoECommunication(nn.Layer, MoECommunicationInterface): method forward (line 75) | def forward( class DeepEPMoECommunication (line 226) | class DeepEPMoECommunication(nn.Layer, MoECommunicationInterface): method expert_forward (line 231) | def expert_forward(self, dispatched_input, tokens_per_expert, experts,... method forward (line 248) | def forward( FILE: paddleformers/nn/moe_deepep/moe_expert.py class MoEExpertInterface (line 23) | class MoEExpertInterface(ABC): method forward (line 25) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: class StandardMLPExpert (line 36) | class StandardMLPExpert(MLP): method __init__ (line 37) | def __init__( FILE: paddleformers/nn/moe_deepep/moe_factory.py class QuickAccessMoEFactory (line 19) | class QuickAccessMoEFactory: method create_from_model_name (line 21) | def create_from_model_name( FILE: paddleformers/nn/moe_deepep/moe_gate.py class MoEGateMixin (line 30) | class MoEGateMixin: method gate_score_func (line 31) | def gate_score_func(self, logits: paddle.Tensor) -> paddle.Tensor: method gumbel_rsample (line 54) | def gumbel_rsample(self, logits: paddle.Tensor) -> paddle.Tensor: method uniform_sample (line 58) | def uniform_sample(self, logits: paddle.Tensor) -> paddle.Tensor: method _one_hot_to_float (line 63) | def _one_hot_to_float(self, x, num_classes): method _one_hot_to_int64 (line 69) | def _one_hot_to_int64(self, x, num_classes): method _capacity (line 75) | def _capacity( method _cal_aux_loss (line 99) | def _cal_aux_loss(self, gates, mask): method _cal_seq_aux_loss (line 126) | def _cal_seq_aux_loss(self, probs, top_k, routing_map, max_seq_len): method _cal_z_loss (line 160) | def _cal_z_loss(self, logits) -> paddle.Tensor: method _cal_orthogonal_loss (line 173) | def _cal_orthogonal_loss(self) -> paddle.Tensor: method _priority (line 183) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle.... method _probs_drop_policy (line 205) | def _probs_drop_policy( method _topk_greedy (line 275) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle.... method _topk_group_limited_greedy (line 291) | def _topk_group_limited_greedy( method _topk_noaux_tc (line 323) | def _topk_noaux_tc( class StandardMoEGate (line 365) | class StandardMoEGate(nn.Layer, MoEGateMixin): method __init__ (line 366) | def __init__( method forward (line 442) | def forward( method topkgating (line 453) | def topkgating( FILE: paddleformers/nn/moe_deepep/moe_loss.py class LossType (line 27) | class LossType(Enum): class LossConfig (line 37) | class LossConfig: method __post_init__ (line 45) | def __post_init__(self): class LossFunction (line 50) | class LossFunction(Protocol): method __call__ (line 51) | def __call__( class AddAuxiliaryLoss (line 61) | class AddAuxiliaryLoss(paddle.autograd.PyLayer): method forward (line 68) | def forward(ctx, x, loss): method backward (line 75) | def backward(ctx, grad_output): class LossCombiner (line 82) | class LossCombiner(Protocol): method __call__ (line 83) | def __call__(self, losses: Dict[str, paddle.Tensor], configs: Dict[str... class LossRegistry (line 87) | class LossRegistry: method __init__ (line 88) | def __init__(self): method _register_default_losses (line 94) | def _register_default_losses(self): method _register_default_combiners (line 101) | def _register_default_combiners(self): method register_loss (line 106) | def register_loss(self, name: str, loss_func: LossFunction): method register_combiner (line 110) | def register_combiner(self, name: str, combiner: LossCombiner): method get_loss (line 114) | def get_loss(self, name: str) -> Optional[LossFunction]: method get_combiner (line 117) | def get_combiner(self, name: str) -> Optional[LossCombiner]: method list_losses (line 120) | def list_losses(self) -> List[str]: method list_combiners (line 123) | def list_combiners(self) -> List[str]: method _auxiliary_loss (line 126) | def _auxiliary_loss( method _z_loss (line 145) | def _z_loss( method _entropy_loss (line 156) | def _entropy_loss( method _sparsity_loss (line 166) | def _sparsity_loss( method _diversity_loss (line 184) | def _diversity_loss( method _weighted_sum_combiner (line 207) | def _weighted_sum_combiner( method _adaptive_sum_combiner (line 217) | def _adaptive_sum_combiner( method _geometric_mean_combiner (line 239) | def _geometric_mean_combiner( FILE: paddleformers/nn/moe_deepep/moe_loss_instance.py function get_global_loss_registry (line 22) | def get_global_loss_registry(): function custom_diversity_loss (line 32) | def custom_diversity_loss( function custom_weighted_sum_combiner (line 56) | def custom_weighted_sum_combiner( FILE: paddleformers/nn/norm.py class LayerNorm (line 29) | class LayerNorm(nn.LayerNorm): method __init__ (line 30) | def __init__( method enable_sequence_parallel (line 47) | def enable_sequence_parallel(self): class RMSNorm (line 54) | class RMSNorm(nn.Layer): method __init__ (line 55) | def __init__(self, config: PretrainedConfig, hidden_size=None, norm_ep... method forward (line 69) | def forward(self, hidden_states): method enable_sequence_parallel (line 82) | def enable_sequence_parallel(self): class Norm (line 86) | class Norm(GeneralInterface): method create (line 90) | def create( FILE: paddleformers/nn/pp_model.py function parse_args (line 40) | def parse_args(args, mtp_enable=False, is_embed=False): function get_pp_vp_split_layers (line 113) | def get_pp_vp_split_layers(config, skip_recompute_num=-1): function get_attr (line 179) | def get_attr(layer, name): class RotaryEmbedding (line 187) | class RotaryEmbedding(nn.Layer): method __init__ (line 188) | def __init__(self, config): method forward (line 194) | def forward(self, x, position_ids): class EmbeddingPipe (line 219) | class EmbeddingPipe(nn.Layer): method __init__ (line 220) | def __init__(self, config, embed_cls=None, rotary_emb_cls=None): method embedding_weight (line 239) | def embedding_weight(self): method forward (line 247) | def forward(self, args): class RMSNormPipe (line 349) | class RMSNormPipe(RMSNorm): method __init__ (line 350) | def __init__(self, *args, **kwargs): method forward (line 355) | def forward(self, args): class LayerNormPipe (line 361) | class LayerNormPipe(LayerNorm): method __init__ (line 362) | def __init__(self, *args, **kwargs): method forward (line 367) | def forward(self, args): class EmptyLayer (line 373) | class EmptyLayer(nn.Layer): method __init__ (line 378) | def __init__(self): method forward (line 381) | def forward(self, x): class LMHeadPipe (line 385) | class LMHeadPipe(LMHead): method forward (line 390) | def forward(self, args): method embedding_weight (line 408) | def embedding_weight(self): function make_decoder_layer_pipe (line 413) | def make_decoder_layer_pipe(decoder_layer): class CriterionLayerPipe (line 497) | class CriterionLayerPipe(CriterionLayer): method __init__ (line 498) | def __init__(self, *args, **kwargs): method forward (line 502) | def forward(self, logits, labels, mtp_logits=None): class GeneralModelForCausalLMPipe (line 509) | class GeneralModelForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): method __init__ (line 528) | def __init__(self, config: PretrainedConfig, **kwargs): method get_loss_fn (line 677) | def get_loss_fn(self, config): method register_cls_attr (line 686) | def register_cls_attr(cls, config_class=None, pretrained_model_class=N... method _prepare_pipeline_inputs_func (line 703) | def _prepare_pipeline_inputs_func(cls, inputs): FILE: paddleformers/peft/lora/auto_lora_model.py class LoRAAutoLinear (line 45) | class LoRAAutoLinear(LoRALinear): method __init__ (line 46) | def __init__( method process_intermediate_api (line 76) | def process_intermediate_api(self): method process_base_api (line 83) | def process_base_api(self): method auto_dist_config (line 98) | def auto_dist_config(self, prefix=""): class LoRAAutoModel (line 120) | class LoRAAutoModel(nn.Layer): method __init__ (line 126) | def __init__(self, model, lora_config: LoRAAutoConfig) -> None: method from_pretrained (line 150) | def from_pretrained(cls, model, lora_path, **kwargs): method set_state_dict (line 227) | def set_state_dict(self, state_dict): method _get_tensor_parallel_convert_actions (line 271) | def _get_tensor_parallel_convert_actions(self, loaded_keys, is_split=T... method _convert_tensor_parallel (line 284) | def _convert_tensor_parallel(self, lora_state_dict): method save_pretrained (line 295) | def save_pretrained(self, save_directory: str, merge_tensor_parallel: ... method _find_and_replace_module (line 350) | def _find_and_replace_module(self, model, module_name, lora_config, en... method _find_and_restore_module (line 391) | def _find_and_restore_module(self, module_name): method get_trainable_state_dict (line 404) | def get_trainable_state_dict(self, concat_init_lora=False): method print_trainable_parameters (line 419) | def print_trainable_parameters(self) -> None: method mark_only_lora_as_trainable (line 431) | def mark_only_lora_as_trainable(self) -> None: method get_lora_model (line 454) | def get_lora_model(self, model: Union[PretrainedModel, nn.Layer], lora... method merge_auto_dist_configs (line 526) | def merge_auto_dist_configs(self, configs): method _generate_auto_dist_config (line 604) | def _generate_auto_dist_config(self, auto_dist_degree): method restore_original_model (line 676) | def restore_original_model(self): method __getattr__ (line 684) | def __getattr__(self, name: str): method train (line 691) | def train(self): method eval (line 698) | def eval(self): method save_to_aistudio (line 705) | def save_to_aistudio( method disable_lora (line 760) | def disable_lora(self): method enable_lora (line 765) | def enable_lora(self): method merge (line 770) | def merge(self): method unmerge (line 775) | def unmerge(self): method get_model_config (line 780) | def get_model_config( FILE: paddleformers/peft/lora/lora_config.py class LoRAConfig (line 26) | class LoRAConfig: method __post_init__ (line 84) | def __post_init__(self): method scaling (line 92) | def scaling(self): method __dict__ (line 99) | def __dict__(self): method to_dict (line 102) | def to_dict(self): method save_pretrained (line 105) | def save_pretrained(self, save_directory): method from_pretrained (line 126) | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): method from_json_file (line 152) | def from_json_file(cls, path_json_file): class LoRAAutoConfig (line 166) | class LoRAAutoConfig(LoRAConfig): FILE: paddleformers/peft/lora/lora_layers.py class LoRALinear (line 50) | class LoRALinear(nn.Linear): method __init__ (line 52) | def __init__( method rope_init (line 107) | def rope_init(self): method get_delta_weight (line 116) | def get_delta_weight(self, lora_A=None, lora_B=None): method merge (line 124) | def merge(self): method unmerge (line 131) | def unmerge(self): method forward (line 138) | def forward(self, input: paddle.Tensor, *args, **kwargs): method extra_repr (line 146) | def extra_repr(self): class FleetLoRALinear (line 151) | class FleetLoRALinear(LoRALinear): method __init__ (line 152) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs): method forward (line 156) | def forward(self, input: paddle.Tensor): class RowParallelLoRALinear (line 164) | class RowParallelLoRALinear(RowParallelLinear): method __init__ (line 165) | def __init__( method sharded_state_dict (line 225) | def sharded_state_dict( method get_delta_weight (line 232) | def get_delta_weight(self, lora_A=None, lora_B=None): method unmerge (line 239) | def unmerge(self): method merge (line 246) | def merge(self): method forward (line 253) | def forward(self, x: paddle.Tensor): method extra_repr (line 299) | def extra_repr(self): class FleetRowParallelLoRALinear (line 304) | class FleetRowParallelLoRALinear(RowParallelLoRALinear): method __init__ (line 305) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs): method forward (line 309) | def forward(self, input: paddle.Tensor): class RowSequenceParallelLoRALinear (line 317) | class RowSequenceParallelLoRALinear(RowSequenceParallelLinear): method __init__ (line 318) | def __init__( method sharded_state_dict (line 378) | def sharded_state_dict( method get_delta_weight (line 385) | def get_delta_weight(self, lora_A=None, lora_B=None): method unmerge (line 392) | def unmerge(self): method merge (line 399) | def merge(self): method forward (line 406) | def forward(self, x: paddle.Tensor): method extra_repr (line 432) | def extra_repr(self): class FleetRowSequenceParallelLoRALinear (line 437) | class FleetRowSequenceParallelLoRALinear(RowSequenceParallelLoRALinear): method __init__ (line 438) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs): method forward (line 442) | def forward(self, input: paddle.Tensor): class ColumnParallelLoRALinear (line 450) | class ColumnParallelLoRALinear(ColumnParallelLinear): method __init__ (line 451) | def __init__( method sharded_state_dict (line 510) | def sharded_state_dict( method get_delta_weight (line 517) | def get_delta_weight(self, lora_A=None, lora_B=None): method unmerge (line 524) | def unmerge(self): method merge (line 532) | def merge(self): method forward (line 540) | def forward(self, input: paddle.Tensor): method extra_repr (line 571) | def extra_repr(self): class FleetColumnParallelLoRALinear (line 576) | class FleetColumnParallelLoRALinear(ColumnParallelLoRALinear): method __init__ (line 577) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs): method forward (line 581) | def forward(self, input: paddle.Tensor): class ColumnSequenceParallelLoRALinear (line 589) | class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear): method __init__ (line 590) | def __init__( method sharded_state_dict (line 650) | def sharded_state_dict( method get_delta_weight (line 657) | def get_delta_weight(self, lora_A=None, lora_B=None): method unmerge (line 664) | def unmerge(self): method merge (line 671) | def merge(self): method forward (line 678) | def forward(self, x: paddle.Tensor): method extra_repr (line 707) | def extra_repr(self): class FleetColumnSequenceParallelLoRALinear (line 712) | class FleetColumnSequenceParallelLoRALinear(ColumnSequenceParallelLoRALi... method __init__ (line 713) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs): method forward (line 717) | def forward(self, input: paddle.Tensor): class LoRAConv2D (line 725) | class LoRAConv2D(nn.Conv2D): method __init__ (line 727) | def __init__( method get_delta_weight (line 780) | def get_delta_weight(self, lora_A=None, lora_B=None): method unmerge (line 801) | def unmerge(self): method merge (line 809) | def merge(self): method forward (line 817) | def forward(self, input: paddle.Tensor, *args, **kwargs): method extra_repr (line 828) | def extra_repr(self): FILE: paddleformers/peft/lora/lora_model.py function get_tensor_model_parallel_group (line 53) | def get_tensor_model_parallel_group(): function get_tensor_model_parallel_world_size (line 56) | def get_tensor_model_parallel_world_size(): class PaddleFleetPipelineLayer (line 59) | class PaddleFleetPipelineLayer: class FleetColumnParallelLinear (line 62) | class FleetColumnParallelLinear: class FleetRowParallelLinear (line 65) | class FleetRowParallelLinear: function get_lora_layers (line 93) | def get_lora_layers(): class FleetColumnParallelQuantizationLinear (line 182) | class FleetColumnParallelQuantizationLinear: class FleetQuantizationLinear (line 185) | class FleetQuantizationLinear: class FleetRowParallelQuantizationLinear (line 188) | class FleetRowParallelQuantizationLinear: class FleetColumnParallelQuantizationLoRALinear (line 191) | class FleetColumnParallelQuantizationLoRALinear: class FleetQuantizationLoRALinear (line 194) | class FleetQuantizationLoRALinear: class FleetRowParallelQuantizationLoRALinear (line 197) | class FleetRowParallelQuantizationLoRALinear: class LoRAModel (line 214) | class LoRAModel(nn.Layer): method __init__ (line 224) | def __init__(self, model, lora_config: LoRAConfig) -> None: method add_lora_split_mapping (line 260) | def add_lora_split_mapping(self, module_name, is_column=False): method _get_tensor_parallel_mappings (line 263) | def _get_tensor_parallel_mappings(self, config, is_split=True): method from_pretrained (line 352) | def from_pretrained(cls, model, lora_path, **kwargs): method set_state_dict (line 459) | def set_state_dict(self, state_dict): method _merge_trainable_tensor_parallel (line 511) | def _merge_trainable_tensor_parallel(self, trainable_state_dict, offlo... method _get_tensor_parallel_convert_actions (line 549) | def _get_tensor_parallel_convert_actions(self, loaded_keys, is_split=T... method _convert_tensor_parallel (line 562) | def _convert_tensor_parallel(self, lora_state_dict): method sharded_state_dict (line 573) | def sharded_state_dict(self, *args, **kwargs): method save_pretrained (line 580) | def save_pretrained(self, save_directory: str, merge_tensor_parallel: ... method _find_and_replace_module (line 701) | def _find_and_replace_module(self, model, module_name, lora_config): method _find_and_restore_module (line 959) | def _find_and_restore_module(self, module_name): method get_trainable_state_dict (line 972) | def get_trainable_state_dict(self, concat_init_lora=False): method print_trainable_parameters (line 987) | def print_trainable_parameters(self) -> None: method mark_only_lora_as_trainable (line 999) | def mark_only_lora_as_trainable(self) -> None: method get_lora_model (line 1052) | def get_lora_model(self, model: Union[PretrainedModel, nn.Layer], lora... method restore_original_model (line 1066) | def restore_original_model(self): method __getattr__ (line 1091) | def __getattr__(self, name: str): method train (line 1098) | def train(self): method eval (line 1105) | def eval(self): method save_to_aistudio (line 1112) | def save_to_aistudio( method disable_lora (line 1167) | def disable_lora(self): method enable_lora (line 1172) | def enable_lora(self): method merge (line 1177) | def merge(self): method unmerge (line 1182) | def unmerge(self): method get_merge_state_dict (line 1187) | def get_merge_state_dict(self, offload: bool = True): FILE: paddleformers/peft/lora/lora_quant_layers.py class QuantedLoRALinear (line 22) | class QuantedLoRALinear(ConvertibleQuantedLayer): method __init__ (line 32) | def __init__(self, layer: nn.Layer, q_config): method forward (line 57) | def forward(self, input): method _linear_forward (line 69) | def _linear_forward(self, input, weight): method unmerge (line 74) | def unmerge(self): method merge (line 81) | def merge(self): method weights_to_quanters (line 88) | def weights_to_quanters(self): method activation_quanters (line 91) | def activation_quanters(self): class ColumnParallelQuantedLoRALinear (line 95) | class ColumnParallelQuantedLoRALinear(ConvertibleQuantedLayer): method __init__ (line 105) | def __init__(self, layer: nn.Layer, q_config): method forward (line 133) | def forward(self, input): method _linear_forward (line 147) | def _linear_forward(self, input, weight): method unmerge (line 161) | def unmerge(self): method merge (line 168) | def merge(self): method weights_to_quanters (line 175) | def weights_to_quanters(self): method activation_quanters (line 178) | def activation_quanters(self): class RowParallelQuantedLoRALinear (line 182) | class RowParallelQuantedLoRALinear(ConvertibleQuantedLayer): method __init__ (line 192) | def __init__(self, layer: nn.Layer, q_config): method forward (line 220) | def forward(self, input): method _linear_forward (line 235) | def _linear_forward(self, input, weight): method unmerge (line 254) | def unmerge(self): method merge (line 261) | def merge(self): method weights_to_quanters (line 268) | def weights_to_quanters(self): method activation_quanters (line 271) | def activation_quanters(self): FILE: paddleformers/peft/lora/lora_quantization_layers.py class QuantizationLoRABaseLinear (line 34) | class QuantizationLoRABaseLinear(nn.Layer): method __init__ (line 35) | def __init__(self, layer, lora_config): method forward (line 69) | def forward(self, x, add_bias=True): method merge (line 85) | def merge(self): method unmerge (line 88) | def unmerge(self): class QuantizationLoRALinear (line 92) | class QuantizationLoRALinear(QuantizationLoRABaseLinear): method __init__ (line 101) | def __init__(self, layer, lora_config): method forward (line 123) | def forward(self, x): class FleetQuantizationLoRALinear (line 130) | class FleetQuantizationLoRALinear(QuantizationLoRALinear): method __init__ (line 131) | def __init__(self, layer, skip_bias_add, lora_config): method forward (line 135) | def forward(self, input: paddle.Tensor): class ColumnParallelQuantizationLoRALinear (line 143) | class ColumnParallelQuantizationLoRALinear(QuantizationLoRABaseLinear): method __init__ (line 152) | def __init__(self, layer, lora_config): method forward (line 183) | def forward(self, x): method sharded_state_dict (line 219) | def sharded_state_dict( class FleetColumnParallelQuantizationLoRALinear (line 227) | class FleetColumnParallelQuantizationLoRALinear(ColumnParallelQuantizati... method __init__ (line 228) | def __init__(self, layer, skip_bias_add, lora_config): method forward (line 232) | def forward(self, input: paddle.Tensor): class RowParallelQuantizationLoRALinear (line 240) | class RowParallelQuantizationLoRALinear(QuantizationLoRABaseLinear): method __init__ (line 249) | def __init__(self, layer, lora_config): method forward (line 284) | def forward(self, x): method sharded_state_dict (line 321) | def sharded_state_dict( class FleetRowParallelQuantizationLoRALinear (line 329) | class FleetRowParallelQuantizationLoRALinear(RowParallelQuantizationLoRA... method __init__ (line 330) | def __init__(self, layer, skip_bias_add, lora_config): method forward (line 334) | def forward(self, input: paddle.Tensor): FILE: paddleformers/peft/lora/loraga_utils.py class LoRAGATrainer (line 36) | class LoRAGATrainer(Trainer): method __init__ (line 39) | def __init__(self, loraga_init_iters: int, gradient_offload: bool, **k... method estimate_gradient (line 53) | def estimate_gradient(self, model: PretrainedModel): method _wrap_model (line 86) | def _wrap_model(self, model): function get_module_gradient (line 137) | def get_module_gradient( function loraga_svd_reinit (line 205) | def loraga_svd_reinit( function loraga_svd_module (line 268) | def loraga_svd_module( function set_hook_enable (line 321) | def set_hook_enable(value=False): function get_hook_enable (line 326) | def get_hook_enable(): class GradientOffloadHookContext (line 331) | class GradientOffloadHookContext: method __init__ (line 334) | def __init__( method __enter__ (line 350) | def __enter__(self): method __exit__ (line 355) | def __exit__(self, exc_type, exc_val, exc_tb): method register_gradient_hook (line 358) | def register_gradient_hook(self): method get_record_gradient_hook (line 365) | def get_record_gradient_hook(self, model, gradient_dict, grad_name, pa... FILE: paddleformers/peft/lora/utils.py function rng_ctx (line 20) | def rng_ctx(is_mp: bool, in_dynamic_mode: bool): FILE: paddleformers/quantization/checkpoint_quantization_utils.py function cal_ratio (line 20) | def cal_ratio(m, v, eps=1e-8): function group_wise_quant_dequant (line 34) | def group_wise_quant_dequant( function merge_int4 (line 164) | def merge_int4(x, y): function split_int8 (line 179) | def split_int8(final): function cal_abs_min_max_channel (line 197) | def cal_abs_min_max_channel(inputs, quant_axis=1): function asymmetry_qdq_weight (line 219) | def asymmetry_qdq_weight( function cal_abs_max_channel (line 284) | def cal_abs_max_channel(inputs, quant_axis=1): function qdq_weight (line 305) | def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False... FILE: paddleformers/quantization/hadamard_utils.py function matmul_hadU (line 20) | def matmul_hadU(X): function create_hadamard_matrix (line 36) | def create_hadamard_matrix(block_size, dtype): function hadamard_matmul (line 42) | def hadamard_matmul(input, side, hadamard_matrix, block_size): function apply_hadamard_matmul (line 59) | def apply_hadamard_matmul(x, side, block_size): FILE: paddleformers/quantization/qat_utils.py function quantize (line 47) | def quantize( function dequantize (line 120) | def dequantize( function int8_forward (line 138) | def int8_forward( function int8_backward (line 169) | def int8_backward(ctx, x, grad_output, quant_weight, weight_scale, quant... function fp8_forward (line 197) | def fp8_forward( function fp8_backward (line 236) | def fp8_backward(ctx, x, grad_output, quant_weight, weight_scale, quant_... class QATFunc (line 358) | class QATFunc(PyLayer): method forward (line 360) | def forward( method backward (line 420) | def backward(ctx, grad_output): FILE: paddleformers/quantization/qlora.py function qlora_weight_quantize (line 20) | def qlora_weight_quantize( function qlora_weight_dequantize (line 56) | def qlora_weight_dequantize( function qlora_weight_quantize_dequantize (line 71) | def qlora_weight_quantize_dequantize( function qlora_weight_linear (line 98) | def qlora_weight_linear( FILE: paddleformers/quantization/quantization_config.py class QuantizationConfig (line 29) | class QuantizationConfig: method __init__ (line 48) | def __init__( method fp8_format (line 168) | def fp8_format(self): method is_weight_quantize (line 171) | def is_weight_quantize(self): method is_support_merge_tensor_parallel (line 189) | def is_support_merge_tensor_parallel(self): method from_dict (line 196) | def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs): method to_json_file (line 215) | def to_json_file(self, json_file_path): method to_dict (line 222) | def to_dict(self): method __repr__ (line 225) | def __repr__(self): method to_json_string (line 228) | def to_json_string(self, use_diff=True): method to_diff_dict (line 235) | def to_diff_dict(self): FILE: paddleformers/quantization/quantization_linear.py function quant_weight_forward (line 52) | def quant_weight_forward( function dequant_weight (line 90) | def dequant_weight( class QuantizationLinearFunc (line 123) | class QuantizationLinearFunc(PyLayer): method forward (line 125) | def forward( method backward (line 161) | def backward(ctx, grad_output): function quant_weight_linear (line 203) | def quant_weight_linear( function get_activation_scale_group (line 246) | def get_activation_scale_group(is_row=False): class QuantizationLinear (line 262) | class QuantizationLinear(nn.Layer): method __init__ (line 265) | def __init__( method forward (line 387) | def forward(self, x): class FleetQuantizationLinear (line 409) | class FleetQuantizationLinear(QuantizationLinear): method __init__ (line 410) | def __init__(self, in_features, out_features, skip_bias_add, **kwargs): method forward (line 414) | def forward(self, input: paddle.Tensor): class ColumnParallelQuantizationLinear (line 422) | class ColumnParallelQuantizationLinear(nn.Layer): method __init__ (line 432) | def __init__( method forward (line 600) | def forward(self, x): method sharded_state_dict (line 638) | def sharded_state_dict( class FleetColumnParallelQuantizationLinear (line 646) | class FleetColumnParallelQuantizationLinear(ColumnParallelQuantizationLi... method __init__ (line 647) | def __init__(self, in_features, output_size_per_partition, skip_bias_a... method forward (line 651) | def forward(self, input: paddle.Tensor): class RowParallelQuantizationLinear (line 659) | class RowParallelQuantizationLinear(nn.Layer): method __init__ (line 669) | def __init__( method forward (line 838) | def forward(self, x): method sharded_state_dict (line 896) | def sharded_state_dict( class FleetRowParallelQuantizationLinear (line 904) | class FleetRowParallelQuantizationLinear(RowParallelQuantizationLinear): method __init__ (line 905) | def __init__(self, input_size_per_partition, out_features, skip_bias_a... method forward (line 909) | def forward(self, input: paddle.Tensor): FILE: paddleformers/quantization/quantization_utils.py function get_tensor_model_parallel_group (line 65) | def get_tensor_model_parallel_group(): function get_tensor_model_parallel_world_size (line 68) | def get_tensor_model_parallel_world_size(): class PaddleFleetPipelineLayer (line 71) | class PaddleFleetPipelineLayer: class FleetColumnParallelLinear (line 74) | class FleetColumnParallelLinear: class FleetRowParallelLinear (line 77) | class FleetRowParallelLinear: class FleetColumnParallelQuantizationLinear (line 80) | class FleetColumnParallelQuantizationLinear: class FleetQuantizationLinear (line 83) | class FleetQuantizationLinear: class FleetRowParallelQuantizationLinear (line 86) | class FleetRowParallelQuantizationLinear: function parse_weight_quantize_algo (line 102) | def parse_weight_quantize_algo(quantization_config, name): function replace_with_quantization_linear (line 117) | def replace_with_quantization_linear(model, quantization_config, llm_int... function convert_to_weight_quantize_state_dict (line 263) | def convert_to_weight_quantize_state_dict(state_dict, name, quantization... function convert_to_qlora_state_dict (line 300) | def convert_to_qlora_state_dict(state_dict, name, quantization_config, d... function convert_to_quantize_state_dict (line 339) | def convert_to_quantize_state_dict(state_dict, quantization_linear_list,... function convert_to_weight_quantize_dequantize_state_dict (line 364) | def convert_to_weight_quantize_dequantize_state_dict(state_dict, name, q... function convert_to_qlora_dequantize_state_dict (line 402) | def convert_to_qlora_dequantize_state_dict(state_dict, name, quantizatio... function convert_to_quantize_dequantize_state_dict (line 461) | def convert_to_quantize_dequantize_state_dict(state_dict, quantization_l... function update_loaded_state_dict_keys (line 488) | def update_loaded_state_dict_keys(state_dict, quantization_linear_list, ... FILE: paddleformers/quantization/unified_checkpoint_quantization.py function dequant_unified_optimizer (line 37) | def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict, ... function quant_unified_optimizer (line 152) | def quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stag... FILE: paddleformers/trainer/argparser.py function strtobool (line 54) | def strtobool(v): class PdArgumentParser (line 67) | class PdArgumentParser(ArgumentParser): method __init__ (line 78) | def __init__(self, dataclass_types: Union[DataClassType, Iterable[Data... method _parse_dataclass_field (line 97) | def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.... method _add_dataclass_arguments (line 177) | def _add_dataclass_arguments(self, dtype: DataClassType): method parse_args_into_dataclasses (line 197) | def parse_args_into_dataclasses( method common_parse (line 239) | def common_parse(self, args, return_remaining_strings) -> Tuple[DataCl... method read_json (line 260) | def read_json(self, json_file: str) -> list: method read_yaml (line 277) | def read_yaml(self, yaml_file: str) -> list: method parse_json_file (line 296) | def parse_json_file(self, json_file: str, return_remaining_strings=Fal... method parse_json_file_and_cmd_lines (line 304) | def parse_json_file_and_cmd_lines(self, return_remaining_strings=False... method parse_yaml_file_and_cmd_lines (line 324) | def parse_yaml_file_and_cmd_lines(self, return_remaining_strings=False... method read_python (line 344) | def read_python(self, python_file: str) -> list: method parse_python_file_and_cmd_lines (line 382) | def parse_python_file_and_cmd_lines(self, return_remaining_strings=Fal... method parse_dict (line 402) | def parse_dict(self, args: dict) -> Tuple[DataClass, ...]: FILE: paddleformers/trainer/integrations.py function is_visualdl_available (line 31) | def is_visualdl_available(): function is_tensorboardX_available (line 35) | def is_tensorboardX_available(): function is_wandb_available (line 39) | def is_wandb_available(): function is_swanlab_available (line 45) | def is_swanlab_available(): function is_ray_available (line 49) | def is_ray_available(): function get_available_reporting_integrations (line 53) | def get_available_reporting_integrations(): function rewrite_logs (line 67) | def rewrite_logs(d): class VisualDLCallback (line 83) | class VisualDLCallback(TrainerCallback): method __init__ (line 91) | def __init__(self, vdl_writer=None): method _init_summary_writer (line 106) | def _init_summary_writer(self, args, log_dir=None): method on_train_begin (line 111) | def on_train_begin(self, args, state, control, **kwargs): method on_log (line 140) | def on_log(self, args, state, control, logs=None, **kwargs): method on_train_end (line 161) | def on_train_end(self, args, state, control, **kwargs): class TensorBoardCallback (line 167) | class TensorBoardCallback(TrainerCallback): method __init__ (line 176) | def __init__(self, tb_writer=None): method _init_summary_writer (line 192) | def _init_summary_writer(self, args, log_dir=None): method on_train_begin (line 197) | def on_train_begin(self, args, state, control, **kwargs): method on_log (line 217) | def on_log(self, args, state, control, logs=None, **kwargs): method on_train_end (line 238) | def on_train_end(self, args, state, control, **kwargs): class WandbCallback (line 244) | class WandbCallback(TrainerCallback): method __init__ (line 249) | def __init__(self): method setup (line 261) | def setup(self, args, state, model, **kwargs): method on_train_begin (line 335) | def on_train_begin(self, args, state, control, model=None, **kwargs): method on_train_end (line 341) | def on_train_end(self, args, state, control, model=None, tokenizer=Non... method on_log (line 377) | def on_log(self, args, state, control, model=None, logs=None, **kwargs): method on_save (line 386) | def on_save(self, args, state, control, **kwargs): class SwanLabCallback (line 406) | class SwanLabCallback(TrainerCallback): method __init__ (line 411) | def __init__(self): method setup (line 420) | def setup(self, args, state, model, **kwargs): method on_train_begin (line 491) | def on_train_begin(self, args, state, control, model=None, **kwargs): method on_train_end (line 495) | def on_train_end(self, args, state, control, model=None, processing_cl... method on_log (line 502) | def on_log(self, args, state, control, model=None, logs=None, **kwargs): method on_save (line 521) | def on_save(self, args, state, control, **kwargs): method on_predict (line 528) | def on_predict(self, args, state, control, metrics, **kwargs): class AutoNLPCallback (line 536) | class AutoNLPCallback(TrainerCallback): method __init__ (line 541) | def __init__(self): method on_evaluate (line 550) | def on_evaluate(self, args, state, control, **kwargs): function get_reporting_integration_callbacks (line 568) | def get_reporting_integration_callbacks(report_to): FILE: paddleformers/trainer/plugins/npu_plugin.py function npu_accelerate_plugin (line 25) | def npu_accelerate_plugin(optimizer): function _optimizer_step_with_flatten_param_grads (line 36) | def _optimizer_step_with_flatten_param_grads(optimizer): function _flatten_param_grads (line 61) | def _flatten_param_grads(optimizer, params_grads): FILE: paddleformers/trainer/plugins/timer.py class _Timer (line 24) | class _Timer: method __init__ (line 27) | def __init__(self, name): method start (line 33) | def start(self): method stop (line 41) | def stop(self): method reset (line 49) | def reset(self): method elapsed (line 54) | def elapsed(self, reset=True): class RuntimeTimer (line 75) | class RuntimeTimer: method __init__ (line 78) | def __init__(self, name): method start (line 81) | def start(self, name): method stop (line 86) | def stop(self): method log (line 90) | def log(self): class Timers (line 101) | class Timers: method __init__ (line 104) | def __init__(self): method __call__ (line 107) | def __call__(self, name, use_event=False): method write (line 117) | def write(self, names, writer, iteration, normalizer=1.0, reset=True): method log (line 124) | def log(self, names, normalizer=1.0, reset=True): method info (line 145) | def info(self, names, normalizer=1.0, reset=False): function get_timers (line 158) | def get_timers(): function set_timers (line 163) | def set_timers(): function disable_timers (line 169) | def disable_timers(): FILE: paddleformers/trainer/trainer.py class Trainer (line 269) | class Trainer: method __init__ (line 328) | def __init__( method _wrap_amp_model (line 610) | def _wrap_amp_model(self, args, model): method add_callback (line 662) | def add_callback(self, callback): method pop_callback (line 673) | def pop_callback(self, callback): method remove_callback (line 686) | def remove_callback(self, callback): method _load_from_peft_checkpoint (line 696) | def _load_from_peft_checkpoint(self, resume_from_checkpoint=None): method _load_from_checkpoint (line 734) | def _load_from_checkpoint(self, resume_from_checkpoint=None): method _wrap_model_and_load_sharded_checkpoint (line 910) | def _wrap_model_and_load_sharded_checkpoint(self, resume_from_checkpoi... method _get_zcc_implementation_classes (line 926) | def _get_zcc_implementation_classes(self): method _create_zcc_manager_instance (line 932) | def _create_zcc_manager_instance(self, unwrapped_model, zcc_worker_cla... method _register_pipeline_hooks (line 952) | def _register_pipeline_hooks(self, unwrapped_model): method _setup_zcc_callback (line 966) | def _setup_zcc_callback(self, zcc_callback_class): method _handle_checkpoint_resume (line 971) | def _handle_checkpoint_resume(self, resume_from_checkpoint): method _get_ema_state_path (line 989) | def _get_ema_state_path(self, checkpoint_path): method _should_load_ema_state (line 997) | def _should_load_ema_state(self, checkpoint_path, ema_state_path): method create_zcc_manager (line 1011) | def create_zcc_manager(self, unwrapped_model, resume_from_checkpoint=N... method add_non_zcc_ema_callback (line 1038) | def add_non_zcc_ema_callback(self, resume_from_checkpoint, ema_state_a... method _save_flex_model_state (line 1052) | def _save_flex_model_state(self, output_dir): method _save_flex_optimizer_state (line 1066) | def _save_flex_optimizer_state(self, output_dir): method _load_flex_checkpoint (line 1095) | def _load_flex_checkpoint(self, resume_from_checkpoint): method prepare_resume_from_checkpoint (line 1329) | def prepare_resume_from_checkpoint(self, args, resume_from_checkpoint): method cal_epoch_step_samples (line 1363) | def cal_epoch_step_samples(self, args, train_dataloader, total_train_b... method _wrap_optimizer (line 1412) | def _wrap_optimizer(self, model): method train (line 1433) | def train( method log_trainable_numel (line 1588) | def log_trainable_numel(self, model): method _split_batches_for_accumulation (line 1616) | def _split_batches_for_accumulation(self, inputs): method optimizer_step (line 1752) | def optimizer_step(self, args, model, parameters_list=None): method _get_meshes_for_loader (line 1802) | def _get_meshes_for_loader(self): method _get_inputs_list (line 1805) | def _get_inputs_list(self, inputs): method _inner_training_loop (line 1836) | def _inner_training_loop( method _load_best_model_from_peft_checkpoint (line 2383) | def _load_best_model_from_peft_checkpoint(self): method _get_train_sampler (line 2418) | def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: method _set_state_dict_in_model (line 2444) | def _set_state_dict_in_model(self, state_dict): method _print_timer (line 2448) | def _print_timer(self): method _check_loss_valid (line 2472) | def _check_loss_valid(self, loss): method _get_item_from_loss (line 2480) | def _get_item_from_loss(self, loss): method _maybe_log_save_evaluate (line 2490) | def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_... method log_trained_tokens (line 2647) | def log_trained_tokens(self): method _get_learning_rate (line 2667) | def _get_learning_rate(self): method get_train_dataloader (line 2670) | def get_train_dataloader(self, dense_tensor_idx=None): method _get_eval_sampler (line 2747) | def _get_eval_sampler(self, eval_dataset: Dataset): method get_eval_dataloader (line 2783) | def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) ... method get_test_dataloader (line 2847) | def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: method create_optimizer_and_scheduler (line 2909) | def create_optimizer_and_scheduler(self, num_training_steps: int): method create_optimizer (line 2920) | def create_optimizer(self, lr_scheduler=None): method _apply_to_optimizer (line 2965) | def _apply_to_optimizer(self, action): method _offload_optimizer (line 2986) | def _offload_optimizer(self): method _reload_optimizer (line 2994) | def _reload_optimizer(self): method _load_rng_state (line 3000) | def _load_rng_state(self, checkpoint): method get_optimizer_cls_and_kwargs (line 3060) | def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any... method create_scheduler (line 3096) | def create_scheduler(self, num_training_steps: int): method num_examples (line 3125) | def num_examples(self, dataloader: DataLoader) -> int: method _decorate_exclude_layers (line 3139) | def _decorate_exclude_layers(self, model: nn.Layer): method _wrap_distributed_optimizer (line 3150) | def _wrap_distributed_optimizer(self, optimizer): method _wrap_model (line 3205) | def _wrap_model(self, model, training=True): method _prepare_input (line 3478) | def _prepare_input(self, data: Union[paddle.Tensor, Any]) -> Union[pad... method _prepare_inputs (line 3497) | def _prepare_inputs(self, inputs: Dict[str, Union[paddle.Tensor, Any]]... method autocast_smart_context_manager (line 3508) | def autocast_smart_context_manager(self): method compute_loss (line 3538) | def compute_loss(self, model, inputs, return_outputs=False): method _enable_delay_scale_loss (line 3593) | def _enable_delay_scale_loss(self): method training_step (line 3604) | def training_step( method training_pipeline_step (line 3662) | def training_pipeline_step(self, model: nn.Layer, inputs: Dict[str, Un... method save_model (line 3718) | def save_model( method copy_custom_files (line 3771) | def copy_custom_files(self, output_dir): method _filter_moe_no_sync_optimizer_params (line 3795) | def _filter_moe_no_sync_optimizer_params(self): method _ordered_save (line 3815) | def _ordered_save(self, state_dict, save_path, signal_path=None): method _save_checkpoint (line 3839) | def _save_checkpoint(self, model, metrics=None): method set_optimizer_grouped_parameters (line 4080) | def set_optimizer_grouped_parameters(self, optimizer_grouped_parameter... method disable_autocast_context_manager (line 4088) | def disable_autocast_context_manager(self): method _sorted_checkpoints (line 4097) | def _sorted_checkpoints( method _rotate_checkpoints (line 4121) | def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None: method _rotate_hf_checkpoints (line 4147) | def _rotate_hf_checkpoints(self, use_mtime=False, output_dir=None) -> ... method _save (line 4173) | def _save( method _load_scheduler (line 4380) | def _load_scheduler(self, checkpoint): method _load_optimizer_and_scheduler (line 4398) | def _load_optimizer_and_scheduler(self, checkpoint): method log (line 4481) | def log(self, logs: Dict[str, float], **kwargs) -> None: method evaluate (line 4517) | def evaluate( method evaluation_loop (line 4582) | def evaluation_loop( method predict (line 4800) | def predict( method prediction_pipeline_step (line 4857) | def prediction_pipeline_step( method prediction_step (line 4925) | def prediction_step( method is_local_process_zero (line 5012) | def is_local_process_zero(self) -> bool: method is_world_process_zero (line 5019) | def is_world_process_zero(self) -> bool: method _nested_gather (line 5026) | def _nested_gather(self, tensors): method _pad_across_processes (line 5046) | def _pad_across_processes(self, tensor, pad_index=-100): method _set_signature_columns_if_needed (line 5080) | def _set_signature_columns_if_needed(self): method _remove_unused_columns (line 5088) | def _remove_unused_columns(self, dataset: "datasets.Dataset", descript... method _get_collator_with_removed_columns (line 5118) | def _get_collator_with_removed_columns( method _is_iterable_dataset (line 5136) | def _is_iterable_dataset(self, dataset): method _is_iterable_dataset_distributed (line 5139) | def _is_iterable_dataset_distributed(self, dataset): method print_config (line 5148) | def print_config(self, args=None, key=""): method is_unified_checkpoint (line 5174) | def is_unified_checkpoint(self, resume_from_checkpoint, safe_serializa... FILE: paddleformers/trainer/trainer_callback.py class TrainerState (line 75) | class TrainerState: method __post_init__ (line 128) | def __post_init__(self): method save_to_json (line 132) | def save_to_json(self, json_path: str): method load_from_json (line 139) | def load_from_json(cls, json_path: str): class TrainerControl (line 147) | class TrainerControl: method _new_training (line 182) | def _new_training(self): method _new_epoch (line 186) | def _new_epoch(self): method _new_step (line 190) | def _new_step(self): class TrainerCallback (line 198) | class TrainerCallback: method on_init_end (line 248) | def on_init_end(self, args: TrainingArguments, state: TrainerState, co... method on_train_begin (line 254) | def on_train_begin(self, args: TrainingArguments, state: TrainerState,... method on_train_end (line 260) | def on_train_end(self, args: TrainingArguments, state: TrainerState, c... method on_epoch_begin (line 266) | def on_epoch_begin(self, args: TrainingArguments, state: TrainerState,... method on_epoch_end (line 272) | def on_epoch_end(self, args: TrainingArguments, state: TrainerState, c... method on_step_begin (line 278) | def on_step_begin(self, args: TrainingArguments, state: TrainerState, ... method on_load_data_end (line 285) | def on_load_data_end(self, args: TrainingArguments, state: TrainerStat... method on_optimizer_begin (line 288) | def on_optimizer_begin(self, args: TrainingArguments, state: TrainerSt... method on_optimizer_end (line 291) | def on_optimizer_end(self, args: TrainingArguments, state: TrainerStat... method on_substep_end (line 294) | def on_substep_end(self, args: TrainingArguments, state: TrainerState,... method on_step_end (line 300) | def on_step_end(self, args: TrainingArguments, state: TrainerState, co... method on_evaluate (line 307) | def on_evaluate(self, args: TrainingArguments, state: TrainerState, co... method on_save (line 313) | def on_save(self, args: TrainingArguments, state: TrainerState, contro... method on_log (line 319) | def on_log(self, args: TrainingArguments, state: TrainerState, control... method on_prediction_step (line 325) | def on_prediction_step(self, args: TrainingArguments, state: TrainerSt... method on_save_hf (line 331) | def on_save_hf(self, args: TrainingArguments, state: TrainerState, con... class CallbackHandler (line 338) | class CallbackHandler(TrainerCallback): method __init__ (line 341) | def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler): method add_callback (line 360) | def add_callback(self, callback): method pop_callback (line 371) | def pop_callback(self, callback): method remove_callback (line 383) | def remove_callback(self, callback): method callback_list (line 393) | def callback_list(self): method on_init_end (line 396) | def on_init_end(self, args: TrainingArguments, state: TrainerState, co... method on_train_begin (line 399) | def on_train_begin(self, args: TrainingArguments, state: TrainerState,... method on_train_end (line 403) | def on_train_end(self, args: TrainingArguments, state: TrainerState, c... method on_epoch_begin (line 406) | def on_epoch_begin(self, args: TrainingArguments, state: TrainerState,... method on_epoch_end (line 410) | def on_epoch_end(self, args: TrainingArguments, state: TrainerState, c... method on_step_begin (line 413) | def on_step_begin(self, args: TrainingArguments, state: TrainerState, ... method on_load_data_end (line 420) | def on_load_data_end(self, args: TrainingArguments, state: TrainerStat... method on_optimizer_begin (line 423) | def on_optimizer_begin(self, args: TrainingArguments, state: TrainerSt... method on_optimizer_end (line 426) | def on_optimizer_end(self, args: TrainingArguments, state: TrainerStat... method on_substep_end (line 429) | def on_substep_end(self, args: TrainingArguments, state: TrainerState,... method on_step_end (line 432) | def on_step_end(self, args: TrainingArguments, state: TrainerState, co... method on_evaluate (line 435) | def on_evaluate(self, args: TrainingArguments, state: TrainerState, co... method on_save (line 439) | def on_save(self, args: TrainingArguments, state: TrainerState, contro... method on_save_hf (line 443) | def on_save_hf(self, args: TrainingArguments, state: TrainerState, con... method on_log (line 447) | def on_log(self, args: TrainingArguments, state: TrainerState, control... method on_prediction_step (line 451) | def on_prediction_step(self, args: TrainingArguments, state: TrainerSt... method call_event (line 454) | def call_event(self, event, args, state, control, **kwargs): class DefaultFlowCallback (line 474) | class DefaultFlowCallback(TrainerCallback): method on_step_end (line 479) | def on_step_end(self, args: TrainingArguments, state: TrainerState, co... method on_epoch_end (line 522) | def on_epoch_end(self, args: TrainingArguments, state: TrainerState, c... class ProgressCallback (line 538) | class ProgressCallback(TrainerCallback): method __init__ (line 543) | def __init__(self): method on_train_begin (line 547) | def on_train_begin(self, args, state, control, **kwargs): method on_step_end (line 552) | def on_step_end(self, args, state, control, **kwargs): method on_prediction_step (line 557) | def on_prediction_step(self, args, state, control, eval_dataloader=Non... method on_evaluate (line 565) | def on_evaluate(self, args, state, control, **kwargs): method on_log (line 571) | def on_log(self, args, state, control, logs=None, **kwargs): method on_train_end (line 580) | def on_train_end(self, args, state, control, **kwargs): class PrinterCallback (line 589) | class PrinterCallback(TrainerCallback): method on_log (line 594) | def on_log(self, args, state, control, logs=None, **kwargs): class EarlyStoppingCallback (line 606) | class EarlyStoppingCallback(TrainerCallback): method __init__ (line 622) | def __init__(self, early_stopping_patience: int = 1, early_stopping_th... method check_metric_value (line 628) | def check_metric_value(self, args, state, control, metric_value): method on_train_begin (line 639) | def on_train_begin(self, args, state, control, **kwargs): method on_evaluate (line 648) | def on_evaluate(self, args, state, control, metrics, **kwargs): class StepFlexToken (line 665) | class StepFlexToken(TrainerCallback): method on_step_begin (line 666) | def on_step_begin( function enable_in_dict_config (line 681) | def enable_in_dict_config(config, key): class FP8QuantWeightCallback (line 689) | class FP8QuantWeightCallback(TrainerCallback): method on_step_begin (line 694) | def on_step_begin(self, args, state, control, **kwargs): method on_optimizer_begin (line 733) | def on_optimizer_begin(self, args, state, control, **kwargs): class MoECorrectionBiasAdjustCallback (line 751) | class MoECorrectionBiasAdjustCallback(TrainerCallback): method __init__ (line 756) | def __init__(self, lr=0.001, use_mp=False): method on_optimizer_end (line 761) | def on_optimizer_end(self, args, state, control, **kwargs): class MoeExpertsGradScaleCallback (line 812) | class MoeExpertsGradScaleCallback(TrainerCallback): method __init__ (line 817) | def __init__(self, args): method on_optimizer_begin (line 832) | def on_optimizer_begin(self, args, state, control, **kwargs): class MoEGateSpGradSyncCallBack (line 837) | class MoEGateSpGradSyncCallBack(TrainerCallback): method __init__ (line 844) | def __init__(self): method on_optimizer_begin (line 847) | def on_optimizer_begin(self, args, state, control, **kwargs): class SPGradSyncCallback (line 865) | class SPGradSyncCallback(TrainerCallback): method __init__ (line 872) | def __init__(self, model): method on_optimizer_begin (line 885) | def on_optimizer_begin(self, args, state, control, **kwargs): class EMAStateAssemblerCallback (line 895) | class EMAStateAssemblerCallback(TrainerCallback): method __init__ (line 896) | def __init__(self, ema_state_assembler): method on_step_end (line 899) | def on_step_end(self, args, state, control, **kwargs): class InterleaveGateUpCallback (line 906) | class InterleaveGateUpCallback(TrainerCallback): method __init__ (line 907) | def __init__(self, model, resume_from_checkpoint=None, output_dir=None): method interleave_gate_up_proj (line 912) | def interleave_gate_up_proj(self, w): method on_train_begin (line 920) | def on_train_begin(self, args, state, control, **kwargs): FILE: paddleformers/trainer/trainer_utils.py function mock_offload_optimizer (line 99) | def mock_offload_optimizer(): function log_trainer_start (line 112) | def log_trainer_start(): class Topology (line 122) | class Topology: method __init__ (line 123) | def __init__( method __repr__ (line 183) | def __repr__(self): function _get_distributed_seeds (line 187) | def _get_distributed_seeds(seed: int = 1234, topo: Topology = None): function set_seed (line 298) | def set_seed(seed: int = 1234, topo=None): function set_random_seed (line 318) | def set_random_seed( function _switch_mode (line 360) | def _switch_mode(mode="dynamic"): function _exec_mode_guard (line 369) | def _exec_mode_guard(mode="dynamic"): class ExplicitEnum (line 378) | class ExplicitEnum(Enum): method _missing_ (line 384) | def _missing_(cls, value): class EvalPrediction (line 390) | class EvalPrediction(NamedTuple): class EvalLoopOutput (line 403) | class EvalLoopOutput(NamedTuple): class PredictionOutput (line 410) | class PredictionOutput(NamedTuple): class TrainOutput (line 416) | class TrainOutput(NamedTuple): function _check_checkpoint_files (line 422) | def _check_checkpoint_files( function get_last_checkpoint (line 440) | def get_last_checkpoint(folder, signal_folder=None, uc_async_save=False): class IntervalStrategy (line 480) | class IntervalStrategy(ExplicitEnum): class EvaluationStrategy (line 486) | class EvaluationStrategy(ExplicitEnum): class OptimizerNames (line 492) | class OptimizerNames(ExplicitEnum): class ShardingOption (line 503) | class ShardingOption(ExplicitEnum): function is_main_process (line 519) | def is_main_process(local_rank): function total_processes_number (line 528) | def total_processes_number(local_rank): function speed_metrics (line 539) | def speed_metrics(split, start_time, num_samples=None, num_steps=None, s... class SchedulerType (line 571) | class SchedulerType(ExplicitEnum): function get_constant_schedule (line 579) | def get_constant_schedule(learning_rate: float, last_epoch: int = -1): function get_constant_schedule_with_warmup (line 593) | def get_constant_schedule_with_warmup(learning_rate: float, num_warmup_s... function get_linear_schedule_with_warmup (line 616) | def get_linear_schedule_with_warmup(learning_rate: float, num_warmup_ste... function get_cosine_schedule_with_warmup (line 643) | def get_cosine_schedule_with_warmup( function get_polynomial_decay_schedule_with_warmup (line 681) | def get_polynomial_decay_schedule_with_warmup( function get_scheduler (line 741) | def get_scheduler( function _secs2timedelta (line 814) | def _secs2timedelta(secs): function metrics_format (line 823) | def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]: function log_metrics (line 847) | def log_metrics(self, split, metrics): function save_metrics (line 868) | def save_metrics(self, split, metrics, combined=True): function save_state (line 902) | def save_state(self): function has_length (line 914) | def has_length(dataset): class TrainerMemoryTracker (line 925) | class TrainerMemoryTracker: method __init__ (line 957) | def __init__(self, skip_memory_metrics=False): method derive_stage (line 984) | def derive_stage(self): method cpu_mem_used (line 994) | def cpu_mem_used(self): method peak_monitor_func (line 998) | def peak_monitor_func(self): method start (line 1010) | def start(self): method stop (line 1040) | def stop(self, stage): method update_metrics (line 1084) | def update_metrics(self, stage, metrics): method stop_and_update_metrics (line 1129) | def stop_and_update_metrics(self, metrics=None): class IterableDatasetShard (line 1142) | class IterableDatasetShard(IterableDataset): method __init__ (line 1169) | def __init__( method set_epoch (line 1187) | def set_epoch(self, epoch): method __iter__ (line 1192) | def __iter__(self): method __len__ (line 1227) | def __len__(self): class LastBatchPaddingSampler (line 1235) | class LastBatchPaddingSampler(paddle.io.DistributedBatchSampler): method __iter__ (line 1238) | def __iter__(self): function find_batch_size (line 1295) | def find_batch_size(tensors): class RemoveColumnsCollator (line 1315) | class RemoveColumnsCollator: method __init__ (line 1318) | def __init__( method _remove_columns (line 1333) | def _remove_columns(self, feature: dict) -> dict: method __call__ (line 1349) | def __call__(self, features: List[dict]): function set_hyrbid_parallel_seed (line 1354) | def set_hyrbid_parallel_seed(basic_seed, dataset_rank, tp_rank, pp_rank=0): function should_skip_data (line 1373) | def should_skip_data(global_step, skip_data_intervals): function split_parallel_config (line 1390) | def split_parallel_config(parallel_config): function download_recovery_ckpt_from_pdc (line 1398) | def download_recovery_ckpt_from_pdc(recovery_checkpoint_path, timeout): function _insert_sync (line 1438) | def _insert_sync(self, sync_var, src, mp_group, sync_mode): function init_optimizer (line 1473) | def init_optimizer(optimizer, model_sharded_state_dict, state_dict_metad... function parse_nccl_config_file (line 1549) | def parse_nccl_config_file(config_dir): function init_nccl_config (line 1606) | def init_nccl_config(nccl_comm_group_config, strategy): class HFFormatFullParamSaver (line 1638) | class HFFormatFullParamSaver: method __init__ (line 1639) | def __init__( method get_full_param_iter (line 1660) | def get_full_param_iter(self): method determin_saver_based_group (line 1680) | def determin_saver_based_group(self): method save_checkpoint (line 1692) | def save_checkpoint(self, path, max_shard_size="16GB"): function recover_params_from_master_weight (line 1715) | def recover_params_from_master_weight(ema_state_dict, model, optimizer, ... class EMAStateAssembler (line 1765) | class EMAStateAssembler: method __init__ (line 1766) | def __init__( method run (line 1831) | def run(self): method _update_expected_next_save_ckpt_step (line 1903) | def _update_expected_next_save_ckpt_step(self): method _set_latest_processed_checkpoint_step (line 1909) | def _set_latest_processed_checkpoint_step(self, start_step): method _find_checkpoint (line 1914) | def _find_checkpoint(self, mode: str = "next") -> Tuple[Optional[int],... method _is_already_handled (line 1940) | def _is_already_handled(self, checkpoint_dir: Path) -> bool: method _check_all_ranks_saved (line 1944) | def _check_all_ranks_saved(self, checkpoint_dir: Path) -> bool: method _mark_as_handled (line 1955) | def _mark_as_handled(self, checkpoint_dir: Path, step: int): method _handle_checkpoint_with_ema (line 1969) | def _handle_checkpoint_with_ema(self, step: int, checkpoint_dir: Path): method _handle_naive_checkpoint (line 1991) | def _handle_naive_checkpoint(self, step: int, checkpoint_dir: Path): method _get_ema_state_path (line 2004) | def _get_ema_state_path(self, checkpoint_dir: Path) -> Path: method _load_ema_state_dict (line 2012) | def _load_ema_state_dict(self, ema_state_path: Path): method _build_ema_sharded_state_dict (line 2020) | def _build_ema_sharded_state_dict(self, ema_state_dict): method _save_full_ema_states (line 2086) | def _save_full_ema_states(self, step, ema_sharded_state_dict): function select_flex_ckpt_comm_method (line 2104) | def select_flex_ckpt_comm_method(): FILE: paddleformers/trainer/training_args.py function get_tensor_model_parallel_group (line 54) | def get_tensor_model_parallel_group(*args, **kwargs): function initialize_fleet (line 57) | def initialize_fleet(*args, **kwargs): function default_logdir (line 67) | def default_logdir() -> str: class TrainingArguments (line 79) | class TrainingArguments: method __post_init__ (line 1576) | def __post_init__(self): method _post_init_parallel_degree (line 2583) | def _post_init_parallel_degree(self): method _post_init_save_checkpoint_format (line 2713) | def _post_init_save_checkpoint_format(self): method _post_init_load_checkpoint_format (line 2725) | def _post_init_load_checkpoint_format(self): method add_moe_comm_group (line 2737) | def add_moe_comm_group(self): method __str__ (line 2746) | def __str__(self): method train_batch_size (line 2756) | def train_batch_size(self) -> int: method eval_batch_size (line 2764) | def eval_batch_size(self) -> int: method current_device (line 2772) | def current_device(self) -> "paddle.device": method world_size (line 2779) | def world_size(self): method data_parallel_rank (line 2788) | def data_parallel_rank(self): method cp_sharding_degree (line 2802) | def cp_sharding_degree(self): method cp_sharding_rank (line 2823) | def cp_sharding_rank(self): method dataset_rank (line 2836) | def dataset_rank(self): method dataset_world_size (line 2851) | def dataset_world_size(self): method sharding_parallel_rank (line 2868) | def sharding_parallel_rank(self): method tensor_parallel_rank (line 2877) | def tensor_parallel_rank(self): method pipeline_parallel_rank (line 2889) | def pipeline_parallel_rank(self): method expert_parallel_rank (line 2901) | def expert_parallel_rank(self): method moe_sharding_parallel_rank (line 2912) | def moe_sharding_parallel_rank(self): method context_parallel_rank (line 2923) | def context_parallel_rank(self): method _format_name (line 2933) | def _format_name(self, prefix, rank, degree): method optimizer_name_suffix (line 2938) | def optimizer_name_suffix(self): method weight_name_suffix (line 2956) | def weight_name_suffix(self): method sharded_name_suffix (line 2976) | def sharded_name_suffix(self, shard_id=None, pp_id=None, moe_id=None, ... method process_index (line 3007) | def process_index(self): method logical_process_index (line 3016) | def logical_process_index(self): method local_process_index (line 3038) | def local_process_index(self): method should_log (line 3047) | def should_log(self): method should_save (line 3059) | def should_save(self): method should_save_model_state (line 3077) | def should_save_model_state(self): method _no_sync_in_gradient_accumulation (line 3105) | def _no_sync_in_gradient_accumulation(self): method should_save_sharding_stage1_model (line 3112) | def should_save_sharding_stage1_model(self): method should_load_sharding_stage1_model (line 3122) | def should_load_sharding_stage1_model(self): method should_load_dataset (line 3128) | def should_load_dataset(self): method get_auto_dist_flag (line 3137) | def get_auto_dist_flag(self): method main_process_first (line 3152) | def main_process_first(self, local=True, desc="work"): method get_warmup_steps (line 3194) | def get_warmup_steps(self, num_training_steps: int): method to_dict (line 3203) | def to_dict(self): method to_json_string (line 3218) | def to_json_string(self): method to_sanitized_dict (line 3224) | def to_sanitized_dict(self) -> Dict[str, Any]: method print_config (line 3236) | def print_config(self, args=None, key=""): method should_save_model_with_tensor_fusion (line 3260) | def should_save_model_with_tensor_fusion(self): FILE: paddleformers/trainer/unified_checkpoint/async_handler.py class AsyncCheckpointHandler (line 41) | class AsyncCheckpointHandler: method __init__ (line 42) | def __init__(self, args): method _file_save_async_or_sync (line 73) | def _file_save_async_or_sync( method _save_file_async_in_process (line 193) | def _save_file_async_in_process( method _reset_and_update (line 232) | def _reset_and_update(self, shared_array, new_value): method unlink_shared_memory (line 240) | def unlink_shared_memory(self): FILE: paddleformers/trainer/unified_checkpoint/check_completion.py function check_unified_checkpoint (line 42) | def check_unified_checkpoint(args, model, resume_from_checkpoint, safe_s... function check_unified_optimizer (line 106) | def check_unified_optimizer(args, model, optimizer, resume_from_checkpoi... FILE: paddleformers/trainer/unified_checkpoint/load_dynamic.py function create_send_table (line 56) | def create_send_table(file_keyname_mappings, file_machine_mappings): function create_dispatch_table (line 75) | def create_dispatch_table(args, model, file_keyname_mappings, file_machi... function create_optimizer_dispatch_table (line 113) | def create_optimizer_dispatch_table( function get_file_mappings (line 172) | def get_file_mappings(index, resume_from_checkpoint): function distributed_send_recv (line 200) | def distributed_send_recv( function load_unified_checkpoint_dynamically (line 258) | def load_unified_checkpoint_dynamically(args, model, resume_from_checkpo... function load_unified_optimizer_dynamically (line 317) | def load_unified_optimizer_dynamically(args, model, optimizer, resume_fr... FILE: paddleformers/trainer/unified_checkpoint/load_local.py function load_unified_checkpoint_locally (line 53) | def load_unified_checkpoint_locally( function load_unified_optimizer_locally (line 158) | def load_unified_optimizer_locally(args, model, optimizer, resume_from_c... FILE: paddleformers/trainer/unified_checkpoint/load_save_single_card.py function save_file_sync (line 62) | def save_file_sync(state_dict, path, save_to_hf=False): function save_single_card_checkpoint (line 67) | def save_single_card_checkpoint(model_to_save, output_dir, save_to_hf=Fa... function save_single_card_optimizer (line 105) | def save_single_card_optimizer(model, optimizer, output_dir): function load_single_card_checkpoint (line 170) | def load_single_card_checkpoint(model, resume_from_checkpoint: str, conv... function load_single_card_optimizer (line 204) | def load_single_card_optimizer(model, optimizer, resume_from_checkpoint:... FILE: paddleformers/trainer/unified_checkpoint/sharding_split_param_utils.py function merge_splited_param (line 49) | def merge_splited_param( function gather_splited_param_for_optimizer (line 117) | def gather_splited_param_for_optimizer(optimizer, ckpt_quant_stage="O0"): function get_params_info (line 182) | def get_params_info(comm_buffer_list): function reshape_params (line 202) | def reshape_params(state_dict, struct2static_name_mappings, param_shape_... function load_unified_optimizer_split_param (line 225) | def load_unified_optimizer_split_param(args, model, optimizer, resume_fr... function load_non_merge_optimizer_with_split_param (line 364) | def load_non_merge_optimizer_with_split_param(args, model, optimizer, re... FILE: paddleformers/trainer/unified_checkpoint/shared_memory_utils.py class TensorMeta (line 26) | class TensorMeta: function _write_shared_memory (line 49) | def _write_shared_memory(value: paddle.Tensor, meta: TensorMeta, buffer): function _traverse_copy_to_shm (line 63) | def _traverse_copy_to_shm(value, meta, buffer): function _read_ndarray_from_buf (line 86) | def _read_ndarray_from_buf(value, shm_tensor_buffer): function _read_state_dict_from_shm (line 105) | def _read_state_dict_from_shm(meta_dict, tensor_shm): function _traverse_state_dict (line 113) | def _traverse_state_dict(value, visitor): function create_meta_dict (line 131) | def create_meta_dict(state_dict): FILE: paddleformers/trainer/unified_checkpoint/unified_checkpoint.py class UnifiedCheckpointHandler (line 92) | class UnifiedCheckpointHandler: method __init__ (line 93) | def __init__(self, args): method save_unified_checkpoint (line 97) | def save_unified_checkpoint(self, model, optimizer, output_dir, signal... method load_unified_checkpoint (line 182) | def load_unified_checkpoint(self, model, resume_from_checkpoint: str, ... method save_non_merge_optimizer (line 208) | def save_non_merge_optimizer( method load_non_merge_optimizer (line 326) | def load_non_merge_optimizer(self, model, optimizer, resume_from_check... method save_unified_optimizer (line 422) | def save_unified_optimizer(self, model, optimizer, output_dir, signal_... method load_unified_optimizer (line 523) | def load_unified_optimizer(self, model, optimizer, resume_from_checkpo... method unlink_shared_memory (line 593) | def unlink_shared_memory(self): function unified_checkpoint_into_shards (line 597) | def unified_checkpoint_into_shards( function unified_optimizer_into_shards (line 680) | def unified_optimizer_into_shards( FILE: paddleformers/trainer/unified_checkpoint/utils.py class UnifiedCheckpointOption (line 70) | class UnifiedCheckpointOption(ExplicitEnum): function unwrap_optimizer (line 87) | def unwrap_optimizer(optimizer): function is_need_master_weight (line 96) | def is_need_master_weight(optimizer, is_fp16_or_bp16): function update_master_weight_status (line 104) | def update_master_weight_status(args, optimizer, has_master_weight, safe... function reduce_master_weights_status (line 142) | def reduce_master_weights_status(has_master_weights=False): function select_model_weight_index (line 163) | def select_model_weight_index(model, resume_from_checkpoint, safe_serial... function mapping_optimizer_tp_actions (line 189) | def mapping_optimizer_tp_actions(tp_actions, optimizer_loaded_keys): function get_expected_state_dict (line 208) | def get_expected_state_dict(model_to_save, **kwargs): function get_expected_keys (line 224) | def get_expected_keys(args, sharded_metadata, model, optimizer, is_maste... function get_optimizer_shard_files (line 264) | def get_optimizer_shard_files(optimizer_path, index_filename): function generate_base_static_name (line 301) | def generate_base_static_name(vname): function merge_large_tensor_parallel (line 318) | def merge_large_tensor_parallel(tensor, tp_group, tp_action, dst_rank, i... function merge_tensor_parallel_with_shard (line 357) | def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_... function merge_tensor_parallel_for_optimizer (line 412) | def merge_tensor_parallel_for_optimizer(state_dict, model_state_dict, tp... function filter_params (line 464) | def filter_params(model_to_save, state_dict, args, is_optimizer=False): function get_sharded_file_name (line 569) | def get_sharded_file_name(args, file_name, is_optimizer=False): function get_sharded_index (line 607) | def get_sharded_index( function gather_sharded_object (line 630) | def gather_sharded_object( function rename_shard_file (line 692) | def rename_shard_file(args, shard_file, file_name): function is_sharding_split_param_mode (line 733) | def is_sharding_split_param_mode(args): function save_model_config (line 737) | def save_model_config(model_to_save, save_directory, save_to_hf=False): function filter_sync_parameters (line 771) | def filter_sync_parameters( FILE: paddleformers/trainer/utils/async_save.py function _save_optimizer (line 26) | def _save_optimizer(obj, name_mapping, path, saved_signal_path, protocol): class AsyncSaver (line 47) | class AsyncSaver: method __init__ (line 48) | def __init__(self): method run (line 57) | def run(self, optimizer_state_dict, path, saved_signal_path, protocol=4): method _wait_for_previous_result (line 71) | def _wait_for_previous_result(self): method _reset_state (line 100) | def _reset_state(self, path, saved_signal_path, protocol): method _process_optimizer_state_dict (line 107) | def _process_optimizer_state_dict(self, optimizer_state_dict): method shutdown (line 121) | def shutdown(self): method __del__ (line 125) | def __del__(self): FILE: paddleformers/trainer/utils/ckpt_converter.py class CheckpointConverter (line 43) | class CheckpointConverter: method __init__ (line 44) | def __init__( method load_from_hybrid_parallel_checkpoint (line 116) | def load_from_hybrid_parallel_checkpoint(self): method gen_metadata_and_prepare_source_state_dict (line 183) | def gen_metadata_and_prepare_source_state_dict(self): method rename_local_view_state_dict (line 518) | def rename_local_view_state_dict(self, state_dict, file_name): method load_state_dict_and_rename (line 558) | def load_state_dict_and_rename(self): method get_sharded_tensor_infos (line 825) | def get_sharded_tensor_infos(self, file, state_dict, cur_rank_sharded_... method gen_metadata_for_tp_sharded_tensor (line 847) | def gen_metadata_for_tp_sharded_tensor(self): method rename_using_model_meta (line 897) | def rename_using_model_meta(self, file_name): method rename_auto_parallel_state_dict (line 926) | def rename_auto_parallel_state_dict(self): method rename_using_parameter_to_structured_name_mapping (line 935) | def rename_using_parameter_to_structured_name_mapping(self, state_dict... method rename_using_optimizer_state_order (line 969) | def rename_using_optimizer_state_order(self, model_state_keys, optimiz... method partition_parameters (line 1018) | def partition_parameters(self, model_state_shapes, is_sort, shard_num): method get_is_model_meta_exists_flag (line 1042) | def get_is_model_meta_exists_flag(self): method get_is_model_state_stored_flag (line 1048) | def get_is_model_state_stored_flag(self): method flatten_state_dict (line 1064) | def flatten_state_dict(self, state_dict): method gather_global_object (line 1073) | def gather_global_object(self, cur_rank_object): method get_local_checkpoint_file_names (line 1094) | def get_local_checkpoint_file_names(self): method get_distribution_rank_from_file_name (line 1110) | def get_distribution_rank_from_file_name(self, file_name): method initial_distributed_configuration (line 1128) | def initial_distributed_configuration(self): method infer_sharding_stage1_v (line 1145) | def infer_sharding_stage1_v(self): method infer_is_sharding_stage3 (line 1160) | def infer_is_sharding_stage3(self): method get_model_state_file_from (line 1188) | def get_model_state_file_from(self, optimizer_state_file_name): method optimizer_key_to_model_state_key (line 1196) | def optimizer_key_to_model_state_key(self, optimizer_key): method print_checkpoint_file_info (line 1205) | def print_checkpoint_file_info(self, flags): FILE: paddleformers/trainer/utils/doc.py function add_start_docstrings (line 19) | def add_start_docstrings(*docstr): function add_start_docstrings_to_model_forward (line 27) | def add_start_docstrings_to_model_forward(*docstr): function add_end_docstrings (line 49) | def add_end_docstrings(*docstr): FILE: paddleformers/trainer/utils/helper.py function distributed_concat (line 46) | def distributed_concat(tensor: Any, num_total_examples: Optional[int] = ... function paddle_pad_and_concatenate (line 63) | def paddle_pad_and_concatenate(tensor1, tensor2, padding_index=-100): function numpy_pad_and_concatenate (line 83) | def numpy_pad_and_concatenate(array1, array2, padding_index=-100): function nested_concat (line 98) | def nested_concat(tensors, new_tensors, padding_index=-100): function nested_detach (line 116) | def nested_detach(tensors): function nested_numpify (line 123) | def nested_numpify(tensors): function nested_truncate (line 133) | def nested_truncate(tensors, limit): function distributed_isfile (line 140) | def distributed_isfile(filename): function distributed_file (line 155) | def distributed_file(filename): function broadcast_dp_optimizer (line 188) | def broadcast_dp_optimizer(state_dict): function broadcast_moe_optimizer (line 233) | def broadcast_moe_optimizer(state_dict, model_state_dict=None, broadcast... function broadcast_dataset_rank0_model (line 315) | def broadcast_dataset_rank0_model(model): FILE: paddleformers/trainer/utils/offload_optimizer.py function offload (line 25) | def offload(tensor): function reload (line 37) | def reload(tensor): function hack_offload_optimizer (line 42) | def hack_offload_optimizer(mode=None): function hack_offload_optimizer_eb5 (line 96) | def hack_offload_optimizer_eb5(): FILE: paddleformers/trainer/utils/reshard/common.py function is_sharding_opt (line 34) | def is_sharding_opt(optimizer): function get_sharding_strategy (line 51) | def get_sharding_strategy(optimizer): function convert_opt_name_to_tname (line 59) | def convert_opt_name_to_tname(tensor_names, opt_names): class NodeModelState (line 102) | class NodeModelState: method __init__ (line 103) | def __init__(self, group): method group (line 111) | def group(self): method _add_kv (line 114) | def _add_kv(self, d, k, v): method model_weights (line 119) | def model_weights(self): method add_weight (line 122) | def add_weight(self, k, v): method add_weights (line 125) | def add_weights(self, model_state_dict, rank=None): method set_weights (line 131) | def set_weights(self, model_state_dict): method set_opt_state (line 134) | def set_opt_state(self, opt_state_dict): method set_master_weights (line 137) | def set_master_weights(self, master_weights): method opt_state (line 141) | def opt_state(self): method add_opt (line 144) | def add_opt(self, k, v): method add_opts (line 147) | def add_opts(self, opts, rank=None): method master_weights (line 164) | def master_weights(self): method add_master_weight (line 167) | def add_master_weight(self, k, v): method add_master_weights (line 170) | def add_master_weights(self, master, rank=None): method lr_scheduler (line 177) | def lr_scheduler(self): method set_lr_scheduler (line 180) | def set_lr_scheduler(self, lr_scheduler): method map_names (line 184) | def map_names(self, map_func): method drop_rank (line 222) | def drop_rank(self): method collapse_key (line 244) | def collapse_key(self): method flatten_key (line 272) | def flatten_key(self): method pack_keys (line 293) | def pack_keys(self, structure_name_mapping=None): method unpack_keys (line 341) | def unpack_keys(self): method split_state (line 378) | def split_state(self, split_func): method even_distribute (line 403) | def even_distribute(self): method reshard (line 452) | def reshard(self, filter_func): method split_items (line 464) | def split_items(self, split_func): method merge_items (line 485) | def merge_items(self, merge_func): method merge_from (line 508) | def merge_from(self, other, rank=None): method get_opt_state_dict (line 517) | def get_opt_state_dict(self): function split_model_state (line 527) | def split_model_state(model_state, group_getter): function merge_model_state (line 537) | def merge_model_state(model_state_map): function split_opt_state (line 544) | def split_opt_state(opt_state, group_getter): function merge_opt_state (line 565) | def merge_opt_state(opt_state_map): function split_structure_name_mapping (line 579) | def split_structure_name_mapping(structure_name_mapping, group_getter): function all_gather_simple_object (line 589) | def all_gather_simple_object(obj, group): function all_gather_state_dict (line 597) | def all_gather_state_dict(state_dict, filter_func, group): function _all_gather_state_dict (line 646) | def _all_gather_state_dict(state_dict, filter_func, group): function get_moe_sharding_group (line 658) | def get_moe_sharding_group(hcg=None): function get_param_sharding_group (line 667) | def get_param_sharding_group(param, hcg=None): FILE: paddleformers/trainer/utils/reshard/pp_reshard.py function regitser_extract_layer_name_func (line 23) | def regitser_extract_layer_name_func(func): function get_extract_layer_name_func (line 28) | def get_extract_layer_name_func(): function register_index_layer_func (line 37) | def register_index_layer_func(func): function get_index_layer_func (line 42) | def get_index_layer_func(): function register_sname_to_tname_func (line 51) | def register_sname_to_tname_func(func): function has_register_sname_to_tname_func (line 56) | def has_register_sname_to_tname_func(): function get_sname_to_tname_func (line 61) | def get_sname_to_tname_func(): class LayerNameScope (line 67) | class LayerNameScope: method __init__ (line 74) | def __init__(self, prefix, template): method get_layer_prefix (line 83) | def get_layer_prefix(cls, old_layer_name): method register_layer_prefix (line 90) | def register_layer_prefix(cls, prefix): method get_next_scope (line 95) | def get_next_scope(self, layer_id, old_layer_name): method get_layer_name (line 103) | def get_layer_name(self): method get_sub_scope (line 111) | def get_sub_scope(self, sub_layer_name): function register_layername_prefix (line 123) | def register_layername_prefix(layer_name): function extract_param_names_groupby_layer (line 127) | def extract_param_names_groupby_layer( function build_pipeline_context (line 150) | def build_pipeline_context(meta, pp_model): class LayerReNamingManager (line 161) | class LayerReNamingManager: method __init__ (line 162) | def __init__(self): method get_new_layer_name (line 165) | def get_new_layer_name(self, layer_id: str, old_name: str): method get_new_param_name (line 169) | def get_new_param_name(self, layer_id, old_name: str): class PipeLinelayer (line 176) | class PipeLinelayer: method __init__ (line 177) | def __init__(self, layer_name, param_names): method params (line 193) | def params(self): method name (line 197) | def name(self): class PipeLineSegment (line 201) | class PipeLineSegment: method __init__ (line 202) | def __init__(self, start_index, end_index): method add_layer (line 208) | def add_layer(self, layer_name, param_names): method layers (line 215) | def layers(self): class PipeLineStage (line 220) | class PipeLineStage: method __init__ (line 221) | def __init__(self): method add_segment (line 229) | def add_segment(self, start_index, end_index): method add_layer (line 235) | def add_layer(self, layer_index, layer_name, param_names): method build_name_mapping (line 240) | def build_name_mapping(self, sname_to_tname=None): method map_name (line 253) | def map_name(self, param_name, t_name): method print_name_mapping (line 261) | def print_name_mapping(self): class PipeLineSegmentContext (line 267) | class PipeLineSegmentContext: method __init__ (line 268) | def __init__( method _index_layers (line 309) | def _index_layers(self): method _segment (line 315) | def _segment(self): method map_name (line 324) | def map_name(self, param_name, t_name): method map_name_to_stage (line 332) | def map_name_to_stage(self, name): method print_name_mapping (line 339) | def print_name_mapping(self): function reshard (line 345) | def reshard(node_model_state, reshard_context, hcg): FILE: paddleformers/trainer/utils/reshard/sharding_v1.py function shard (line 23) | def shard(node_model_state, model, optimizer): function restore (line 46) | def restore(node_model_state, model, optimizer): FILE: paddleformers/trainer/utils/reshard/sharding_v2.py function shard (line 32) | def shard(node_model_state, model, optimizer): function restore (line 89) | def restore(node_model_state, model, optimizer): function merge_tensors (line 109) | def merge_tensors(k, tensor_list, shape): function pad_tensor (line 124) | def pad_tensor(k, tensor, padded_size): function slice_tensor (line 135) | def slice_tensor(tensor, begin, end): function collect_split_info (line 139) | def collect_split_info(optimizer, model, only_return_lengths=False): function is_matched_optimizer_state_dict (line 175) | def is_matched_optimizer_state_dict(opt_state_dict, optimizer, model, hc... function is_bata (line 230) | def is_bata(name): FILE: paddleformers/trainer/utils/sharding_io.py function to_device (line 54) | def to_device(tensor, place=None): function filter_sharded_params (line 70) | def filter_sharded_params(state_dict, optimizer, sharding_group, include... function exclude_parameters_in_state_dict (line 106) | def exclude_parameters_in_state_dict( class ParameterNameRemapper (line 135) | class ParameterNameRemapper: method __init__ (line 136) | def __init__(self, old_mapping, new_mapping, checkpoint): method _map_tensor (line 150) | def _map_tensor(self, tensor, old_p_name=None): method remap_model_state (line 176) | def remap_model_state(self, model_state): method remap_optimizer_state (line 182) | def remap_optimizer_state(self, opt_state): class GroupGetter (line 214) | class GroupGetter: method __init__ (line 215) | def __init__(self, model, hcg=None): method _get_parameter_name (line 230) | def _get_parameter_name(self, name): method get_group (line 253) | def get_group(self, name): method get_group_by_id (line 260) | def get_group_by_id(self, gid): method get_group_ids (line 263) | def get_group_ids(self): class ShardingIO (line 267) | class ShardingIO: method __init__ (line 268) | def __init__(self, args, model, optimizer=None, hcg=None, remap_parame... method _get_remapper (line 282) | def _get_remapper(self, checkpoint): method _remap_parameter_name (line 297) | def _remap_parameter_name(self, checkpoint, state_dict, is_opt): method set_optimizer (line 306) | def set_optimizer(self, optimizer): method load_state_dict_from_checkpoint_with_reshard (line 309) | def load_state_dict_from_checkpoint_with_reshard( method _load_one_state_dict_from_checkpoint (line 397) | def _load_one_state_dict_from_checkpoint(self, resume_from_checkpoint,... method _load_optimizer_state_of_one_shard (line 415) | def _load_optimizer_state_of_one_shard(self, checkpoint, base_opt_name... method _modify_ckpt_for_compatibility (line 433) | def _modify_ckpt_for_compatibility(self, ckpt): method _need_reshard (line 444) | def _need_reshard(self, checkpoint): method _need_reshard_pp (line 483) | def _need_reshard_pp(self, checkpoint): method load_optimizer_state_with_reshard (line 492) | def load_optimizer_state_with_reshard(self, checkpoint, base_opt_name,... method manipulate_state_dict_and_config (line 608) | def manipulate_state_dict_and_config(self, model_to_save, merge_tensor... method gather_distributed_model_meta (line 663) | def gather_distributed_model_meta(self): method _check_distributed_strategy (line 680) | def _check_distributed_strategy(self, parallel_config): method check_same_strategy (line 688) | def check_same_strategy(self, resume_from_checkpoint=None): method _get_distributed_strategy (line 703) | def _get_distributed_strategy(self): method _recover_params_from_master_weights (line 729) | def _recover_params_from_master_weights(self, state_dict, opt_state_di... method _all_gather_simple_object (line 783) | def _all_gather_simple_object(self, obj, group=None): method _load_model_meta_impl (line 792) | def _load_model_meta_impl(self, dir): method _load_model_meta (line 801) | def _load_model_meta(self, dir): method _sharding_meta_suffix (line 819) | def _sharding_meta_suffix(self, tp_rank=None, pp_rank=None): method _load_distributed_strategy (line 831) | def _load_distributed_strategy(self, dir): method _load_sharding_meta (line 839) | def _load_sharding_meta(self, dir, pp_rank=None): method _map_optimizer_state_to_param (line 857) | def _map_optimizer_state_to_param(self, optimizer_state_names): method _gather_sharding_metas (line 873) | def _gather_sharding_metas(self): FILE: paddleformers/trainer/utils/zero_cost_checkpoint.py function md5 (line 98) | def md5(tensor): class ZCCTaskType (line 105) | class ZCCTaskType(Enum): class ZCCWorkerStatus (line 117) | class ZCCWorkerStatus(Enum): function showmem (line 124) | def showmem(msg): function sharded_state_dict_compatibility (line 134) | def sharded_state_dict_compatibility(func, *, return_sharded_state_dict=... function get_fused_param_mappings (line 192) | def get_fused_param_mappings(optimizer, manipulated_state_dict): class ZeroCostCheckpointEMAProcessor (line 219) | class ZeroCostCheckpointEMAProcessor: method __init__ (line 225) | def __init__(self, optimizer_fusion_storage_helper, param_fusion_stora... method status (line 236) | def status(self): method build_ema_buffer (line 244) | def build_ema_buffer(self): method ema_reset (line 266) | def ema_reset(self): method ema_accumulate (line 271) | def ema_accumulate(self, global_step, loss, zcc_ema_loss_threshold): method ema_state_dict (line 299) | def ema_state_dict(self): method load_ema_state_dict (line 327) | def load_ema_state_dict(self, state_dict): class ParamFusionStorageHelper (line 348) | class ParamFusionStorageHelper: method __init__ (line 349) | def __init__( method reset_meta (line 366) | def reset_meta( method init_buffer (line 391) | def init_buffer(self, meta): method sync_partial_param (line 400) | def sync_partial_param(self, numel_to_sync): method wait_all (line 441) | def wait_all(self): method state_dict (line 454) | def state_dict(self): method restore_tensor_from_meta (line 461) | def restore_tensor_from_meta(self, tensor_meta): class ZeroCostCheckpointCallback (line 473) | class ZeroCostCheckpointCallback(TrainerCallback): method __init__ (line 488) | def __init__(self, args, zcc_manager, timer, sharding_io): method on_substep_end (line 499) | def on_substep_end(self, args, state, control, **kwargs): method on_optimizer_begin (line 502) | def on_optimizer_begin(self, args, state, control, **kwargs): method on_step_end (line 509) | def on_step_end(self, args, state, control, model, lr_scheduler, optim... method get_rng_states (line 525) | def get_rng_states(self, args): method _get_save_infos_based_on_steps (line 541) | def _get_save_infos_based_on_steps(self, state, args, checkpoint_folder): method _pack_dynamic_objects (line 550) | def _pack_dynamic_objects(self): method _pack_static_objects (line 559) | def _pack_static_objects(self, args): method maybe_update_zcc_worker (line 568) | def maybe_update_zcc_worker(self, args, model, optimizer, global_step): method _cache_meta_for_sharded_save (line 591) | def _cache_meta_for_sharded_save(self, model, unused): class ZeroCostCheckpointManager (line 611) | class ZeroCostCheckpointManager: method __init__ (line 612) | def __init__( method set_ema_state_dict (line 674) | def set_ema_state_dict(self, path): method update_zcc_workers (line 681) | def update_zcc_workers(self, new_version, dynamic_objecs, static_objec... method get_idle_worker_for_saving (line 707) | def get_idle_worker_for_saving(self, save_infos_and_non_cached_objects... method sync_offload_status (line 735) | def sync_offload_status(self): method report_error_worker (line 755) | def report_error_worker(self): method zcc_pipeline_hook (line 761) | def zcc_pipeline_hook(self, hook_id): method finalize (line 772) | def finalize(self): method terminate_workers (line 783) | def terminate_workers(self): function worker_loop (line 791) | def worker_loop(worker): class ZeroCostCheckpointWorker (line 795) | class ZeroCostCheckpointWorker: method __init__ (line 796) | def __init__( method process_update_task (line 857) | def process_update_task(self, updates): method process_prepare_task (line 877) | def process_prepare_task(self, prepares): method process_offload_task (line 886) | def process_offload_task(self, dump, global_step): method process_dump_task (line 938) | def process_dump_task(self): method _filter_moe_no_sync_optimizer_params (line 963) | def _filter_moe_no_sync_optimizer_params(self, model_meta, optimzier_s... method _dump_static_objects (line 992) | def _dump_static_objects(self, output_dir): method _dump_states (line 1013) | def _dump_states(self, output_dir): method _dump_args_and_state (line 1038) | def _dump_args_and_state(self, output_dir): method process_dump_task_impl (line 1054) | def process_dump_task_impl(self, output_dir, saved_signal_type="tmp"): method run (line 1087) | def run(self): method build_fusion_storage_helper (line 1142) | def build_fusion_storage_helper(self, optimizer_states_meta, model_sta... method manage_offload_chunk (line 1169) | def manage_offload_chunk(self): class EMABuffer (line 1180) | class EMABuffer(ABC): method __init__ (line 1181) | def __init__(self, resume_from_checkpoint, args, offload=True): method _load (line 1189) | def _load(self, resume_from_checkpoint): method get_ema_state_dict (line 1207) | def get_ema_state_dict(self): method save (line 1212) | def save(self, global_step): method ema_accumulate (line 1222) | def ema_accumulate(self, global_step, loss, ema_loss_threshold): method _ema_impl (line 1235) | def _ema_impl(self, state_dict, ema_state_dict): method _get_master_weight (line 1252) | def _get_master_weight(self): method _get_model_state (line 1256) | def _get_model_state(self): method _check_consistent_dist_strategy (line 1260) | def _check_consistent_dist_strategy(self, resume_from_checkpoint): class EMABufferShardingIOBased (line 1264) | class EMABufferShardingIOBased(EMABuffer): method __init__ (line 1265) | def __init__(self, resume_from_checkpoint, args, sharding_io, offload=... method _ema_path (line 1270) | def _ema_path(self, base_path): method _get_model_state (line 1275) | def _get_model_state(self): method _get_master_weight (line 1281) | def _get_master_weight(self): method _check_consistent_dist_strategy (line 1284) | def _check_consistent_dist_strategy(self, resume_from_checkpoint): class EMABufferFcBased (line 1288) | class EMABufferFcBased(EMABuffer): method __init__ (line 1289) | def __init__(self, resume_from_checkpoint, args, offload=True, hcg=Non... method _get_model_meta (line 1301) | def _get_model_meta(self): method _ema_path (line 1304) | def _ema_path(self, base_path): method _check_consistent_dist_strategy (line 1307) | def _check_consistent_dist_strategy(self, resume_from_checkpoint): method _get_model_state (line 1310) | def _get_model_state(self): method _get_master_weight (line 1314) | def _get_master_weight(self): method save (line 1318) | def save(self, global_step): class NonZCCEMACallback (line 1330) | class NonZCCEMACallback(TrainerCallback): method __init__ (line 1331) | def __init__(self, ema_buffer: EMABuffer, ema_state_assembler=None): method create_nonzcc_callback (line 1336) | def create_nonzcc_callback( method on_step_end (line 1356) | def on_step_end(self, args, state, control, **kwargs): class DistInfoCollectorValidator (line 1384) | class DistInfoCollectorValidator: method __init__ (line 1385) | def __init__(self, args, hcg=None): method _load_model_meta_impl (line 1391) | def _load_model_meta_impl(self, dir): method _all_gather_simple_object (line 1400) | def _all_gather_simple_object(self, obj, group=None): method _sharding_meta_suffix (line 1409) | def _sharding_meta_suffix(self, tp_rank=None, pp_rank=None): method _gather_sharding_metas (line 1421) | def _gather_sharding_metas(self, model, optimizer): method _check_distributed_strategy (line 1465) | def _check_distributed_strategy(self, parallel_config): method _get_distributed_strategy (line 1473) | def _get_distributed_strategy(self): method gather_distributed_model_meta (line 1499) | def gather_distributed_model_meta(self, model, optimizer): method check_same_strategy (line 1516) | def check_same_strategy(self, resume_from_checkpoint=None): function saved_ckptmeta (line 1532) | def saved_ckptmeta(state_dict, ckpt_file_name, process_group=None, repli... class ZeroCostCheckpointCallbackFcBased (line 1645) | class ZeroCostCheckpointCallbackFcBased(ZeroCostCheckpointCallback): method __init__ (line 1646) | def __init__(self, args, zcc_manager, timer, unused_arg): method _manipulate_state_dict_and_config (line 1658) | def _manipulate_state_dict_and_config(self, model_to_save, optimizer): method _cache_meta_for_sharded_save (line 1692) | def _cache_meta_for_sharded_save(self, model, optimizer): method _gen_unified_name (line 1757) | def _gen_unified_name(self, optimizer, model_sharded_state_dict): method _pack_dynamic_objects (line 1840) | def _pack_dynamic_objects(self): method maybe_update_zcc_worker (line 1861) | def maybe_update_zcc_worker(self, args, model, optimizer, global_step): class ZeroCostCheckpointWorkerFcBased (line 1884) | class ZeroCostCheckpointWorkerFcBased(ZeroCostCheckpointWorker): method process_update_task (line 1885) | def process_update_task(self, updates): method _replace_pname_with_unified (line 1914) | def _replace_pname_with_unified(self, state_dict): method _filter_state_dict (line 1923) | def _filter_state_dict(state_dict, filter_map): method _slice_padded_tensor (line 1938) | def _slice_padded_tensor(static_dict, param_slice_info): method _save_model_state (line 1954) | def _save_model_state(self, output_dir): method _save_opt_state (line 1971) | def _save_opt_state(self, output_dir): method _save_ema_state (line 2012) | def _save_ema_state(self, output_dir): method _dump_states (line 2024) | def _dump_states(self, output_dir): FILE: paddleformers/transformers/activations.py class PaddleGELUTanh (line 24) | class PaddleGELUTanh(nn.Layer): method forward (line 33) | def forward(self, input: paddle.Tensor) -> paddle.Tensor: class NewGELUActivation (line 37) | class NewGELUActivation(nn.Layer): method forward (line 43) | def forward(self, input: Tensor) -> Tensor: class GELUActivation (line 49) | class GELUActivation(nn.Layer): method __init__ (line 57) | def __init__(self, use_gelu_python: bool = False): method _gelu_python (line 64) | def _gelu_python(self, input: Tensor) -> Tensor: method forward (line 67) | def forward(self, input: Tensor) -> Tensor: class FastGELUActivation (line 71) | class FastGELUActivation(nn.Layer): method forward (line 76) | def forward(self, input: Tensor) -> Tensor: class QuickGELUActivation (line 80) | class QuickGELUActivation(nn.Layer): method forward (line 85) | def forward(self, input: Tensor) -> Tensor: class ClippedGELUActivation (line 89) | class ClippedGELUActivation(nn.Layer): method __init__ (line 102) | def __init__(self, min: float, max: float): method forward (line 110) | def forward(self, x: Tensor) -> Tensor: class SiLUActivation (line 114) | class SiLUActivation(nn.Layer): method forward (line 123) | def forward(self, input: Tensor) -> Tensor: class MishActivation (line 127) | class MishActivation(nn.Layer): method forward (line 133) | def forward(self, input: Tensor) -> Tensor: class LinearActivation (line 137) | class LinearActivation(nn.Layer): method forward (line 142) | def forward(self, input: Tensor) -> Tensor: class ClassInstantier (line 146) | class ClassInstantier(OrderedDict): method __getitem__ (line 147) | def __getitem__(self, key): function get_activation (line 173) | def get_activation(activation_string): FILE: paddleformers/transformers/aistudio_utils.py class UnauthorizedError (line 21) | class UnauthorizedError(Exception): class EntryNotFoundError (line 25) | class EntryNotFoundError(Exception): function _add_subfolder (line 29) | def _add_subfolder(weights_name: str, subfolder: Optional[str] = None) -... function aistudio_download (line 35) | def aistudio_download( FILE: paddleformers/transformers/attention_utils.py class Registry (line 26) | class Registry(object): method __init__ (line 27) | def __init__(self): method register (line 30) | def register(self, name): function create_bigbird_rand_mask_idx (line 41) | def create_bigbird_rand_mask_idx( function create_bigbird_rand_mask_idx_list (line 90) | def create_bigbird_rand_mask_idx_list( function _convert_param_attr_to_list (line 111) | def _convert_param_attr_to_list(param_attr, n): class Linear3D (line 140) | class Linear3D(Layer): method __init__ (line 141) | def __init__(self, hidden_size, num_attention_heads, size_per_head, we... method forward (line 154) | def forward(self, input): class Attention (line 166) | class Attention(Layer): method __init__ (line 167) | def __init__(self, num_heads=1, block_size=1, window_size=3, num_globa... method forward (line 170) | def forward( class DefaultAttention (line 186) | class DefaultAttention(Attention): method forward (line 187) | def forward( class BigBirdSparseAttention (line 214) | class BigBirdSparseAttention(Attention): method __init__ (line 215) | def __init__(self, num_heads=1, block_size=1, window_size=3, num_globa... method _get_band_mask (line 227) | def _get_band_mask(self, blocked_query_mask, blocked_key_mask, batch_s... method _get_band_matrix (line 291) | def _get_band_matrix(self, blocked_matrix, B, T): method _get_rand_mask (line 350) | def _get_rand_mask(self, blocked_query_mask, blocked_key_mask, rand_ma... method _gather_random_key_value (line 374) | def _gather_random_key_value(self, blocked_matrix, rand_mask_idx, B, T): method _get_global_out (line 389) | def _get_global_out(self, query_matrix, key_matrix, value_matrix, key_... method _get_splited_matrix (line 404) | def _get_splited_matrix(self, matrix): method forward (line 408) | def forward( class MultiHeadAttention (line 519) | class MultiHeadAttention(Layer): method __init__ (line 524) | def __init__( method _prepare_qkv (line 560) | def _prepare_qkv(self, query, key, value, cache=None): method compute_kv (line 577) | def compute_kv(self, key, value): method gen_cache (line 582) | def gen_cache(self, key, value=None, type=Cache): method forward (line 595) | def forward( FILE: paddleformers/transformers/audio_processing_utils.py class ExplicitEnum (line 34) | class ExplicitEnum(Enum): method _missing_ (line 40) | def _missing_(cls, value): class SequenceFeatureExtractor (line 46) | class SequenceFeatureExtractor(FeatureExtractionMixin): method __init__ (line 59) | def __init__(self, feature_size: int, sampling_rate: int, padding_valu... method pad (line 69) | def pad( method _pad (line 239) | def _pad( method _truncate (line 311) | def _truncate( method _get_padding_strategies (line 354) | def _get_padding_strategies(self, padding=False, max_length=None): function process_audio_info (line 387) | def process_audio_info(conversations: list[dict] | list[list[dict]], use... FILE: paddleformers/transformers/audio_utils.py function hertz_to_mel (line 28) | def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk")... function mel_to_hertz (line 62) | def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk")... function _create_triangular_filter_bank (line 96) | def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: ... function mel_filter_bank (line 118) | def mel_filter_bank( function optimal_fft_length (line 196) | def optimal_fft_length(window_length: int) -> int: function window_function (line 209) | def window_function( function spectrogram (line 270) | def spectrogram( function power_to_db (line 458) | def power_to_db( function amplitude_to_db (line 509) | def amplitude_to_db( function get_mel_filter_banks (line 558) | def get_mel_filter_banks( function fram_wave (line 582) | def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size... function stft (line 638) | def stft(frames: np.array, windowing_function: np.array, fft_window_size... FILE: paddleformers/transformers/auto/configuration.py function config_class_to_model_type (line 103) | def config_class_to_model_type(config): class _LazyConfigMapping (line 115) | class _LazyConfigMapping(OrderedDict): method __init__ (line 120) | def __init__(self, mapping): method __getitem__ (line 125) | def __getitem__(self, key): method keys (line 148) | def keys(self): method values (line 151) | def values(self): method items (line 154) | def items(self): method __iter__ (line 157) | def __iter__(self): method __contains__ (line 160) | def __contains__(self, item): method register (line 163) | def register(self, key, value, exist_ok=False): function get_configurations (line 175) | def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]: function model_type_to_module_name (line 208) | def model_type_to_module_name(key): class AutoConfig (line 219) | class AutoConfig(PretrainedConfig): method _get_config_class_from_config (line 235) | def _get_config_class_from_config( method from_file (line 270) | def from_file(cls, config_file: str, **kwargs) -> AutoConfig: method from_pretrained (line 286) | def from_pretrained(cls, pretrained_model_name_or_path: str, *model_ar... method register (line 377) | def register(model_type, config, exist_ok=False): FILE: paddleformers/transformers/auto/factory.py function getattribute_from_module (line 22) | def getattribute_from_module(module, attr): class _LazyAutoMapping (line 42) | class _LazyAutoMapping(OrderedDict): method __init__ (line 51) | def __init__(self, config_mapping, model_mapping): method __len__ (line 59) | def __len__(self): method __getitem__ (line 63) | def __getitem__(self, key): method _load_attr_from_module (line 79) | def _load_attr_from_module(self, model_type, attr): method keys (line 90) | def keys(self): method get (line 98) | def get(self, key, default): method __bool__ (line 104) | def __bool__(self): method values (line 107) | def values(self): method items (line 115) | def items(self): method __iter__ (line 126) | def __iter__(self): method __contains__ (line 129) | def __contains__(self, item): method register (line 137) | def register(self, key, value, exist_ok=False): FILE: paddleformers/transformers/auto/feature_extraction.py function safe_load_json_file (line 47) | def safe_load_json_file(json_file: str): function feature_extractor_class_from_name (line 58) | def feature_extractor_class_from_name(class_name: str): function get_feature_extractor_config (line 81) | def get_feature_extractor_config( class AutoFeatureExtractor (line 188) | class AutoFeatureExtractor: method __init__ (line 196) | def __init__(self): method from_pretrained (line 203) | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): method register (line 262) | def register(config_class, feature_extractor_class, exist_ok=False): FILE: paddleformers/transformers/auto/image_processing.py function get_image_processor_class_from_name (line 70) | def get_image_processor_class_from_name(class_name: str): function get_image_processor_config (line 97) | def get_image_processor_config( function _bind_paddle_mixin_if_available (line 222) | def _bind_paddle_mixin_if_available(image_processor_class): class AutoImageProcessor (line 238) | class AutoImageProcessor(hf.AutoImageProcessor): method from_pretrained (line 256) | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwa... FILE: paddleformers/transformers/auto/modeling.py function get_name_mapping (line 114) | def get_name_mapping(task="Model"): function get_task_name (line 133) | def get_task_name(model_class): class _BaseAutoModelClass (line 140) | class _BaseAutoModelClass: method __init__ (line 148) | def __init__(self, *args, **kwargs): method _get_model_class_from_config (line 156) | def _get_model_class_from_config(cls, pretrained_model_name_or_path, c... method from_config (line 249) | def from_config(cls, config, **kwargs): method _from_pretrained (line 254) | def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *m... method register (line 330) | def register(cls, config_class, model_class, exist_ok=False): class AutoBackbone (line 348) | class AutoBackbone(_BaseAutoModelClass): method from_pretrained (line 357) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModel (line 390) | class AutoModel(_BaseAutoModelClass): method from_pretrained (line 403) | def from_pretrained(cls, pretrained_model_name_or_path, task=None, *mo... class AutoModelForPretraining (line 461) | class AutoModelForPretraining(_BaseAutoModelClass): method from_pretrained (line 470) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForSequenceClassification (line 507) | class AutoModelForSequenceClassification(_BaseAutoModelClass): method from_pretrained (line 516) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForTokenClassification (line 553) | class AutoModelForTokenClassification(_BaseAutoModelClass): method from_pretrained (line 562) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForQuestionAnswering (line 599) | class AutoModelForQuestionAnswering(_BaseAutoModelClass): method from_pretrained (line 608) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForMultipleChoice (line 645) | class AutoModelForMultipleChoice(_BaseAutoModelClass): method from_pretrained (line 654) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForMaskedLM (line 691) | class AutoModelForMaskedLM(_BaseAutoModelClass): method from_pretrained (line 700) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForCausalLM (line 737) | class AutoModelForCausalLM(_BaseAutoModelClass): method from_pretrained (line 746) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForCausalLMPipe (line 783) | class AutoModelForCausalLMPipe(_BaseAutoModelClass): method from_pretrained (line 792) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoEncoder (line 796) | class AutoEncoder(_BaseAutoModelClass): method from_pretrained (line 805) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoDecoder (line 837) | class AutoDecoder(_BaseAutoModelClass): method from_pretrained (line 846) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoGenerator (line 878) | class AutoGenerator(_BaseAutoModelClass): method from_pretrained (line 887) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoDiscriminator (line 924) | class AutoDiscriminator(_BaseAutoModelClass): method from_pretrained (line 933) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForConditionalGeneration (line 970) | class AutoModelForConditionalGeneration(_BaseAutoModelClass): method from_pretrained (line 979) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... class AutoModelForConditionalGenerationPipe (line 1012) | class AutoModelForConditionalGenerationPipe(_BaseAutoModelClass): method from_pretrained (line 1021) | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, *... FILE: paddleformers/transformers/auto/processing.py function processor_class_from_name (line 66) | def processor_class_from_name(class_name: str): class AutoProcessor (line 90) | class AutoProcessor: method from_pretrained (line 106) | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): FILE: paddleformers/transformers/auto/tokenizer.py function get_paddleformers_tokenizer_config (line 53) | def get_paddleformers_tokenizer_config( function tokenizer_class_from_name (line 150) | def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]: function _bind_paddle_mixin_if_available (line 175) | def _bind_paddle_mixin_if_available(tokenizer_class): class AutoTokenizer (line 191) | class AutoTokenizer(hf.AutoTokenizer): method from_pretrained (line 209) | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwa... FILE: paddleformers/transformers/auto/video_processing.py function video_processor_class_from_name (line 53) | def video_processor_class_from_name(class_name: str): function get_video_processor_config (line 77) | def get_video_processor_config( class AutoVideoProcessor (line 201) | class AutoVideoProcessor: method from_pretrained (line 217) | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwa... FILE: paddleformers/transformers/auto_utils.py function get_mesh (line 19) | def get_mesh(pp_idx=None): function einsum (line 29) | def einsum(rule, a, b): FILE: paddleformers/transformers/cache_utils.py class CacheLayerMixin (line 26) | class CacheLayerMixin(ABC): method __init__ (line 31) | def __init__(self): method __repr__ (line 36) | def __repr__(self): method lazy_initialization (line 40) | def lazy_initialization(self, key_states: paddle.Tensor): method update (line 44) | def update( method get_mask_sizes (line 50) | def get_mask_sizes(self, cache_position: paddle.Tensor) -> tuple[int, ... method get_seq_length (line 54) | def get_seq_length(self) -> int: method get_max_cache_shape (line 58) | def get_max_cache_shape(self) -> int: method offload (line 61) | def offload(self): method prefetch (line 67) | def prefetch(self): method reset (line 73) | def reset(self) -> None: method reorder_cache (line 82) | def reorder_cache(self, beam_idx: paddle.LongTensor) -> None: class DynamicLayer (line 89) | class DynamicLayer(CacheLayerMixin): method lazy_initialization (line 97) | def lazy_initialization(self, key_states: paddle.Tensor, value_states:... method update (line 108) | def update( method get_mask_sizes (line 133) | def get_mask_sizes(self, cache_position: paddle.Tensor) -> tuple[int, ... method get_seq_length (line 140) | def get_seq_length(self) -> int: method get_max_cache_shape (line 146) | def get_max_cache_shape(self) -> int: method crop (line 150) | def crop(self, max_length: int) -> None: method batch_repeat_interleave (line 164) | def batch_repeat_interleave(self, repeats: int) -> None: method batch_select_indices (line 170) | def batch_select_indices(self, indices: paddle.Tensor) -> None: class Cache (line 177) | class Cache: method __init__ (line 197) | def __init__( method __repr__ (line 221) | def __repr__(self): method prefetch (line 224) | def prefetch(self, layer_idx: int, only_non_sliding: bool = True): method offload (line 253) | def offload(self, layer_idx: int, only_non_sliding: bool = True): method update (line 262) | def update( method early_initialization (line 306) | def early_initialization( method get_seq_length (line 322) | def get_seq_length(self, layer_idx: int = 0) -> int: method get_mask_sizes (line 328) | def get_mask_sizes(self, cache_position: paddle.Tensor, layer_idx: int... method get_max_cache_shape (line 340) | def get_max_cache_shape(self, layer_idx: int = 0) -> int: method reset (line 348) | def reset(self): method reorder_cache (line 353) | def reorder_cache(self, beam_idx: paddle.LongTensor): method crop (line 358) | def crop(self, max_length: int): method batch_repeat_interleave (line 363) | def batch_repeat_interleave(self, repeats: int): method batch_select_indices (line 368) | def batch_select_indices(self, indices: paddle.Tensor): method max_batch_size (line 374) | def max_batch_size(self) -> int: method max_cache_len (line 382) | def max_cache_len(self) -> int: method is_compileable (line 388) | def is_compileable(self) -> bool: method is_initialized (line 396) | def is_initialized(self) -> bool: method is_sliding (line 401) | def is_sliding(self) -> list[bool]: method __len__ (line 405) | def __len__(self): class DynamicCache (line 414) | class DynamicCache(Cache): method __init__ (line 457) | def __init__( method __iter__ (line 518) | def __iter__(self): class DynamicSlidingWindowLayer (line 523) | class DynamicSlidingWindowLayer(DynamicLayer): method __init__ (line 531) | def __init__(self, sliding_window: int): method lazy_initialization (line 537) | def lazy_initialization(self, key_states: paddle.Tensor, value_states:... method update (line 541) | def update( method get_mask_sizes (line 574) | def get_mask_sizes(self, cache_position: paddle.Tensor) -> tuple[int, ... method get_seq_length (line 587) | def get_seq_length(self) -> int: method get_max_cache_shape (line 591) | def get_max_cache_shape(self) -> int: method crop (line 595) | def crop(self, max_length: int) -> None: FILE: paddleformers/transformers/configuration_utils.py function custom_object_save (line 45) | def custom_object_save(obj, folder, config=None): function attribute_map (line 95) | def attribute_map(config: PretrainedConfig, kwargs: Dict[str, Any]) -> D... function convert_to_legacy_config (line 110) | def convert_to_legacy_config(attribute_map: Dict[str, str], config: Dict... function flatten_model_config (line 134) | def flatten_model_config(config: dict) -> dict: function is_standard_config (line 166) | def is_standard_config(config: Union[PretrainedConfig, Dict[str, Any]]) ... function resolve_hf_config_path (line 178) | def resolve_hf_config_path(repo_id: str, cache_dir: str, subfolder=None)... function set_expected_keys (line 204) | def set_expected_keys(config, llm_meta, kwargs): function llmmetaclass (line 213) | def llmmetaclass(cls): class LlmMetaConfig (line 229) | class LlmMetaConfig: method _get_defaults (line 519) | def _get_defaults(cls): method _get_init (line 538) | def _get_init(cls): method _get_all_meta (line 556) | def _get_all_meta(cls): method _get_unsavable_keys (line 574) | def _get_unsavable_keys(cls): method set_llm_config (line 590) | def set_llm_config(cls, config, args): class PretrainedConfig (line 598) | class PretrainedConfig: method __setattr__ (line 802) | def __setattr__(self, key, value): method __getattribute__ (line 808) | def __getattribute__(self, key): method __getitem__ (line 813) | def __getitem__(self, key): method __setitem__ (line 816) | def __setitem__(self, key, value): method __init__ (line 820) | def __init__(self, **kwargs): method _create_id_label_maps (line 951) | def _create_id_label_maps(self, num_labels: int): method _get_generation_defaults (line 956) | def _get_generation_defaults() -> Dict[str, Any]: method _has_non_default_generation_parameters (line 985) | def _has_non_default_generation_parameters(self) -> bool: method name_or_path (line 995) | def name_or_path(self) -> str: method name_or_path (line 999) | def name_or_path(self, value): method use_return_dict (line 1003) | def use_return_dict(self) -> bool: method num_labels (line 1010) | def num_labels(self) -> int: method num_labels (line 1017) | def num_labels(self, num_labels: int): method save_pretrained (line 1021) | def save_pretrained(self, save_directory: Union[str, os.PathLike], **k... method from_pretrained (line 1051) | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.... method get_config_dict (line 1120) | def get_config_dict( method _get_config_dict (line 1159) | def _get_config_dict( method from_dict (line 1217) | def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "Pretrain... method from_json_file (line 1279) | def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "Pretra... method _dict_from_json_file (line 1295) | def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): method __eq__ (line 1300) | def __eq__(self, other): method __repr__ (line 1303) | def __repr__(self): method to_diff_dict (line 1306) | def to_diff_dict(self, saving_file=False) -> Dict[str, Any]: method register_unsavable_keys (line 1360) | def register_unsavable_keys(self, keys): method to_dict (line 1369) | def to_dict(self, saving_file=False) -> Dict[str, Any]: method to_json_string (line 1419) | def to_json_string(self, use_diff: bool = True, saving_file=False) -> ... method to_json_file (line 1438) | def to_json_file(self, json_file_path: Union[str, os.PathLike], use_di... method update (line 1458) | def update(self, config_dict: Dict[str, Any]): method update_from_string (line 1468) | def update_from_string(self, update_str: str): method _remove_keys_not_serialized (line 1506) | def _remove_keys_not_serialized(self, d: dict[str, Any], saving_file: ... method register_for_auto_class (line 1526) | def register_for_auto_class(cls, auto_class="AutoConfig"): method get (line 1551) | def get(self, key, default=None): method get_text_config (line 1563) | def get_text_config(self, decoder=None, encoder=None) -> "PretrainedCo... function get_configuration_file (line 1636) | def get_configuration_file(configuration_files: List[str]) -> str: function recursive_diff_dict (line 1676) | def recursive_diff_dict(dict_a, dict_b, config_obj=None): function layer_type_validation (line 1700) | def layer_type_validation(layer_types: List[str], num_hidden_layers: Opt... FILE: paddleformers/transformers/context_parallel_utils.py function split_inputs_sequence_dim_load_balance (line 33) | def split_inputs_sequence_dim_load_balance(inputs, rank=None, degree=None): function auto_split_sequence_dim_load_balance (line 68) | def auto_split_sequence_dim_load_balance(inputs): FILE: paddleformers/transformers/contrastive_loss.py class SimpleContrastiveLoss (line 21) | class SimpleContrastiveLoss(nn.Layer): method __init__ (line 22) | def __init__(self, embedding_temperature: float = 0.02): method forward (line 27) | def forward(self, q_reps, p_reps): class MatryoshkaContrastiveLoss (line 41) | class MatryoshkaContrastiveLoss(nn.Layer): method __init__ (line 42) | def __init__(self, embedding_temperature: float = 0.02, embedding_matr... method forward (line 51) | def forward(self, q_reps, p_reps): class SimpleInfclLoss (line 68) | class SimpleInfclLoss(nn.Layer): method __init__ (line 69) | def __init__(self, inf_cl_head_dim=64): method forward (line 79) | def forward(self, q_reps, p_reps): class MatryoshkaInfclLoss (line 103) | class MatryoshkaInfclLoss(nn.Layer): method __init__ (line 104) | def __init__(self, embedding_matryoshka_dims: Optional[List[int]] = No... method forward (line 120) | def forward(self, q_reps, p_reps): FILE: paddleformers/transformers/conversion_utils.py function add_quant_mapping (line 62) | def add_quant_mapping(name_action_mappings, quantization_config): function tensor_summary (line 98) | def tensor_summary(tensor: Union[str, Tensor, PytorchTensor, tuple, list... function compare_model_weights (line 134) | def compare_model_weights(first_state_dict: Dict[str, ndarray], second_s... function state_dict_contains_prefix (line 153) | def state_dict_contains_prefix(state_dict: Dict[str, ndarray], prefix: s... function init_name_mappings (line 159) | def init_name_mappings(mappings: list[StateDictNameMapping]) -> list[Sta... class StateDictKeysChecker (line 177) | class StateDictKeysChecker: method __init__ (line 180) | def __init__( method change_base_downstream_mismatched_keys (line 200) | def change_base_downstream_mismatched_keys(self): method change_downstream_base_mismatched_keys (line 215) | def change_downstream_base_mismatched_keys(self): method change_diff_keys (line 232) | def change_diff_keys(self) -> List[str]: method get_unexpected_keys (line 263) | def get_unexpected_keys(self): method get_mismatched_keys (line 269) | def get_mismatched_keys(self): method get_diff_keys (line 275) | def get_diff_keys(self, return_all_diff: bool = False) -> List[str]: function naive_fuse_merge_tp (line 293) | def naive_fuse_merge_tp(weight_list, is_column=True, fuse_tensor_parts=2... function naive_fuse_split_tp (line 344) | def naive_fuse_split_tp( function normal_fuse_merge_tp (line 436) | def normal_fuse_merge_tp(weight_list, is_column=True): function normal_fuse_split_tp (line 467) | def normal_fuse_split_tp(weight, tensor_model_parallel_size, tensor_para... function tensor_parallel_qkv_to_naive_merged_qkv (line 564) | def tensor_parallel_qkv_to_naive_merged_qkv(weight, num_attention_heads): function naive_merged_qkv_to_tensor_parallel_qkv (line 580) | def naive_merged_qkv_to_tensor_parallel_qkv(weight, num_attention_heads): function splited_qkv_to_tensor_parallel_qkv (line 603) | def splited_qkv_to_tensor_parallel_qkv(weight_list, num_attention_heads): function fuse_param_func (line 617) | def fuse_param_func(): function split_param_func (line 673) | def split_param_func(): function split_or_fuse_func (line 725) | def split_or_fuse_func(is_fuse=True): function get_tensor_parallel_merge_func (line 729) | def get_tensor_parallel_merge_func(tensor_model_parallel_size, tensor_pa... function get_tensor_parallel_split_func (line 761) | def get_tensor_parallel_split_func(tensor_model_parallel_size, tensor_pa... function split_or_merge_func (line 801) | def split_or_merge_func(is_split, tensor_model_parallel_size, tensor_par... class StateDictNameMapping (line 808) | class StateDictNameMapping: method __post_init__ (line 819) | def __post_init__(self): method should_transpose (line 822) | def should_transpose(self) -> bool: method should_merge_last_two_dim (line 825) | def should_merge_last_two_dim(self) -> bool: method run (line 829) | def run(self, state_dict: dict[str, ndarray], name: str) -> ndarray: method matched (line 858) | def matched(self, text: str) -> bool: class TensorInfoSaver (line 874) | class TensorInfoSaver: method __init__ (line 875) | def __init__(self) -> None: method add (line 878) | def add(self, state_dict_key: str, key: str, values: Union[float, ndar... method summary (line 894) | def summary(self, output_path: Optional[str] = None): method summary_to_excel (line 906) | def summary_to_excel(self, file: str): method summary_to_terminal (line 921) | def summary_to_terminal(self): method clear (line 928) | def clear(self): class LogitHooker (line 933) | class LogitHooker: method __init__ (line 936) | def __init__(self, mappings: List[StateDictNameMapping], tensor_info_s... method _paddle_hooks (line 946) | def _paddle_hooks(self, layer: Layer, inputs: Tuple[Tensor], outputs: ... method _pytorch_hooks (line 960) | def _pytorch_hooks( method register_paddle_model_hooks (line 983) | def register_paddle_model_hooks(self, model: Layer): method register_pytorch_model_hooks (line 1018) | def register_pytorch_model_hooks(self, model: Module): method summary (line 1053) | def summary(self): class LogitComparer (line 1058) | class LogitComparer: method __init__ (line 1076) | def __init__(self, input_dir: str) -> None: method get_paddle_pytorch_model_classes (line 1079) | def get_paddle_pytorch_model_classes(self) -> Tuple[object, object]: method get_inputs (line 1089) | def get_inputs(self): method resolve_paddle_output_logits (line 1095) | def resolve_paddle_output_logits(self, paddle_outputs: Tuple[Tensor]): method resolve_pytorch_output_logits (line 1108) | def resolve_pytorch_output_logits(self, pytorch_outputs: Module): method get_model_state_dict (line 1117) | def get_model_state_dict(model: Union[Layer, Module], copy: bool = Fal... method compare_model_state_dicts (line 1134) | def compare_model_state_dicts( method compare_logits (line 1175) | def compare_logits(self) -> bool: method on_converted (line 1236) | def on_converted(self): class ConversionMixin (line 1265) | class ConversionMixin: method convert_transpose_selected_weights (line 1270) | def convert_transpose_selected_weights(state_dict: dict, transpose_wei... method get_tensor_parallel_convert_actions (line 1293) | def get_tensor_parallel_convert_actions( method convert_tensor_parallel (line 1314) | def convert_tensor_parallel( method merge_tensor_parallel (line 1350) | def merge_tensor_parallel(cls, state_dict, config) -> None: method _get_tensor_parallel_mappings (line 1397) | def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_sp... method _resolve_prefix_keys (line 1412) | def _resolve_prefix_keys(state_keys_base, state_keys_real, ignore_erro... method convert_fuse_and_split (line 1442) | def convert_fuse_and_split(cls, config: PretrainedConfig, state_dict, ... method get_fuse_or_split_param_convert_actions (line 1487) | def get_fuse_or_split_param_convert_actions( method _get_fuse_or_split_param_mappings (line 1523) | def _get_fuse_or_split_param_mappings(cls, config: PretrainedConfig, i... method _resolve_prefix_keys_for_fuse_and_split (line 1542) | def _resolve_prefix_keys_for_fuse_and_split(state_keys_base, state_key... FILE: paddleformers/transformers/deepseek_v3/configuration.py class DeepseekV3Config (line 24) | class DeepseekV3Config(PretrainedConfig): method __init__ (line 136) | def __init__( FILE: paddleformers/transformers/deepseek_v3/mfu_utils.py class DeepSeekProjection (line 18) | class DeepSeekProjection: method __init__ (line 19) | def __init__(self, model_config, train_options=None): method get_num_params (line 66) | def get_num_params(self, include_embedding: bool = True) -> tuple[int,... method get_num_flop_fwd (line 132) | def get_num_flop_fwd(self, batch_size: int) -> int: method get_num_flop_per_token (line 184) | def get_num_flop_per_token(self): method _get_num_flop_QK_fwd (line 190) | def _get_num_flop_QK_fwd(self, batch_size: int) -> int: method get_num_flop_bwd (line 198) | def get_num_flop_bwd(self, batch_size: int) -> int: FILE: paddleformers/transformers/deepseek_v3/modeling.py function scaled_dot_product_attention (line 80) | def scaled_dot_product_attention( function yarn_get_mscale (line 134) | def yarn_get_mscale(scale, mscale=1): class DeepseekV3YarnRotaryEmbedding (line 140) | class DeepseekV3YarnRotaryEmbedding(nn.Layer): method __init__ (line 141) | def __init__(self, config: DeepseekV3Config, device=None): method compute_default_rope_parameters (line 158) | def compute_default_rope_parameters( method forward (line 183) | def forward(self, x, position_ids): function rotate_half (line 199) | def rotate_half(x): function apply_rotary_pos_emb (line 206) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion... class FakeGate (line 250) | class FakeGate(paddle.autograd.PyLayer): method forward (line 252) | def forward(ctx, hidden_states, weight): method backward (line 264) | def backward(ctx, grad_output): class MoEGate (line 268) | class MoEGate(PretrainedMoEGate): method __init__ (line 269) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs): method forward (line 297) | def forward(self, hidden_states): class AddAuxiliaryLoss (line 320) | class AddAuxiliaryLoss(paddle.autograd.PyLayer): method forward (line 327) | def forward(ctx, x, loss): method backward (line 334) | def backward(ctx, grad_output): class DeepseekV3TopkRouter (line 341) | class DeepseekV3TopkRouter(nn.Layer): method __init__ (line 342) | def __init__(self, config): method get_topk_indices (line 360) | def get_topk_indices(self, scores): method forward (line 379) | def forward(self, hidden_states): class DeepseekV3NaiveMoe (line 394) | class DeepseekV3NaiveMoe(nn.Layer): method __init__ (line 395) | def __init__(self, config): method sharded_state_dict (line 413) | def sharded_state_dict( method forward (line 428) | def forward( class DeepseekV3MoE (line 456) | class DeepseekV3MoE(nn.Layer): method __init__ (line 461) | def __init__(self, config): method moe (line 487) | def moe(self, hidden_states: paddle.Tensor, topk_indices: paddle.Tenso... method forward (line 512) | def forward(self, hidden_states): class DeepseekV3MoEFlexToken (line 526) | class DeepseekV3MoEFlexToken(MoEFlexTokenLayer): method __init__ (line 531) | def __init__(self, config: DeepseekV3Config): method forward (line 575) | def forward(self, hidden_states): class DeepseekV3Attention (line 587) | class DeepseekV3Attention(nn.Layer): method __init__ (line 590) | def __init__(self, config: DeepseekV3Config, layer_idx: int): method _shape (line 713) | def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): method forward (line 716) | def forward( class DeepseekV3DecoderLayer (line 849) | class DeepseekV3DecoderLayer(nn.Layer): method __init__ (line 850) | def __init__(self, config: DeepseekV3Config, layer_idx: int): method subbatch_recompute_forward (line 890) | def subbatch_recompute_forward( method attn (line 949) | def attn( method post_process (line 1020) | def post_process( method forward (line 1040) | def forward( class DeepseekV3MTPLayer (line 1077) | class DeepseekV3MTPLayer(DeepseekV3DecoderLayer): method __init__ (line 1078) | def __init__( method subbatch_recompute_forward (line 1101) | def subbatch_recompute_forward( method forward (line 1138) | def forward( class DeepseekV3PretrainedModel (line 1176) | class DeepseekV3PretrainedModel(PretrainedModel): method _gen_aoa_config (line 1195) | def _gen_aoa_config(cls, config: DeepseekV3Config): method _gen_inv_aoa_config (line 1283) | def _gen_inv_aoa_config(cls, config: DeepseekV3Config): class DeepseekV3Model (line 1382) | class DeepseekV3Model(DeepseekV3PretrainedModel): method __init__ (line 1390) | def __init__(self, config: DeepseekV3Config): method _prepare_decoder_attention_mask (line 1420) | def _prepare_decoder_attention_mask(attention_mask, input_shape, past_... method recompute_training_full (line 1448) | def recompute_training_full( method forward (line 1481) | def forward( class DeepseekV3PretrainingCriterion (line 1713) | class DeepseekV3PretrainingCriterion(nn.Layer): method __init__ (line 1719) | def __init__(self, config: DeepseekV3Config, **kwargs): method forward (line 1730) | def forward(self, prediction_scores, masked_lm_labels, router_loss=Non... class DeepseekV3ForCausalLM (line 1834) | class DeepseekV3ForCausalLM(DeepseekV3PretrainedModel): method __init__ (line 1837) | def __init__(self, config: DeepseekV3Config): method get_input_embeddings (line 1845) | def get_input_embeddings(self): method set_input_embeddings (line 1848) | def set_input_embeddings(self, value): method get_output_embeddings (line 1851) | def get_output_embeddings(self): method set_output_embeddings (line 1854) | def set_output_embeddings(self, new_embeddings): method set_decoder (line 1857) | def set_decoder(self, decoder): method get_decoder (line 1860) | def get_decoder(self): method forward (line 1863) | def forward( method prepare_inputs_for_generation (line 1985) | def prepare_inputs_for_generation( method _get_model_inputs_spec (line 2010) | def _get_model_inputs_spec(self, dtype: str): method _reorder_cache (line 2018) | def _reorder_cache(past_key_values, beam_idx): class DeepseekV3ForSequenceClassification (line 2025) | class DeepseekV3ForSequenceClassification(DeepseekV3PretrainedModel): method __init__ (line 2026) | def __init__(self, config): method get_input_embeddings (line 2035) | def get_input_embeddings(self): method set_input_embeddings (line 2038) | def set_input_embeddings(self, value): method forward (line 2041) | def forward( class DeepseekV3MTPLayerPipe (line 2128) | class DeepseekV3MTPLayerPipe(DeepseekV3MTPLayer): method forward (line 2129) | def forward(self, args): class DeepseekV3EmbeddingPipe (line 2219) | class DeepseekV3EmbeddingPipe(EmbeddingPipe): method __init__ (line 2220) | def __init__(self, config, embed_cls=None, rotary_emb_cls=None): method forward (line 2224) | def forward(self, args): class DeepseekV3DecoderLayerPipe (line 2305) | class DeepseekV3DecoderLayerPipe(DeepseekV3DecoderLayer): method forward (line 2306) | def forward(self, args): class DeepseekV3LMHeadPipe (line 2377) | class DeepseekV3LMHeadPipe(GeneralLMHead): method forward (line 2378) | def forward(self, args): class DeepseekV3PretrainingCriterionPipe (line 2390) | class DeepseekV3PretrainingCriterionPipe(DeepseekV3PretrainingCriterion): method forward (line 2391) | def forward(self, logits, labels): class DeepseekV3RMSNormLayerPipe (line 2404) | class DeepseekV3RMSNormLayerPipe(RMSNorm): method __init__ (line 2405) | def __init__(self, *args, **kwargs): method forward (line 2410) | def forward(self, args): class DeepseekV3ForCausalLMPipe (line 2427) | class DeepseekV3ForCausalLMPipe(GeneralModelForCausalLMPipe): FILE: paddleformers/transformers/dpo_criterion.py class DPOCriterion (line 32) | class DPOCriterion(nn.Layer): method __init__ (line 35) | def __init__(self, config, dpo_config=None, use_infohub=False, ignore_... method dpo_loss (line 51) | def dpo_loss(self, policy_chosen_logps, policy_rejected_logps, referen... method dpo_logps (line 128) | def dpo_logps( method forward (line 279) | def forward( class AutoDPOCriterion (line 318) | class AutoDPOCriterion(DPOCriterion): method __init__ (line 319) | def __init__(self, config, dpo_config=None, use_infohub=False, ignore_... method forward (line 323) | def forward( method dpo_logps (line 341) | def dpo_logps( FILE: paddleformers/transformers/embedding_utils.py function dist_gather_tensor_with_gradient (line 19) | def dist_gather_tensor_with_gradient(tensor): FILE: paddleformers/transformers/ernie4_5/configuration.py class Ernie4_5Config (line 20) | class Ernie4_5Config(PretrainedConfig): method __init__ (line 30) | def __init__( FILE: paddleformers/transformers/ernie4_5/modeling.py function rotate_half (line 50) | def rotate_half(x): function apply_rotary_pos_emb (line 57) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di... function apply_fused_rope (line 94) | def apply_fused_rope(query_states, key_states, rope_theta): class Ernie4_5RotaryEmbedding (line 113) | class Ernie4_5RotaryEmbedding(nn.Layer): method __init__ (line 114) | def __init__(self, config): method compute_default_rope_parameters (line 132) | def compute_default_rope_parameters( method forward (line 162) | def forward(self, x, position_ids): class Ernie4_5Attention (line 188) | class Ernie4_5Attention(nn.Layer): method __init__ (line 191) | def __init__(self, config, layer_idx=0): method forward (line 245) | def forward( class Ernie4_5DecoderLayer (line 330) | class Ernie4_5DecoderLayer(nn.Layer): method __init__ (line 337) | def __init__(self, config, layer_idx): method forward (line 376) | def forward( class Ernie4_5PretrainedModel (line 442) | class Ernie4_5PretrainedModel(PretrainedModel): method _gen_aoa_config (line 450) | def _gen_aoa_config(cls, config: Ernie4_5Config): method _gen_inv_aoa_config (line 484) | def _gen_inv_aoa_config(cls, config: Ernie4_5Config): class Ernie4_5Model (line 525) | class Ernie4_5Model(Ernie4_5PretrainedModel): method __init__ (line 528) | def __init__(self, config: Ernie4_5Config): method recompute_training (line 555) | def recompute_training( method forward (line 603) | def forward( class Ernie4_5ForCausalLM (line 763) | class Ernie4_5ForCausalLM(Ernie4_5PretrainedModel): method __init__ (line 768) | def __init__(self, config): method prepare_attention_mask_for_generation (line 788) | def prepare_attention_mask_for_generation(self, input_ids, pad_token_i... method forward (line 794) | def forward( class Ernie4_5ForCausalLMPipe (line 879) | class Ernie4_5ForCausalLMPipe(GeneralModelForCausalLMPipe): FILE: paddleformers/transformers/ernie4_5_moe/configuration.py class Ernie4_5_MoeConfig (line 25) | class Ernie4_5_MoeConfig(PretrainedConfig): method __init__ (line 35) | def __init__( method to_json_string (line 284) | def to_json_string(self, use_diff: bool = True, saving_file=False) -> ... FILE: paddleformers/transformers/ernie4_5_moe/modeling.py function mtp_hidden_states_set_zero (line 62) | def mtp_hidden_states_set_zero(hidden_states, inbatch_pack_offset): class Ernie4_5_MoeRotaryEmbedding (line 91) | class Ernie4_5_MoeRotaryEmbedding(nn.Layer): method __init__ (line 92) | def __init__(self, config): method compute_default_rope_parameters (line 110) | def compute_default_rope_parameters( method forward (line 135) | def forward(self, x, position_ids): class Ernie4_5_MoeMLP (line 161) | class Ernie4_5_MoeMLP(Ernie4_5MLP): method __init__ (line 164) | def __init__(self, config, hidden_size, moe_intermediate_size, layer_i... method forward (line 184) | def forward(self, x): class FakeMoERouterLoss (line 213) | class FakeMoERouterLoss(PyLayer): method forward (line 221) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss): method backward (line 240) | def backward(ctx, out_grad): class Ernie4_5_MoeSparseMoeBlock (line 259) | class Ernie4_5_MoeSparseMoeBlock(MOEAllGatherLayerV2): method __init__ (line 260) | def __init__(self, config, layer_idx): class Ernie4_5_MoeDecoderLayer (line 340) | class Ernie4_5_MoeDecoderLayer(nn.Layer): method __init__ (line 347) | def __init__(self, config, layer_idx): method forward (line 410) | def forward( class Ernie4_5_MoePretrainedModel (line 503) | class Ernie4_5_MoePretrainedModel(PretrainedModel): method _gen_aoa_config (line 522) | def _gen_aoa_config(cls, config: Ernie4_5_MoeConfig): method _gen_inv_aoa_config (line 571) | def _gen_inv_aoa_config(cls, config: Ernie4_5_MoeConfig): class Ernie4_5_MoeModel (line 649) | class Ernie4_5_MoeModel(Ernie4_5_MoePretrainedModel): method __init__ (line 652) | def __init__(self, config: Ernie4_5_MoeConfig): method recompute_training (line 746) | def recompute_training( method forward (line 793) | def forward( class Ernie4_5_MoeForCausalLM (line 1104) | class Ernie4_5_MoeForCausalLM(Ernie4_5_MoePretrainedModel): method __init__ (line 1109) | def __init__(self, config): method prepare_attention_mask_for_generation (line 1129) | def prepare_attention_mask_for_generation(self, input_ids, pad_token_i... method forward (line 1135) | def forward( class Ernie4_5_MoeForCausalLMPipe (line 1225) | class Ernie4_5_MoeForCausalLMPipe(GeneralModelForCausalLMPipe): FILE: paddleformers/transformers/ernie4_5_moe_vl/image_processor.py function smart_resize (line 50) | def smart_resize( function is_scaled_image (line 100) | def is_scaled_image(image: np.ndarray) -> bool: function make_batched_images (line 111) | def make_batched_images(images) -> List[List[ImageInput]]: function make_batched_videos (line 134) | def make_batched_videos(videos) -> List[VideoInput]: class Ernie4_5_VLImageProcessor (line 151) | class Ernie4_5_VLImageProcessor(BaseImageProcessor): method __init__ (line 192) | def __init__( method set_pixels (line 226) | def set_pixels(self, min_pixels=None, max_pixels=None, msg=""): method get_smarted_resize (line 239) | def get_smarted_resize(self, height, width, min_pixels=None, max_pixel... method _preprocess (line 255) | def _preprocess( method preprocess (line 406) | def preprocess( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/comm_utils.py function all_gather_varlen (line 22) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py class Ernie4_5_Config (line 52) | class Ernie4_5_Config(PretrainedConfig): method __init__ (line 65) | def __init__( class Ernie4_5_MoeConfig (line 226) | class Ernie4_5_MoeConfig(Ernie4_5_Config): method __init__ (line 249) | def __init__( method multimodel_experts (line 397) | def multimodel_experts(self) -> bool: method use_moe (line 402) | def use_moe(self) -> bool: method to_json_string (line 411) | def to_json_string(self, use_diff: bool = True, saving_file=False) -> ... class Ernie4_5_VLMoeConfig (line 456) | class Ernie4_5_VLMoeConfig(Ernie4_5_MoeConfig): method __init__ (line 498) | def __init__( method multimodel_experts (line 553) | def multimodel_experts(self) -> bool: method use_moe (line 558) | def use_moe(self) -> bool: method to_dict (line 567) | def to_dict(self, saving_file=False): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/dfnrope/activation.py class NewGELUActivation (line 24) | class NewGELUActivation(nn.Layer): method forward (line 30) | def forward(self, input: Tensor) -> Tensor: class GELUActivation (line 43) | class GELUActivation(nn.Layer): method __init__ (line 51) | def __init__(self, use_gelu_python: bool = False): method _gelu_python (line 62) | def _gelu_python(self, input: Tensor) -> Tensor: method forward (line 72) | def forward(self, input: Tensor) -> Tensor: class FastGELUActivation (line 83) | class FastGELUActivation(nn.Layer): method forward (line 88) | def forward(self, input: Tensor) -> Tensor: class QuickGELUActivation (line 99) | class QuickGELUActivation(nn.Layer): method forward (line 104) | def forward(self, input: Tensor) -> Tensor: class ClippedGELUActivation (line 115) | class ClippedGELUActivation(nn.Layer): method __init__ (line 128) | def __init__(self, min: float, max: float): method forward (line 136) | def forward(self, x: Tensor) -> Tensor: class SiLUActivation (line 147) | class SiLUActivation(nn.Layer): method forward (line 156) | def forward(self, input: Tensor) -> Tensor: class MishActivation (line 167) | class MishActivation(nn.Layer): method forward (line 173) | def forward(self, input: Tensor) -> Tensor: class LinearActivation (line 184) | class LinearActivation(nn.Layer): method forward (line 189) | def forward(self, input: Tensor) -> Tensor: class ClassInstantier (line 200) | class ClassInstantier(OrderedDict): method __getitem__ (line 203) | def __getitem__(self, key): function get_activation (line 229) | def get_activation(activation_string): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/dfnrope/configuration.py class DFNRopeVisionTransformerConfig (line 24) | class DFNRopeVisionTransformerConfig(PretrainedConfig): method __init__ (line 33) | def __init__( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/dfnrope/modeling.py class _AllToAll (line 37) | class _AllToAll(paddle.autograd.PyLayer): method forward (line 39) | def forward( method backward (line 84) | def backward(ctx, *grad_output): function rotate_half (line 97) | def rotate_half(x): function apply_rotary_pos_emb_vision (line 104) | def apply_rotary_pos_emb_vision(tensor: paddle.Tensor, freqs: paddle.Ten... function qkv_reshard_head (line 126) | def qkv_reshard_head(tensor, group): class VisionFlashAttention2 (line 150) | class VisionFlashAttention2(nn.Layer): method __init__ (line 153) | def __init__(self, dim: int, num_heads: int = 16) -> None: method forward (line 165) | def forward( class PatchEmbed (line 212) | class PatchEmbed(nn.Layer): method __init__ (line 215) | def __init__( method forward (line 233) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: class VisionMlp (line 248) | class VisionMlp(nn.Layer): method __init__ (line 251) | def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None: method forward (line 257) | def forward(self, x) -> paddle.Tensor: class VisionRotaryEmbedding (line 268) | class VisionRotaryEmbedding(nn.Layer): method __init__ (line 271) | def __init__(self, dim: int, theta: float = 10000.0) -> None: method forward (line 280) | def forward(self, seqlen: int) -> paddle.Tensor: class DFNRopeVisionBlock (line 293) | class DFNRopeVisionBlock(nn.Layer): method __init__ (line 296) | def __init__(self, config, attn_implementation: str = "sdpa") -> None: method forward (line 315) | def forward(self, hidden_states, startend_row_indices, rotary_pos_emb,... class DFNRopeVisionTransformerPretrainedModel (line 335) | class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): method __init__ (line 340) | def __init__(self, config) -> None: method get_dtype (line 366) | def get_dtype(self) -> paddle.dtype: method rot_pos_emb (line 373) | def rot_pos_emb(self, grid_thw, num_pad=0): method forward (line 419) | def forward(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tenso... method extract_feature (line 471) | def extract_feature(self, images, grid_thw): method _extract_feature (line 511) | def _extract_feature(self, images, grid_thw, num_pad=0): method _get_tensor_parallel_mappings (line 519) | def _get_tensor_parallel_mappings(cls, config, is_split=True): method set_state_dict (line 525) | def set_state_dict(self, state_dict, *args, **kwargs): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/dfnrope/modeling_pp.py class DFNRopeVisionTransformerPipe (line 31) | class DFNRopeVisionTransformerPipe(DFNRopeVisionTransformerPretrainedMod... method __init__ (line 36) | def __init__(self, config, use_full_recompute=False): method extract_feature (line 53) | def extract_feature(self, images, grid_thw, second_fwd=False): method _extract_feature (line 99) | def _extract_feature(self, images, grid_thw, num_pad=0): method forward (line 106) | def forward(self, args): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/distributed/__init__.py function parallel_matmul (line 73) | def parallel_matmul( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/distributed/common_dist_utils.py function get_hcg (line 68) | def get_hcg(): function _parallel_matmul (line 75) | def _parallel_matmul( function scatter_axis (line 130) | def scatter_axis(input, group=None, axis=0): function mp_slice (line 162) | def mp_slice(x, indices=None, group=None, axis=0): function all_gather_varlen (line 191) | def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True): class ReduceScatterGroupOp (line 226) | class ReduceScatterGroupOp(PyLayer): method forward (line 232) | def forward(ctx, input, group=None): method backward (line 247) | def backward(ctx, grad): class AllGatherGroupOp (line 258) | class AllGatherGroupOp(PyLayer): method forward (line 264) | def forward(ctx, input, group=None): method backward (line 279) | def backward(ctx, grad): class RRColumnSequenceParallelLinear (line 290) | class RRColumnSequenceParallelLinear(ColumnSequenceParallelLinear): method __init__ (line 295) | def __init__( method forward (line 346) | def forward(self, x): class RRRowSequenceParallelLinear (line 378) | class RRRowSequenceParallelLinear(RowSequenceParallelLinear): method __init__ (line 383) | def __init__( method forward (line 425) | def forward(self, x): class AllGatherVarlenOp (line 468) | class AllGatherVarlenOp(PyLayer): method forward (line 480) | def forward(ctx, input): method backward (line 524) | def backward(ctx, grad): function sequence_parallel_sparse_mask_labels (line 547) | def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100): function get_async_loader (line 592) | def get_async_loader(): function hack_offload_wait (line 606) | def hack_offload_wait(task): function hack_reload_wait (line 611) | def hack_reload_wait(task): function all_gather_group (line 616) | def all_gather_group(input, group=None, axis=0): function reduce_scatter_group (line 652) | def reduce_scatter_group(input, group=None): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/fusion_ops/__init__.py function fusion_flash_attention (line 40) | def fusion_flash_attention( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/fusion_ops/common_fusion_ops.py function _fusion_flash_attention (line 38) | def _fusion_flash_attention( function _gen_from_sparse_attn_mask_indices (line 136) | def _gen_from_sparse_attn_mask_indices(attn_mask_start_row_indices, dtype): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/fusion_ops/npu_fusion_ops.py function npu_combining (line 24) | def npu_combining(x, combine_weights, scatter_index, hard_gate=False): function npu_cal_aux_loss_func (line 40) | def npu_cal_aux_loss_func( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/longcontext_ops.py class MaxHeap (line 30) | class MaxHeap: method __init__ (line 36) | def __init__(self, data=None): method push (line 48) | def push(self, item): method pop (line 56) | def pop(self): method top (line 65) | def top(self): method is_empty (line 73) | def is_empty(self): method __len__ (line 79) | def __len__(self): function redistribute_tokens (line 86) | def redistribute_tokens(piles): class TensorBalanceByTokenType (line 141) | class TensorBalanceByTokenType(PyLayer): method forward (line 145) | def forward( method backward (line 306) | def backward(ctx, tensor_grad, token_type_ids_grad): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/loss/dpo.py class ErnieDPOCriterion (line 34) | class ErnieDPOCriterion(DPOCriterion): method dpo_logps (line 37) | def dpo_logps( method dpo_loss (line 183) | def dpo_loss( method forward (line 268) | def forward( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py function calc_lm_head_logits (line 61) | def calc_lm_head_logits(config, hidden_states, weight, bias, tensor_para... function subbatch (line 104) | def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_ar... class FusedDropoutImpl (line 167) | class FusedDropoutImpl(nn.Layer): method __init__ (line 184) | def __init__(self, prob, mode): method forward (line 197) | def forward(self, x, y): class RMSNorm (line 215) | class RMSNorm(nn.Layer): method __init__ (line 225) | def __init__(self, config): method forward (line 245) | def forward(self, hidden_states): class LayerNorm (line 271) | class LayerNorm(nn.LayerNorm): method __init__ (line 282) | def __init__(self, config): class RopeEmbedding (line 296) | class RopeEmbedding(nn.Layer): method __init__ (line 314) | def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_a... method forward (line 331) | def forward(self, seq_length, position_ids=None): method apply_rotary (line 359) | def apply_rotary(self, rp, q, k): method apply_rotary_3d (line 398) | def apply_rotary_3d(self, rp, q, k, position_ids): method forward_single (line 479) | def forward_single(self, position_ids): method apply_rotary_single (line 497) | def apply_rotary_single(x, rope_emb): class Ernie4_5_MLP (line 505) | class Ernie4_5_MLP(nn.Layer): method __init__ (line 510) | def __init__(self, config, layer_idx=0): method forward (line 578) | def forward(self, x): class Ernie4_5_Attention (line 602) | class Ernie4_5_Attention(nn.Layer): method __init__ (line 605) | def __init__(self, config, layer_idx=0): method set_attn_func (line 745) | def set_attn_func(self): method forward (line 758) | def forward( method _flash_attention_wrapper (line 849) | def _flash_attention_wrapper( method core_attn (line 885) | def core_attn( method rope_attn (line 966) | def rope_attn( class FusedHeadParallelCrossEntropy (line 1073) | class FusedHeadParallelCrossEntropy(PyLayer): method forward (line 1081) | def forward( method backward (line 1198) | def backward(ctx, loss_all_grad, labels_all_grad): class ErniePretrainingCriterion (line 1313) | class ErniePretrainingCriterion(paddle.nn.Layer): method __init__ (line 1316) | def __init__(self, config, return_tuple=True): method forward (line 1338) | def forward(self, prediction_scores, masked_lm_labels, loss_mask=None): method forward_impl_with_fused_head_loss_fn (line 1414) | def forward_impl_with_fused_head_loss_fn( method forward_impl_with_calc_logits (line 1470) | def forward_impl_with_calc_logits( method loss_impl (line 1497) | def loss_impl(self, prediction_scores, masked_lm_labels): method forward_impl (line 1511) | def forward_impl(self, prediction_scores, masked_lm_labels, loss_mask=... class Ernie4_5_LMHead (line 1579) | class Ernie4_5_LMHead(nn.Layer): method __init__ (line 1582) | def __init__(self, config): method forward (line 1635) | def forward(self, hidden_states, tensor_parallel_output=None): method sharded_state_dict (line 1679) | def sharded_state_dict( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py function mtp_hidden_states_set_zero (line 65) | def mtp_hidden_states_set_zero(hidden_states, inbatch_pack_offset): class BaseModelOutputWithPastAndCrossAttentions (line 95) | class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput): class CausalLMOutputWithCrossAttentions (line 120) | class CausalLMOutputWithCrossAttentions(_CausalLMOutput): function get_gate (line 154) | def get_gate( function _parse_moe_group (line 258) | def _parse_moe_group( class Ernie4_5_MoeMLP (line 314) | class Ernie4_5_MoeMLP(Ernie4_5_MLP): method __init__ (line 317) | def __init__(self, config, layer_idx=0): method forward (line 335) | def forward(self, x): class FakeMoERouterLoss (line 359) | class FakeMoERouterLoss(PyLayer): method forward (line 367) | def forward(ctx, x, router_loss, num_acc_steps, enable_delay_scale_loss): method backward (line 386) | def backward(ctx, out_grad): class Ernie4_5_DecoderLayer (line 405) | class Ernie4_5_DecoderLayer(nn.Layer): method __init__ (line 412) | def __init__(self, config, layer_idx): method _init_shared_experts (line 535) | def _init_shared_experts(self): method _init_gate_and_experts (line 558) | def _init_gate_and_experts(self, layer_idx): method forward (line 622) | def forward( method model_parallel_dropout (line 766) | def model_parallel_dropout(self): class Ernie4_5_PretrainedModel (line 778) | class Ernie4_5_PretrainedModel(PretrainedModel): method _get_tensor_parallel_mappings (line 796) | def _get_tensor_parallel_mappings(cls, config, is_split=True): class Ernie4_5_Model (line 924) | class Ernie4_5_Model(Ernie4_5_PretrainedModel): method __init__ (line 927) | def __init__(self, config: Ernie4_5_MoeConfig): method get_input_embeddings (line 1003) | def get_input_embeddings(self): method set_input_embeddings (line 1011) | def set_input_embeddings(self, value): method recompute_training (line 1020) | def recompute_training( method forward (line 1067) | def forward( class ErniePretrainingCriterion (line 1372) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase): method __init__ (line 1375) | def __init__(self, config, return_tuple=True): method forward (line 1396) | def forward( class Ernie4_5_MoeForCausalLM (line 1464) | class Ernie4_5_MoeForCausalLM(Ernie4_5_PretrainedModel): method __init__ (line 1469) | def __init__(self, config): method set_state_dict (line 1505) | def set_state_dict(self, state_dict, *args, **kwargs): method get_input_embeddings (line 1515) | def get_input_embeddings(self): method set_input_embeddings (line 1519) | def set_input_embeddings(self, value): method get_output_embeddings (line 1523) | def get_output_embeddings(self): method set_output_embeddings (line 1527) | def set_output_embeddings(self, new_embeddings): method set_decoder (line 1531) | def set_decoder(self, decoder): method get_decoder (line 1535) | def get_decoder(self): method prepare_attention_mask_for_generation (line 1543) | def prepare_attention_mask_for_generation(self, input_ids, pad_token_i... method prepare_inputs_for_generation (line 1547) | def prepare_inputs_for_generation( method update_model_kwargs_for_generation (line 1611) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is... method forward (line 1665) | def forward( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py function parse_args (line 28) | def parse_args(args, mtp_enable=False): function get_pp_vp_split_layers (line 92) | def get_pp_vp_split_layers(config, skip_recompute_num=-1): function create_skip_config_for_refined_recompute (line 162) | def create_skip_config_for_refined_recompute(layer_idx, config): class Ernie4_5_EmbeddingPipe (line 210) | class Ernie4_5_EmbeddingPipe(nn.Layer): method __init__ (line 213) | def __init__(self, config): method embedding_weight (line 234) | def embedding_weight(self): method forward (line 243) | def forward(self, args): class EmptyLayer (line 326) | class EmptyLayer(nn.Layer): method __init__ (line 331) | def __init__(self): method forward (line 341) | def forward(self, x): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl.py class TokenType (line 64) | class TokenType: function monkey_patch_param_hook (line 76) | def monkey_patch_param_hook(param): function get_backbone_lm_param_regex (line 124) | def get_backbone_lm_param_regex(config): function create_freeze_hook (line 156) | def create_freeze_hook(name, param, factor=0.0): function create_partial_freeze_hook (line 168) | def create_partial_freeze_hook(name, param, factor, index): class ModalityDetach (line 181) | class ModalityDetach(PyLayer): method forward (line 185) | def forward( method backward (line 211) | def backward(ctx, *last_hidden_grad): class VariableResolutionResamplerModel (line 218) | class VariableResolutionResamplerModel(nn.Layer): method __init__ (line 223) | def __init__(self, in_dim, out_dim, spatial_conv_size, temporal_conv_s... method spatial_conv_reshape (line 288) | def spatial_conv_reshape(self, x, spatial_conv_size): method forward (line 296) | def forward(self, x, image_mask, token_type_ids, image_type_ids, grid_... method _get_tensor_parallel_mappings (line 420) | def _get_tensor_parallel_mappings(cls, config, is_split=True): class ErniePretrainingCriterion (line 434) | class ErniePretrainingCriterion(ErniePretrainingCriterionBase): method __init__ (line 439) | def __init__(self, config): method forward (line 445) | def forward( method update_log (line 550) | def update_log(self, loss, token_type_ids_untouched): function calc_multimodal_logits (line 568) | def calc_multimodal_logits( class Ernie4_5_MoeVLHead (line 668) | class Ernie4_5_MoeVLHead(Ernie4_5_LMHead): method __init__ (line 671) | def __init__(self, config): method forward (line 684) | def forward(self, hidden_state, token_type_ids_labels, use_cache=False): class Ernie4_5_VLMoeForConditionalGeneration (line 728) | class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM): method __init__ (line 753) | def __init__(self, config: Ernie4_5_VLMoeConfig): method add_vision_model (line 787) | def add_vision_model( method add_image_preprocess (line 795) | def add_image_preprocess(self, preprocess): method _get_tensor_parallel_mappings (line 801) | def _get_tensor_parallel_mappings(cls, config, is_split=True): method _gen_aoa_config (line 831) | def _gen_aoa_config(cls, config): method _gen_inv_aoa_config (line 901) | def _gen_inv_aoa_config(cls, config): method _set_modality_param_mapping (line 970) | def _set_modality_param_mapping(self): method update_params_stat (line 989) | def update_params_stat(self, param_group, stop_gradient): method freeze_vision (line 1003) | def freeze_vision(self): method vision_forward (line 1012) | def vision_forward( method vision_mapping_forward (line 1038) | def vision_mapping_forward( method get_rope_index (line 1073) | def get_rope_index( method get_token_type_ids (line 1221) | def get_token_type_ids( method prepare_inputs_for_generation (line 1315) | def prepare_inputs_for_generation( method _post_init (line 1383) | def _post_init(self, original_init, *args, **kwargs): method forward (line 1396) | def forward( method _resolve_prefix_keys (line 1646) | def _resolve_prefix_keys(state_keys_base, state_keys_real, ignore_erro... FILE: paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py class PipelinePretrainedModel (line 75) | class PipelinePretrainedModel(PipelinePretrainedModelBase): method _set_pipeline_name_mapping (line 77) | def _set_pipeline_name_mapping(self, mappings=None): class ErniePretrainingCriterionPipe (line 143) | class ErniePretrainingCriterionPipe(ErniePretrainingCriterion): method __init__ (line 148) | def __init__(self, config): method forward (line 158) | def forward(self, logits, labels): function modality_detach (line 192) | def modality_detach(wrapped_class): function inbatch_pack_offset_to_attn_mask_start_row_indices (line 252) | def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offs... class ErnieMoELMHeadPipe (line 268) | class ErnieMoELMHeadPipe(Ernie4_5_MoeVLHead): method __init__ (line 273) | def __init__(self, config): method embedding_weight (line 277) | def embedding_weight(self): method forward (line 281) | def forward(self, args): class ErnieVLEmbeddingPipe (line 314) | class ErnieVLEmbeddingPipe(Ernie4_5_EmbeddingPipe): method __init__ (line 317) | def __init__(self, config, use_full_recompute=False): method forward (line 339) | def forward(self, args): class ErnieDecoderLayerPipe (line 498) | class ErnieDecoderLayerPipe(ErnieMoEDecoderLayer): method __init__ (line 505) | def __init__(self, config, layer_idx, use_full_recompute=False): method forward (line 513) | def forward(self, args): class LayerNormPipe (line 590) | class LayerNormPipe(LayerNorm): method __init__ (line 593) | def __init__(self, config): method forward (line 599) | def forward(self, args): class RMSNormPipe (line 608) | class RMSNormPipe(RMSNorm): method __init__ (line 611) | def __init__(self, config): method forward (line 616) | def forward(self, args): function multimodal_data_provider (line 624) | def multimodal_data_provider( function exchange_pp_imgs_with_thw (line 746) | def exchange_pp_imgs_with_thw( function get_len_and_offset (line 795) | def get_len_and_offset(input_len, group): class Ernie4_5_VLModel (line 807) | class Ernie4_5_VLModel(nn.Layer): method __init__ (line 810) | def __init__(self, config): class Ernie4_5_VLMoeForConditionalGenerationPipe (line 814) | class Ernie4_5_VLMoeForConditionalGenerationPipe(PipelinePretrainedModel... method _prepare_pipeline_inputs_func (line 829) | def _prepare_pipeline_inputs_func(self, data: Union[List, Dict]): method __init__ (line 1154) | def __init__(self, config, recompute=False): method add_vision_model (line 1298) | def add_vision_model( method add_image_preprocess (line 1305) | def add_image_preprocess(self, preprocess): method set_pp_need_data_degree (line 1310) | def set_pp_need_data_degree(self, p): method _set_modality_param_mapping (line 1321) | def _set_modality_param_mapping(self, use_stop_grad=True): method update_params_stat (line 1357) | def update_params_stat(self, param_group, stop_gradient): method freeze_vision (line 1371) | def freeze_vision(self): method state_dict (line 1381) | def state_dict(self, *args, **kwargs): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/moe/moe_all_gather_layer.py function allgather_async (line 51) | def allgather_async(input, group=None): function reduce_scatter_async (line 76) | def reduce_scatter_async(input, group=None): class AllGatherAsync (line 111) | class AllGatherAsync(PyLayer): method forward (line 117) | def forward(ctx, input, *fn_args, group=None, fn=None, is_first_fwd=Fa... method backward (line 141) | def backward(ctx, grad, *fn_out_grads): class ReshardCombineWeight (line 162) | class ReshardCombineWeight(PyLayer): method forward (line 168) | def forward(ctx, input, group=None): method backward (line 188) | def backward(ctx, grad): class AlltoAllSmart (line 205) | class AlltoAllSmart(paddle.autograd.PyLayer): method forward (line 211) | def forward( method backward (line 402) | def backward( class AlltoAllSmartXPU (line 483) | class AlltoAllSmartXPU(paddle.autograd.PyLayer): method forward (line 489) | def forward( method backward (line 679) | def backward( class MOEAllGatherLayerV2 (line 764) | class MOEAllGatherLayerV2(MOELayer): method __init__ (line 769) | def __init__( method forward (line 818) | def forward( method fused_gate_logits_process_fused (line 1063) | def fused_gate_logits_process_fused(self, gate_logits_lm, gate_logits_... method fused_gate_and_dispatch (line 1148) | def fused_gate_and_dispatch(self, input, token_type_ids=None, global_d... method forward_experts (line 1358) | def forward_experts(self, *dispatched_input): method calc_router_loss_and_logging (line 1441) | def calc_router_loss_and_logging( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/moe/moe_layer.py class MoEStatics (line 51) | class MoEStatics(nn.Layer): method __init__ (line 57) | def __init__(self, config, layer_idx): class GateCombine (line 93) | class GateCombine(PyLayer): method forward (line 99) | def forward(ctx, x, combine_weights, scatter_index): method backward (line 118) | def backward(ctx, grad_y, *_): function combining (line 138) | def combining(x, combine_weights, scatter_index, hard_gate=False): class AlltoAll (line 164) | class AlltoAll(PyLayer): method forward (line 170) | def forward(ctx, x, group, sync_op=True): method backward (line 194) | def backward(ctx, *dx): class AlltoAllAsync (line 207) | class AlltoAllAsync(PyLayer): method forward (line 213) | def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): method backward (line 247) | def backward(ctx, dx_out, *fn_out_grads): function detach_and_requires_grad_ (line 277) | def detach_and_requires_grad_(*args): class FakeClone (line 294) | class FakeClone(paddle.autograd.PyLayer): method forward (line 300) | def forward(ctx, input): method backward (line 318) | def backward(ctx, grad_output): function manual_backward (line 331) | def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]): class MOELayer (line 382) | class MOELayer(nn.Layer): method __init__ (line 387) | def __init__( method forward_experts (line 468) | def forward_experts(self, dispatched_input): method fused_gate_logits_process (line 513) | def fused_gate_logits_process(self, gate_logits, token_type_ids=None, ... method gate_and_dispatch (line 577) | def gate_and_dispatch(self, input, token_type_ids=None): method _calc_router_loss (line 667) | def _calc_router_loss( method calc_router_loss_and_logging (line 719) | def calc_router_loss_and_logging( method combine_expert_output (line 815) | def combine_expert_output(self, expert_output, combine_weights, scatte... method forward_single_stage (line 835) | def forward_single_stage(self, dispatched_input, stage_id): method all2all_expert_overlap (line 849) | def all2all_expert_overlap(self, x, group): method forward (line 877) | def forward( FILE: paddleformers/transformers/ernie4_5_moe_vl/model/moe/topk_gate.py function masked_fill (line 36) | def masked_fill(x, mask, value): function compute_optimal_transport (line 53) | def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters:... function cast_if_needed (line 84) | def cast_if_needed(x, dtype): class FusedGateDetachMatmul (line 98) | class FusedGateDetachMatmul(paddle.autograd.PyLayer): method forward (line 105) | def forward(ctx, x, w): method backward (line 122) | def backward(ctx, y_grad): function gate_detach_matmul (line 143) | def gate_detach_matmul(x, weight, use_fuse): class TopKGate (line 162) | class TopKGate(nn.Layer): method __init__ (line 167) | def __init__(self, config, layer_idx: int, group, gate_weight=None) ->... method _create_gate_parameter (line 273) | def _create_gate_parameter(self): method get_gate_weight (line 312) | def get_gate_weight(self, transform_weight): method forward (line 340) | def forward( method get_capacity (line 374) | def get_capacity(self, num_tokens, cap_factor=None): method _cal_aux_loss (line 400) | def _cal_aux_loss( method _cal_z_loss (line 499) | def _cal_z_loss(self, logits, loss_mask=None): method _cal_orthogonal_loss_opt_each_weight (line 520) | def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): method _cal_orthogonal_loss (line 550) | def _cal_orthogonal_loss(self, weight_id=None, use_group=None): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/refined_recompute/utils.py function is_second_fwd (line 35) | def is_second_fwd(): function set_second_fwd (line 43) | def set_second_fwd(value=True): class CustomSavedTensorsHooks (line 51) | class CustomSavedTensorsHooks: method __init__ (line 57) | def __init__(self, pack_hook, unpack_hook) -> None: method __enter__ (line 72) | def __enter__(self) -> None: method __exit__ (line 79) | def __exit__(self, *args: object) -> None: function create_skip_config_for_refined_recompute (line 91) | def create_skip_config_for_refined_recompute(layer_idx, config): class RefinedRcomputeQueue (line 146) | class RefinedRcomputeQueue: method __init__ (line 154) | def __init__(self): method update (line 160) | def update(self, queue: queue.Queue, queue_name="unknown"): method check (line 177) | def check(self): class _NoopSaveInputs (line 193) | class _NoopSaveInputs(paddle.autograd.PyLayer): method forward (line 200) | def forward(ctx, *args): method backward (line 208) | def backward(ctx, *args): class RefinedRecomputeFunction (line 213) | class RefinedRecomputeFunction: method __init__ (line 216) | def __init__(self): method post_init (line 222) | def post_init(self, function, function_name=None): method __call__ (line 235) | def __call__(self, function, *args, **kwargs): method forward (line 245) | def forward(self, *args, **kwargs): method _first_fwd (line 253) | def _first_fwd(self, *args, **kwargs): method _second_fwd (line 303) | def _second_fwd(self, *args, **kwargs): method parse_to_args (line 316) | def parse_to_args(self, *args, **kwargs): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/sequence_parallel_utils.py function contiguous (line 31) | def contiguous(self): function _md5sum (line 42) | def _md5sum(self): class AllGatherVarlenOpV2 (line 53) | class AllGatherVarlenOpV2(PyLayer): method forward (line 59) | def forward(ctx, input, indices, axis=0, group=None): method backward (line 67) | def backward(ctx, grad): class SliceVarlenOp (line 72) | class SliceVarlenOp(PyLayer): method forward (line 89) | def forward( method backward (line 102) | def backward(ctx, grad): class ScatterOp (line 107) | class ScatterOp(PyLayer): method forward (line 120) | def forward(ctx, input, axis=0, group=None): method backward (line 127) | def backward(ctx, grad): class GatherOp (line 135) | class GatherOp(PyLayer): method forward (line 143) | def forward(ctx, input, axis=0, group=None): method backward (line 150) | def backward(ctx, grad): class AllGatherOp (line 155) | class AllGatherOp(PyLayer): method forward (line 163) | def forward(ctx, input, group=None): method backward (line 171) | def backward(ctx, grad): function mark_as_sequence_parallel_parameter (line 183) | def mark_as_sequence_parallel_parameter(parameter): FILE: paddleformers/transformers/ernie4_5_moe_vl/model/utils/misc.py class SmoothedValue (line 44) | class SmoothedValue: method __init__ (line 49) | def __init__( method update (line 58) | def update(self, value): method global_avg (line 74) | def global_avg(self): method reset (line 82) | def reset(self): class TrainingLogs (line 88) | class TrainingLogs: method __new__ (line 103) | def __new__(cls, *args, **kw): method __init__ (line 108) | def __init__(self): method set_trainer_interval (line 117) | def set_trainer_interval(self, trainer, logging_interval): method global_meters_keys (line 125) | def global_meters_keys(self): method global_meters_keys (line 130) | def global_meters_keys(self, lst): method enable_skip_zero (line 134) | def enable_skip_zero(self, keys=None): method update (line 146) | def update(self, **kwargs): method is_enabled (line 151) | def is_enabled(self): method __setitem__ (line 157) | def __setitem__(self, k, v): method __getitem__ (line 165) | def __getitem__(self, v): method __getattr__ (line 169) | def __getattr__(self, attr): method dict (line 187) | def dict(self, use_async=False): method reset (line 246) | def reset(self): method take_snapshot (line 252) | def take_snapshot(self): method restore_snapshot (line 256) | def restore_snapshot(self): FILE: paddleformers/transformers/ernie4_5_moe_vl/processor.py class Ernie4_5_VLProcessor (line 39) | class Ernie4_5_VLProcessor(ProcessorMixin): method __init__ (line 70) | def __init__( method _build_token_type_mapping (line 121) | def _build_token_type_mapping(self) -> Dict[Any, int]: method _download_image (line 128) | def _download_image( method _download_video (line 144) | def _download_video(self, item: Dict): method process_vision_info (line 154) | def process_vision_info(self, messages: List[Dict[str, Any]]): method __call__ (line 174) | def __call__( method _add_special_token (line 232) | def _add_special_token(self, token: Union[str, int], outputs: Dict) ->... method _add_text (line 241) | def _add_text(self, text: str, outputs: Dict) -> None: method _add_image (line 252) | def _add_image(self, img: Image.Image, outputs: Dict) -> None: method render_frame_timestamp (line 287) | def render_frame_timestamp(self, frame, timestamp, font_rate=0.1): method _add_video (line 290) | def _add_video(self, pixel_stack, outputs: Dict) -> None: method _load_and_process_video (line 328) | def _load_and_process_video(self, url: str, item: Dict) -> List[Image.... method _set_video_frame_args (line 360) | def _set_video_frame_args(self, video_frame_args, video_meta): method _compute_3d_positions (line 398) | def _compute_3d_positions(self, t: int, h: int, w: int, start_idx: int... method _pack_outputs (line 409) | def _pack_outputs(self, outs: Dict) -> Dict[str, Any]: method model_input_names (line 427) | def model_input_names(self): FILE: paddleformers/transformers/ernie4_5_moe_vl/tokenizer.py class Ernie4_5_VLTokenizer (line 30) | class Ernie4_5_VLTokenizer(PreTrainedTokenizer): method __init__ (line 43) | def __init__( method space_token (line 98) | def space_token(self): method space_token_id (line 103) | def space_token_id(self): method gend_token (line 108) | def gend_token(self): method gend_token_id (line 113) | def gend_token_id(self): method im_start_id (line 118) | def im_start_id(self): method im_end_id (line 123) | def im_end_id(self): method vocab_size (line 128) | def vocab_size(self): method get_vocab (line 132) | def get_vocab(self): method _tokenize (line 138) | def _tokenize(self, text): method _convert_token_to_id (line 142) | def _convert_token_to_id(self, token): method _convert_id_to_token (line 146) | def _convert_id_to_token(self, id): method convert_tokens_to_string (line 150) | def convert_tokens_to_string(self, tokens): method prepare_for_model (line 167) | def prepare_for_model(self, *args, **kwargs): method save_vocabulary (line 174) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st... method _decode (line 205) | def _decode(self, *args, **kwargs): method _pad (line 219) | def _pad( FILE: paddleformers/transformers/ernie4_5_moe_vl/vision_process.py function get_filename (line 43) | def get_filename(url=None): function file_download (line 62) | def file_download(url, download_dir, save_to_disk=False, retry=0, retry_... function get_downloadable (line 94) | def get_downloadable(url, download_dir=RAW_VIDEO_DIR, save_to_disk=False... function get_downloadable_image (line 113) | def get_downloadable_image(download_path, need_exif_info, retry_max_time... function read_video_decord (line 187) | def read_video_decord(video_path, save_to_disk): function get_frame_indices (line 205) | def get_frame_indices( function read_frames_decord (line 271) | def read_frames_decord( function render_single_image_with_timestamp (line 315) | def render_single_image_with_timestamp(image: Image, number: str, rate: ... function timestamp_converting (line 343) | def timestamp_converting(time_stamp_in_seconds): function render_frame_timestamp (line 365) | def render_frame_timestamp(frame, timestamp, font_rate=0.1): FILE: paddleformers/transformers/feature_extraction_utils.py class BatchFeature (line 34) | class BatchFeature(UserDict): method __init__ (line 47) | def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type:... method __getitem__ (line 51) | def __getitem__(self, item: str): method __getattr__ (line 61) | def __getattr__(self, item: str): method __getstate__ (line 67) | def __getstate__(self): method __setstate__ (line 70) | def __setstate__(self, state): method keys (line 74) | def keys(self): method values (line 77) | def values(self): method items (line 80) | def items(self): method convert_to_tensors (line 83) | def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorTy... method to (line 145) | def to(self, *args, **kwargs) -> "BatchFeature": class FeatureExtractionMixin (line 202) | class FeatureExtractionMixin(PushToHubMixin): method __init__ (line 213) | def __init__(self, **kwargs): method _set_processor_class (line 225) | def _set_processor_class(self, processor_class: str): method from_pretrained (line 230) | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.... method save_pretrained (line 280) | def save_pretrained(self, save_directory: Union[str, os.PathLike], **k... method get_feature_extractor_dict (line 305) | def get_feature_extractor_dict( method from_dict (line 357) | def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs): method to_dict (line 392) | def to_dict(self, *args, **kwargs) -> Dict[str, Any]: method from_json_file (line 405) | def from_json_file(cls, json_file: Union[str, os.PathLike]): method to_json_string (line 423) | def to_json_string(self) -> str: method to_json_file (line 444) | def to_json_file(self, json_file_path: Union[str, os.PathLike]): method __repr__ (line 455) | def __repr__(self): FILE: paddleformers/transformers/fp8_utils.py function swiglu (line 26) | def swiglu(x, y=None): function get_sm_num (line 54) | def get_sm_num(): function set_parameter_color (line 58) | def set_parameter_color( function extract_first_if_tuple (line 74) | def extract_first_if_tuple(x): function _get_fp8_weight_and_scale (line 78) | def _get_fp8_weight_and_scale(weight, stacked=False, transpose=False): function fused_stack_quant (line 93) | def fused_stack_quant(expert_weight_list, transpose=False): function weight_quant (line 107) | def weight_quant(weight, transpose=False): class FP8LinearFunctionBase (line 136) | class FP8LinearFunctionBase: method dequantize_fp8_to_fp32 (line 138) | def dequantize_fp8_to_fp32(fp8_tensor, scale): method padding (line 143) | def padding(x, axis): method padding_and_quant_input (line 157) | def padding_and_quant_input(tensor): method kitchen_gemm (line 179) | def kitchen_gemm( method compute_fp8_linear (line 210) | def compute_fp8_linear( method compute_expert_w_grad (line 280) | def compute_expert_w_grad( method common_fp8_mlp_bwd (line 339) | def common_fp8_mlp_bwd( method fp8_mlp_fwd (line 435) | def fp8_mlp_fwd(x, w1, w2): method fp8_mlp_fwd_norm_rc (line 456) | def fp8_mlp_fwd_norm_rc(x, norm_w, norm_eps, w1, w2): method fp8_mlp_bwd (line 464) | def fp8_mlp_bwd(do3, x, w1, w2, apply_backward_hook=False): method fp8_mlp_bwd_norm_rc (line 506) | def fp8_mlp_bwd_norm_rc(do3, x, norm_w, norm_eps, w1, w2): class FP8LinearFunction (line 530) | class FP8LinearFunction(paddle.autograd.PyLayer): method forward (line 532) | def forward(ctx, x, custom_map, keep_x=False): method backward (line 561) | def backward(ctx, dout): class FP8Linear (line 599) | class FP8Linear(paddle.nn.Layer): method __init__ (line 600) | def __init__(self, in_features: int, out_features: int, bias_attr: boo... method forward (line 610) | def forward(self, x): function cache_fp8_weight (line 614) | def cache_fp8_weight(weight, quant_transpose=None): class FP8KeepXLinear (line 654) | class FP8KeepXLinear(paddle.nn.Layer): method __init__ (line 655) | def __init__(self, in_features: int, out_features: int, bias_attr: boo... method fp8_quant_weight (line 666) | def fp8_quant_weight(self, quant_transpose=None): method forward (line 669) | def forward(self, x): class FusedNormFP8MLPFunction (line 673) | class FusedNormFP8MLPFunction(paddle.autograd.PyLayer): method forward (line 675) | def forward(ctx, x, norm_w, w1, w2, norm_eps): method backward (line 703) | def backward(ctx, do3): class FP8MlpFunction (line 730) | class FP8MlpFunction(paddle.autograd.PyLayer): method forward (line 732) | def forward(ctx, x, w1, w2, recompute_fwd_gate_up): method backward (line 756) | def backward(ctx, do3): class FP8Mlp (line 792) | class FP8Mlp(paddle.nn.Layer): method __init__ (line 793) | def __init__( method fp8_quant_weight (line 828) | def fp8_quant_weight(self, quant_transpose=None): method forward (line 832) | def forward(self, x): function split_group_gemm (line 839) | def split_group_gemm(x_fp8, x_scale, w_fp8, w_scale, tokens_per_expert, ... class FP8GroupGemmMlpFunctionNode (line 859) | class FP8GroupGemmMlpFunctionNode: method __init__ (line 860) | def __init__( method reset_statue (line 879) | def reset_statue(self): method clear_activation_tensors (line 885) | def clear_activation_tensors(self): method gen_m_indices (line 892) | def gen_m_indices(self, tokens_per_expert): method fwd_gate_up (line 899) | def fwd_gate_up(self, x, expert_w1, num_expert, tokens_per_expert, m_i... method fwd_swiglu (line 947) | def fwd_swiglu(self, o1): method fwd_down (line 951) | def fwd_down( method bwd_dowm_input (line 997) | def bwd_dowm_input(self, expert_w2, unzipped_grad, o1, tokens_per_expe... method bwd_swiglu (line 1040) | def bwd_swiglu(self, o1, do2): method bwd_gate_up_input (line 1044) | def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, m_indic... method fused_transpose_split_quant (line 1082) | def fused_transpose_split_quant(self, x, scale, tokens_per_expert, pow... method bwd_down_weight (line 1088) | def bwd_down_weight(self, do3, o2, expert_w2, tokens_per_expert): method bwd_gate_up_weight (line 1122) | def bwd_gate_up_weight( method forward (line 1178) | def forward(self, hs_out, unzipped_probs, tokens_per_expert, m_indices... method backward (line 1221) | def backward( FILE: paddleformers/transformers/fused_a2a.py function barrier_ep (line 28) | def barrier_ep(ep_group): function get_hidden_bytes (line 33) | def get_hidden_bytes(x: paddle.Tensor) -> int: function get_buffer (line 45) | def get_buffer(group: Group, hidden_bytes: int): function fused_dispatch_forward_func (line 77) | def fused_dispatch_forward_func( function fused_dispatch_backward_func (line 134) | def fused_dispatch_backward_func( function fused_combine_forward_func (line 159) | def fused_combine_forward_func( function fused_combine_backward_func (line 177) | def fused_combine_backward_func( class FusedDispatch (line 204) | class FusedDispatch(PyLayer): method forward (line 208) | def forward(ctx, x, token_indices, token_probs, num_experts, group, pr... method backward (line 221) | def backward(ctx, grad_output, grad_token_probs): class FusedCombine (line 226) | class FusedCombine(PyLayer): method forward (line 230) | def forward(ctx, x, group, states, previous_event=None): method backward (line 241) | def backward(ctx, grad_output): function fused_dispatch (line 248) | def fused_dispatch(x, token_indices, token_probs, num_experts, group: Gr... function fused_combine (line 264) | def fused_combine(x, group, handle, previous_event=None): class DispatchNode (line 285) | class DispatchNode: method __init__ (line 286) | def __init__(self, name="dispatch"): method reset_statue (line 289) | def reset_statue(self): method forward (line 292) | def forward( method backward (line 321) | def backward( class CombineNode (line 338) | class CombineNode: method __init__ (line 339) | def __init__(self, name="combine"): method reset_statue (line 342) | def reset_statue(self): method forward (line 345) | def forward(self, x, group, handle, previous_event=None, async_finish=... method backward (line 364) | def backward(self, grad_output, previous_event=None, async_finish=Fals... FILE: paddleformers/transformers/gemma3_text/configuration.py class Gemma3TextConfig (line 27) | class Gemma3TextConfig(PretrainedConfig): method __init__ (line 151) | def __init__( class Gemma3Config (line 232) | class Gemma3Config(PretrainedConfig): method __init__ (line 285) | def __init__( FILE: paddleformers/transformers/gemma3_text/modeling.py class Gemma3TextScaledWordEmbedding (line 45) | class Gemma3TextScaledWordEmbedding(nn.Embedding): method __init__ (line 50) | def __init__(self, config): method forward (line 61) | def forward(self, input_ids: paddle.Tensor): class Gemma3MLP (line 65) | class Gemma3MLP(BaseMLP): method __init__ (line 66) | def __init__(self, config: Gemma3TextConfig, fuse_up_gate=False): class Gemma3RMSNorm (line 71) | class Gemma3RMSNorm(nn.Layer): method __init__ (line 72) | def __init__(self, hidden_size: int, eps: float = 1e-6, input_is_paral... method _norm (line 84) | def _norm(self, x): method forward (line 91) | def forward(self, x): method enable_sequence_parallel (line 98) | def enable_sequence_parallel(self): class Gemma3RMSNormPipe (line 102) | class Gemma3RMSNormPipe(Gemma3RMSNorm): method __init__ (line 103) | def __init__(self, config): method forward (line 109) | def forward(self, x): class Gemma3RotaryEmbedding (line 115) | class Gemma3RotaryEmbedding(nn.Layer): method __init__ (line 116) | def __init__(self, config): method compute_default_rope_parameters (line 134) | def compute_default_rope_parameters( method forward (line 159) | def forward(self, x, position_ids): function rotate_half (line 175) | def rotate_half(x): function apply_rotary_pos_emb (line 182) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di... class Gemma3Attention (line 191) | class Gemma3Attention(nn.Layer): method __init__ (line 194) | def __init__(self, config: Gemma3TextConfig, layer_idx: int): method forward (line 249) | def forward( class Gemma3DecoderLayer (line 321) | class Gemma3DecoderLayer(nn.Layer): method __init__ (line 322) | def __init__(self, config: Gemma3TextConfig, layer_idx: int): method forward (line 335) | def forward( class Gemma3PreTrainedModel (line 381) | class Gemma3PreTrainedModel(PretrainedModel): method _gen_aoa_config (line 397) | def _gen_aoa_config(cls, config: Gemma3TextConfig): method _gen_inv_aoa_config (line 436) | def _gen_inv_aoa_config(cls, config: Gemma3TextConfig): class Gemma3TextModel (line 479) | class Gemma3TextModel(Gemma3PreTrainedModel): method __init__ (line 482) | def __init__(self, config: Gemma3TextConfig): method recompute_training (line 500) | def recompute_training( method forward (line 532) | def forward( class Gemma3ForCausalLM (line 688) | class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin): method __init__ (line 695) | def __init__(self, config: Gemma3TextConfig): method _get_model_inputs_spec (line 702) | def _get_model_inputs_spec(self, dtype: str): method forward (line 709) | def forward( class Gemma3TextForSequenceClassification (line 794) | class Gemma3TextForSequenceClassification(Gemma3PreTrainedModel): class Gemma3ForCausalLMPipe (line 803) | class Gemma3ForCausalLMPipe(GeneralModelForCausalLMPipe): FILE: paddleformers/transformers/glm4_moe/configuration.py class Glm4MoeConfig (line 19) | class Glm4MoeConfig(PretrainedConfig): method __init__ (line 128) | def __init__( FILE: paddleformers/transformers/glm4_moe/modeling.py class GLMMoEModelProvider (line 55) | class GLMMoEModelProvider(GPTModelProvider): function eager_attention_forward (line 91) | def eager_attention_forward( function rotate_half (line 123) | def rotate_half(x): function apply_rotary_pos_emb (line 130) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di... class Glm4MoeAttention (line 151) | class Glm4MoeAttention(nn.Layer): method __init__ (line 154) | def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = N... method forward (line 218) | def forward( class Glm4MoeTopkFlexRouter (line 289) | class Glm4MoeTopkFlexRouter(PretrainedMoEGate): method __init__ (line 290) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs): method forward (line 310) | def forward(self, hidden_states): class Glm4MoeTopkRouter (line 328) | class Glm4MoeTopkRouter(nn.Layer): method __init__ (line 329) | def __init__(self, config: Glm4MoeConfig): method get_topk_indices (line 351) | def get_topk_indices(self, scores): method forward (line 370) | def forward(self, hidden_states): class GLm4MoeNaiveMoe (line 383) | class GLm4MoeNaiveMoe(nn.Module): method __init__ (line 384) | def __init__(self, config): method sharded_state_dict (line 402) | def sharded_state_dict( method forward (line 417) | def forward( class Glm4MoeMoE (line 445) | class Glm4MoeMoE(nn.Layer): method __init__ (line 450) | def __init__(self, config): method moe (line 478) | def moe(self, hidden_states: paddle.Tensor, topk_indices: paddle.Tenso... method forward (line 504) | def forward(self, hidden_states): class AddAuxiliaryLoss (line 522) | class AddAuxiliaryLoss(paddle.autograd.PyLayer): method forward (line 529) | def forward(ctx, x, loss): method backward (line 536) | def backward(ctx, grad_output): class Glm4MoeFlexMoE (line 543) | class Glm4MoeFlexMoE(MoEFlexTokenLayer): method __init__ (line 548) | def __init__(self, config): method forward (line 605) | def forward(self, hidden_states): class Glm4MoeDecoderLayer (line 614) | class Glm4MoeDecoderLayer(nn.Layer): method __init__ (line 615) | def __init__(self, config: Glm4MoeConfig, layer_idx: int): method subbatch_recompute_forward (line 666) | def subbatch_recompute_forward( method attn (line 759) | def attn( method post_process (line 822) | def post_process( method forward (line 834) | def forward( class Glm4MoePreTrainedModel (line 864) | class Glm4MoePreTrainedModel(PretrainedModel): method _gen_aoa_config (line 872) | def _gen_aoa_config(cls, config: Glm4MoeConfig): method _gen_inv_aoa_config (line 1023) | def _gen_inv_aoa_config(cls, config: Glm4MoeConfig): class Glm4MoeRotaryEmbedding (line 1193) | class Glm4MoeRotaryEmbedding(nn.Layer): method __init__ (line 1194) | def __init__(self, config: Glm4MoeConfig, device=None): method compute_default_rope_parameters (line 1211) | def compute_default_rope_parameters( method forward (line 1239) | def forward(self, x, position_ids): class Glm4MoeModel (line 1256) | class Glm4MoeModel(Glm4MoePreTrainedModel): method __init__ (line 1259) | def __init__(self, config: Glm4MoeConfig): method recompute_training_full (line 1284) | def recompute_training_full( method forward (line 1314) | def forward( class Glm4MoeForCausalLM (line 1450) | class Glm4MoeForCausalLM(Glm4MoePreTrainedModel): method __new__ (line 1453) | def __new__(cls, config): class Glm4MoeForCausalLMDeprecated (line 1475) | class Glm4MoeForCausalLMDeprecated(Glm4MoePreTrainedModel): method __init__ (line 1480) | def __init__(self, config): method forward (line 1488) | def forward( class Glm4MoeDecoderLayerPipe (line 1545) | class Glm4MoeDecoderLayerPipe(Glm4MoeDecoderLayer): method forward (line 1546) | def forward(self, args): class Glm4MoeForCausalLMPipe (line 1625) | class Glm4MoeForCausalLMPipe(Glm4MoePreTrainedModel, GeneralModelForCaus... method __new__ (line 1628) | def __new__(cls, config): class Glm4MoeForCausalLMPipeDeprecated (line 1652) | class Glm4MoeForCausalLMPipeDeprecated(GeneralModelForCausalLMPipe): FILE: paddleformers/transformers/glm4v_moe/configuration.py class RopeParameters (line 22) | class RopeParameters(TypedDict, total=False): class Glm4vMoeVisionConfig (line 77) | class Glm4vMoeVisionConfig(PretrainedConfig): method __init__ (line 132) | def __init__( class Glm4vMoeTextConfig (line 170) | class Glm4vMoeTextConfig(PretrainedConfig): method __init__ (line 258) | def __init__( class Glm4vMoeConfig (line 332) | class Glm4vMoeConfig(PretrainedConfig): method __init__ (line 378) | def __init__( method __setattr__ (line 411) | def __setattr__(self, key, value): method __getattribute__ (line 421) | def __getattribute__(self, key): FILE: paddleformers/transformers/glm4v_moe/image_processor_fast.py function smart_resize (line 45) | def smart_resize( class Glm4vFastImageProcessorKwargs (line 78) | class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): class Glm4vImageProcessorFast (line 93) | class Glm4vImageProcessorFast(BaseImageProcessorFast): method __init__ (line 108) | def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]): method _further_process_kwargs (line 115) | def _further_process_kwargs( method _preprocess (line 129) | def _preprocess( method preprocess (line 232) | def preprocess( FILE: paddleformers/transformers/glm4v_moe/modeling.py class MoeModelOutputWithPast (line 51) | class MoeModelOutputWithPast(ModelOutput): class Glm4vMoeModelOutputWithPast (line 64) | class Glm4vMoeModelOutputWithPast(ModelOutput): class Glm4vMoeTextRotaryEmbedding (line 82) | class Glm4vMoeTextRotaryEmbedding(nn.Layer): method __init__ (line 85) | def __init__(self, config: Glm4vMoeTextConfig, device=None, layer_type... method compute_default_rope_parameters (line 102) | def compute_default_rope_parameters( method forward (line 131) | def forward(self, x, position_ids): function rotate_half (line 151) | def rotate_half(x): function apply_multimodal_rotary_pos_emb (line 158) | def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqu... class Glm4vMoeTextAttention (line 211) | class Glm4vMoeTextAttention(nn.Layer): method __init__ (line 214) | def __init__(self, config: Glm4vMoeTextConfig, layer_idx: Optional[int... method forward (line 259) | def forward( class Glm4vMoeTextDecoderLayer (line 325) | class Glm4vMoeTextDecoderLayer(nn.Layer): method __init__ (line 326) | def __init__(self, config: Glm4vMoeTextConfig, layer_idx: int): method forward (line 378) | def forward( class Glm4vMoePreTrainedModel (line 415) | class Glm4vMoePreTrainedModel(PretrainedModel): method _gen_aoa_config (line 438) | def _gen_aoa_config(cls, config: Glm4vMoeConfig): method _gen_inv_aoa_config (line 531) | def _gen_inv_aoa_config(cls, config: Glm4vMoeConfig): class Glm4vMoeCausalLMOutputWithPast (line 625) | class Glm4vMoeCausalLMOutputWithPast(ModelOutput): class Glm4vMoeVisionPatchEmbed (line 649) | class Glm4vMoeVisionPatchEmbed(nn.Layer): method __init__ (line 650) | def __init__(self, config: Glm4vMoeVisionConfig) -> None: method forward (line 660) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: class Glm4vMoeVisionRotaryEmbedding (line 669) | class Glm4vMoeVisionRotaryEmbedding(nn.Layer): method __init__ (line 672) | def __init__(self, dim: int, theta: float = 10000.0) -> None: method forward (line 677) | def forward(self, seqlen: int) -> paddle.Tensor: class Glm4vMoeVisionPatchMerger (line 683) | class Glm4vMoeVisionPatchMerger(nn.Layer): method __init__ (line 684) | def __init__(self, config, dim: int, context_dim: int, hidden_act: str... method forward (line 695) | def forward(self, hidden_state: paddle.Tensor) -> paddle.Tensor: class Glm4vMoeVisionEmbeddings (line 701) | class Glm4vMoeVisionEmbeddings(nn.Layer): method __init__ (line 702) | def __init__(self, config: Glm4vMoeVisionConfig): method forward (line 718) | def forward(self, embeddings, lengths, image_shapes, h_coords, w_coord... function apply_rotary_pos_emb_vision (line 785) | def apply_rotary_pos_emb_vision( class Glm4vMoeVisionAttention (line 799) | class Glm4vMoeVisionAttention(nn.Layer): method __init__ (line 800) | def __init__(self, config: Glm4vMoeVisionConfig) -> None: method forward (line 823) | def forward( class Glm4vMoeVisionBlock (line 899) | class Glm4vMoeVisionBlock(nn.Layer): method __init__ (line 900) | def __init__(self, config) -> None: method forward (line 924) | def forward( class Glm4vMoeVisionModel (line 943) | class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel): method __init__ (line 948) | def __init__(self, config) -> None: method rot_pos_emb (line 990) | def rot_pos_emb(self, grid_thw): method recompute_training_full (line 1024) | def recompute_training_full( method forward (line 1045) | def forward(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tenso... class Glm4vMoeTextModel (line 1103) | class Glm4vMoeTextModel(Glm4vMoePreTrainedModel): method __init__ (line 1107) | def __init__(self, config: Glm4vMoeTextConfig): method recompute_training_full (line 1133) | def recompute_training_full( method forward (line 1165) | def forward( class Glm4vMoeModel (line 1312) | class Glm4vMoeModel(Glm4vMoePreTrainedModel): method __init__ (line 1321) | def __init__(self, config): method get_input_embeddings (line 1327) | def get_input_embeddings(self): method set_input_embeddings (line 1330) | def set_input_embeddings(self, value): method get_rope_index (line 1333) | def get_rope_index( method get_video_features (line 1530) | def get_video_features( method get_image_features (line 1554) | def get_image_features(self, pixel_values: paddle.FloatTensor, image_g... method get_placeholder_mask (line 1570) | def get_placeholder_mask( method forward (line 1611) | def forward( class Glm4vMoeForConditionalGeneration (line 1724) | class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel): method __init__ (line 1732) | def __init__(self, config): method get_input_embeddings (line 1740) | def get_input_embeddings(self): method set_input_embeddings (line 1743) | def set_input_embeddings(self, value): method get_video_features (line 1746) | def get_video_features( method get_image_features (line 1751) | def get_image_features(self, pixel_values: paddle.FloatTensor, image_g... method language_model (line 1756) | def language_model(self): method visual (line 1760) | def visual(self): method forward (line 1763) | def forward( method prepare_inputs_for_generation (line 1869) | def prepare_inputs_for_generation( method _get_image_nums_and_video_nums (line 1918) | def _get_image_nums_and_video_nums( method _expand_inputs_for_generation (line 1968) | def _expand_inputs_for_generation( FILE: paddleformers/transformers/glm4v_moe/processor.py class Glm4vVideosProcessorKwargs (line 41) | class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): class Glm4vProcessorKwargs (line 45) | class Glm4vProcessorKwargs(ProcessingKwargs, total=False): class Glm4vProcessor (line 57) | class Glm4vProcessor(ProcessorMixin): method __init__ (line 77) | def __init__(self, image_processor=None, tokenizer=None, video_process... method __call__ (line 92) | def __call__( method _get_num_multimodal_tokens (line 224) | def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=Non... method post_process_image_text_to_text (line 262) | def post_process_image_text_to_text( method replace_frame_token_id (line 289) | def replace_frame_token_id(self, timestamp_sec): FILE: paddleformers/transformers/glm4v_moe/video_processor.py class Glm4vVideoProcessorInitKwargs (line 44) | class Glm4vVideoProcessorInitKwargs(VideosKwargs, total=False): class Glm4vVideoProcessor (line 52) | class Glm4vVideoProcessor(BaseVideoProcessor): method __init__ (line 73) | def __init__(self, **kwargs: Unpack[Glm4vVideoProcessorInitKwargs]): method _further_process_kwargs (line 80) | def _further_process_kwargs( method sample_frames (line 94) | def sample_frames( method _preprocess (line 144) | def _preprocess( FILE: paddleformers/transformers/glm_ocr/configuration.py class GlmOcrVisionConfig (line 21) | class GlmOcrVisionConfig(PretrainedConfig): method __init__ (line 68) | def __init__( class GlmOcrTextConfig (line 106) | class GlmOcrTextConfig(PretrainedConfig): method __init__ (line 126) | def __init__( class GlmOcrConfig (line 174) | class GlmOcrConfig(PretrainedConfig): method __setattr__ (line 221) | def __setattr__(self, key, value): method __getattribute__ (line 238) | def __getattribute__(self, key): method __init__ (line 250) | def __init__( FILE: paddleformers/transformers/glm_ocr/image_processor.py function is_scaled_image (line 44) | def is_scaled_image(image: np.ndarray) -> bool: function make_batched_images (line 53) | def make_batched_images(images) -> List[ImageInput]: function smart_resize (line 74) | def smart_resize( class Glm46VImageProcessor (line 133) | class Glm46VImageProcessor(BaseImageProcessor): method __init__ (line 169) | def __init__( method get_smarted_resize (line 202) | def get_smarted_resize(self, height, width, min_pixels=None, max_pixel... method set_pixels (line 219) | def set_pixels(self, min_pixels: Optional[int] = None, max_pixels: Opt... method get_number_of_image_patches (line 232) | def get_number_of_image_patches( method _preprocess (line 268) | def _preprocess( method preprocess (line 412) | def preprocess( FILE: paddleformers/transformers/glm_ocr/modeling.py class GlmOcrVisionMlp (line 39) | class GlmOcrVisionMlp(nn.Layer): method __init__ (line 40) | def __init__(self, config, bias: bool = True): method forward (line 49) | def forward(self, hidden_state): function repeat_kv (line 53) | def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: function rotate_half_llm (line 65) | def rotate_half_llm(x): function apply_rotary_pos_emb (line 72) | def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): class GlmOcrTextAttention (line 91) | class GlmOcrTextAttention(nn.Layer): method __init__ (line 97) | def __init__(self, config: GlmOcrTextConfig, layer_idx: int | None = N... method forward (line 116) | def forward( class GlmOcrVisionRotaryEmbedding (line 160) | class GlmOcrVisionRotaryEmbedding(nn.Layer): method __init__ (line 163) | def __init__(self, dim: int, theta: float = 10000.0) -> None: method forward (line 170) | def forward(self, seqlen: int) -> paddle.Tensor: class GlmOcrTextMLP (line 176) | class GlmOcrTextMLP(nn.Layer): method __init__ (line 177) | def __init__(self, config): method forward (line 185) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: class GlmOcrTextDecoderLayer (line 194) | class GlmOcrTextDecoderLayer(nn.Layer): method __init__ (line 195) | def __init__(self, config, layer_idx: int): method forward (line 215) | def forward( class GlmOcrPreTrainedModel (line 253) | class GlmOcrPreTrainedModel(PretrainedModel): method _init_weights (line 279) | def _init_weights(self, layer): method _gen_aoa_config (line 286) | def _gen_aoa_config(cls, config: GlmOcrConfig): method _gen_inv_aoa_config (line 372) | def _gen_inv_aoa_config(cls, config: GlmOcrConfig): class GlmOcrModelOutputWithPast (line 477) | class GlmOcrModelOutputWithPast(ModelOutput): function rotate_half (line 485) | def rotate_half(x): function apply_rotary_pos_emb_vision (line 492) | def apply_rotary_pos_emb_vision(q, k, cos, sin): class GlmOcrVisionAttention (line 507) | class GlmOcrVisionAttention(nn.Layer): method __init__ (line 508) | def __init__(self, config: GlmOcrVisionConfig) -> None: method forward (line 529) | def forward( class GlmOcrVisionBlock (line 605) | class GlmOcrVisionBlock(nn.Layer): method __init__ (line 606) | def __init__(self, config) -> None: method forward (line 617) | def forward( class GlmOcrVisionPatchMerger (line 636) | class GlmOcrVisionPatchMerger(nn.Layer): method __init__ (line 637) | def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: ... method forward (line 647) | def forward(self, hidden_state: paddle.Tensor) -> paddle.Tensor: class GlmOcrVisionPatchEmbed (line 653) | class GlmOcrVisionPatchEmbed(nn.Layer): method __init__ (line 654) | def __init__(self, config: GlmOcrVisionConfig) -> None: method forward (line 669) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: class GlmOcrVisionModel (line 679) | class GlmOcrVisionModel(GlmOcrPreTrainedModel): method __init__ (line 688) | def __init__(self, config) -> None: method rot_pos_emb (line 715) | def rot_pos_emb(self, grid_thw: paddle.Tensor): method forward (line 758) | def forward(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tenso... class GlmOcrTextRotaryEmbedding (line 797) | class GlmOcrTextRotaryEmbedding(nn.Layer): method __init__ (line 798) | def __init__(self, config): method compute_default_rope_parameters (line 811) | def compute_default_rope_parameters(config): method apply_mrope (line 826) | def apply_mrope(self, freqs, mrope_section): method forward (line 838) | def forward(self, hidden_states: paddle.Tensor, position_ids: paddle.T... class GlmOcrTextModel (line 860) | class GlmOcrTextModel(GlmOcrPreTrainedModel): method __init__ (line 864) | def __init__(self, config: GlmOcrTextConfig): method forward (line 887) | def forward( function masked_scatter (line 970) | def masked_scatter(inputs: paddle.Tensor, mask: paddle.Tensor, updates: ... class GlmOcrModel (line 987) | class GlmOcrModel(GlmOcrPreTrainedModel): method __init__ (line 994) | def __init__(self, config): method get_input_embeddings (line 1000) | def get_input_embeddings(self): method set_input_embeddings (line 1003) | def set_input_embeddings(self, value): method get_rope_index (line 1006) | def get_rope_index(self, input_ids, image_grid_thw=None, video_grid_th... method get_video_features (line 1139) | def get_video_features( method get_image_features (line 1171) | def get_image_features( method get_placeholder_mask (line 1194) | def get_placeholder_mask( method forward (line 1243) | def forward( class GlmOcrCausalLMOutputWithPast (line 1321) | class GlmOcrCausalLMOutputWithPast(ModelOutput): class GlmOcrForConditionalGeneration (line 1341) | class GlmOcrForConditionalGeneration(GlmOcrPreTrainedModel, GenerationMi... method __init__ (line 1346) | def __init__(self, config): method get_input_embeddings (line 1355) | def get_input_embeddings(self): method set_input_embeddings (line 1358) | def set_input_embeddings(self, value): method get_video_features (line 1361) | def get_video_features(self, pixel_values_videos, video_grid_thw=None,... method get_image_features (line 1368) | def get_image_features(self, pixel_values, image_grid_thw=None, **kwar... method forward (line 1375) | def forward( method prepare_inputs_for_generation (line 1430) | def prepare_inputs_for_generation( method update_model_kwargs_for_generation (line 1479) | def update_model_kwargs_for_generation(self, outputs, model_kwargs, is... method _get_image_nums_and_video_nums (line 1501) | def _get_image_nums_and_video_nums( method _expand_inputs_for_generation (line 1533) | def _expand_inputs_for_generation( FILE: paddleformers/transformers/glm_ocr/processor.py class Glm46VProcessorKwargs (line 26) | class Glm46VProcessorKwargs(ProcessingKwargs, total=False): class Glm46VProcessor (line 35) | class Glm46VProcessor(ProcessorMixin): method __init__ (line 40) | def __init__(self, image_processor=None, tokenizer=None, chat_template... method apply_chat_template (line 49) | def apply_chat_template( method __call__ (line 181) | def __call__( method _get_num_multimodal_tokens (line 226) | def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): method post_process_image_text_to_text (line 242) | def post_process_image_text_to_text( FILE: paddleformers/transformers/gpt_oss/configuration.py class GptOssConfig (line 20) | class GptOssConfig(PretrainedConfig): method __init__ (line 28) | def __init__( FILE: paddleformers/transformers/gpt_oss/modeling.py function is_casual_mask (line 42) | def is_casual_mask(attention_mask): function _make_causal_mask (line 49) | def _make_causal_mask(input_ids_shape, past_key_values_length): function _expand_2d_mask (line 65) | def _expand_2d_mask(mask, dtype, tgt_length): class GptOssExperts (line 79) | class GptOssExperts(nn.Layer): method __init__ (line 80) | def __init__(self, config): method forward (line 111) | def forward(self, hidden_states: paddle.Tensor, router_indices=None, r... class GptOssTopKRouter (line 182) | class GptOssTopKRouter(nn.Layer): method __init__ (line 183) | def __init__(self, config): method forward (line 199) | def forward(self, hidden_states): class GptOssMLP (line 209) | class GptOssMLP(nn.Layer): method __init__ (line 210) | def __init__(self, config): method forward (line 216) | def forward(self, hidden_states): class GptOssRotaryEmbedding (line 226) | class GptOssRotaryEmbedding(nn.Layer): method __init__ (line 227) | def __init__(self, config: GptOssConfig, device=None): method compute_default_rope_parameters (line 248) | def compute_default_rope_parameters( method forward (line 274) | def forward(self, x, position_ids): function _apply_rotary_emb (line 290) | def _apply_rotary_emb( function apply_rotary_pos_emb (line 301) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di... class GptOssAttention (line 309) | class GptOssAttention(nn.Layer): method __init__ (line 315) | def __init__(self, config, layer_idx=0): method forward (line 390) | def forward( class GptOssDecoderLayer (line 469) | class GptOssDecoderLayer(nn.Layer): method __init__ (line 470) | def __init__(self, config: GptOssConfig, layer_idx: int): method forward (line 499) | def forward( class GptOssPreTrainedModel (line 562) | class GptOssPreTrainedModel(PretrainedModel): method _gen_aoa_config (line 570) | def _gen_aoa_config(cls, config: GptOssConfig): method _gen_inv_aoa_config (line 606) | def _gen_inv_aoa_config(cls, config: GptOssConfig): class GptOssModel (line 639) | class GptOssModel(GptOssPreTrainedModel): method __init__ (line 647) | def __init__(self, config: GptOssConfig): method recompute_training_full (line 683) | def recompute_training_full( method forward (line 718) | def forward( function load_balancing_loss_func (line 873) | def load_balancing_loss_func( class GptOssForCausalLM (line 948) | class GptOssForCausalLM(GptOssPreTrainedModel): method __init__ (line 952) | def __init__(self, config: GptOssConfig): method get_input_embeddings (line 962) | def get_input_embeddings(self): method set_input_embeddings (line 965) | def set_input_embeddings(self, value): method get_output_embeddings (line 968) | def get_output_embeddings(self): method set_output_embeddings (line 971) | def set_output_embeddings(self, new_embeddings): method set_decoder (line 974) | def set_decoder(self, decoder): method get_decoder (line 977) | def get_decoder(self): method prepare_inputs_for_generation (line 980) | def prepare_inputs_for_generation( method _get_model_inputs_spec (line 1011) | def _get_model_inputs_spec(self, dtype: str): method forward (line 1018) | def forward( class GptOssForCausalLMPipe (line 1099) | class GptOssForCausalLMPipe(GeneralModelForCausalLMPipe): FILE: paddleformers/transformers/gpt_provider.py class GPTModel (line 57) | class GPTModel(FleetGPTModel, PretrainedModel): function local_layer_spec (line 67) | def local_layer_spec(config: "GPTModelProvider") -> LayerSpec: class GPTModelProvider (line 85) | class GPTModelProvider(GPTConfig, ModelProviderMixin[GPTModel]): method provide (line 150) | def provide(self, pre_process=None, post_process=None, vp_stage=None, ... function mtp_block_spec (line 204) | def mtp_block_spec(config: "GPTModelProvider", vp_stage: Optional[int] =... FILE: paddleformers/transformers/image_processing_utils.py class PaddleImageProcessingMixin (line 40) | class PaddleImageProcessingMixin: method __init__ (line 63) | def __init__(self, **kwargs): method _wrap_return_tensor_methods (line 67) | def _wrap_return_tensor_methods(self): method _wrap_single_method (line 91) | def _wrap_single_method(self, method_name): method __call__ (line 151) | def __call__(self, images, *args, **kwargs) -> BatchFeature: method from_pretrained (line 156) | def from_pretrained( method get_image_processor_dict (line 166) | def get_image_processor_dict( method from_dict (line 256) | def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): method to_dict (line 300) | def to_dict(self): function warp_image_processormixin (line 314) | def warp_image_processormixin(hf_image_processormixin_class: ImageProces... function warp_base_image_processor (line 320) | def warp_base_image_processor(hf_base_image_processor_class: BaseImagePr... class ImageProcessingMixin (line 326) | class ImageProcessingMixin(PaddleImageProcessingMixin, ImageProcessingMi... method init (line 327) | def init(self, *args, **kwargs): class BaseImageProcessor (line 331) | class BaseImageProcessor(PaddleImageProcessingMixin, BaseImageProcessor_... method init (line 332) | def init(self, *args, **kwargs): FILE: paddleformers/transformers/image_processing_utils_fast.py function validate_fast_preprocess_arguments (line 56) | def validate_fast_preprocess_arguments( function safe_squeeze (line 94) | def safe_squeeze(tensor: "paddle.Tensor", axis: Optional[int] = None) ->... function max_across_indices (line 107) | def max_across_indices(values: Iterable[Any]) -> list[Any]: function get_max_height_width (line 114) | def get_max_height_width(images: list["paddle.Tensor"]) -> tuple[int, ...]: function divide_to_patches (line 124) | def divide_to_patches( class DefaultFastImageProcessorKwargs (line 148) | class DefaultFastImageProcessorKwargs(TypedDict, total=False): class BaseImageProcessorFast (line 170) | class BaseImageProcessorFast(BaseImageProcessor): method __init__ (line 193) | def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]): method is_fast (line 218) | def is_fast(self) -> bool: method pad (line 224) | def pad( method resize (line 292) | def resize( method compile_friendly_resize (line 344) | def compile_friendly_resize( method rescale (line 365) | def rescale( method normalize (line 383) | def normalize( method _fuse_mean_std_and_rescale_factor (line 403) | def _fuse_mean_std_and_rescale_factor( method rescale_and_normalize (line 418) | def rescale_and_normalize( method center_crop (line 445) | def center_crop( method convert_to_rgb (line 486) | def convert_to_rgb( method filter_out_unused_kwargs (line 502) | def filter_out_unused_kwargs(self, kwargs: dict): method _prepare_images_structure (line 515) | def _prepare_images_structure( method _process_image (line 534) | def _process_image( method _prepare_image_like_inputs (line 571) | def _prepare_image_like_inputs( method _further_process_kwargs (line 615) | def _further_process_kwargs( method _validate_preprocess_kwargs (line 659) | def _validate_preprocess_kwargs( method __call__ (line 693) | def __call__(self, images: ImageInput, *args, **kwargs: Unpack[Default... method preprocess (line 696) | def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[Defau... method _preprocess_image_like_inputs (line 722) | def _preprocess_image_like_inputs( method _preprocess (line 742) | def _preprocess( method to_dict (line 790) | def to_dict(self): FILE: paddleformers/transformers/image_transforms.py function is_paddle_tensor (line 39) | def is_paddle_tensor(tensor): function to_channel_dimension_format (line 43) | def to_channel_dimension_format( function rescale (line 80) | def rescale( function to_pil_image (line 110) | def to_pil_image( function get_size_with_aspect_ratio (line 151) | def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple... function get_resize_output_image_size (line 191) | def get_resize_output_image_size( function resize (line 256) | def resize( function normalize (line 315) | def normalize( function center_crop (line 376) | def center_crop( function _center_to_corners_format_paddle (line 471) | def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> ... function _center_to_corners_format_numpy (line 481) | def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.nda... function center_to_corners_format (line 492) | def center_to_corners_format(bboxes_center: TensorType) -> TensorType: function _corners_to_center_format_paddle (line 511) | def _corners_to_center_format_paddle(bboxes_corners: "paddle.Tensor") ->... function _corners_to_center_format_numpy (line 522) | def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.nd... function corners_to_center_format (line 536) | def corners_to_center_format(bboxes_corners: TensorType) -> TensorType: function rgb_to_id (line 557) | def rgb_to_id(color): function id_to_rgb (line 568) | def id_to_rgb(id_map): class PaddingMode (line 587) | class PaddingMode(ExplicitEnum): function pad (line 598) | def pad( function convert_to_rgb (line 684) | def convert_to_rgb(image: ImageInput) -> ImageInput: function _group_images_by_shape (line 701) | def _group_images_by_shape(nested_images, is_nested: bool = False): function _reconstruct_nested_structure (line 717) | def _reconstruct_nested_structure(indices, processed_images): function group_images_by_shape (line 743) | def group_images_by_shape( function reorder_images (line 794) | def reorder_images( FILE: paddleformers/transformers/image_utils.py function is_paddle_tensor (line 57) | def is_paddle_tensor(tensor): function is_pil_image (line 61) | def is_pil_image(img): function is_numpy_array (line 65) | def is_numpy_array(img): function to_numpy (line 69) | def to_numpy(obj): class ChannelDimension (line 83) | class ChannelDimension(ExplicitEnum): class ImageType (line 88) | class ImageType(ExplicitEnum): function get_image_type (line 94) | def get_image_type(image): function is_valid_image (line 104) | def is_valid_image(img): function is_valid_list_of_images (line 108) | def is_valid_list_of_images(images: list): function valid_images (line 112) | def valid_images(imgs): function is_batched (line 124) | def is_batched(img): function make_list_of_images (line 130) | def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageIn... function make_flat_list_of_images (line 167) | def make_flat_list_of_images( function make_nested_list_of_images (line 205) | def make_nested_list_of_images( function to_numpy_array (line 244) | def to_numpy_array(img) -> np.ndarray: function infer_channel_dimension_format (line 253) | def infer_channel_dimension_format( function get_channel_dimension_axis (line 288) | def get_channel_dimension_axis(image: np.ndarray) -> int: function get_image_size (line 307) | def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = No... function get_image_size_for_max_height_width (line 331) | def get_image_size_for_max_height_width( function is_valid_annotation_coco_detection (line 362) | def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List,... function is_valid_annotation_coco_panoptic (line 378) | def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, ... function valid_coco_detection_annotations (line 395) | def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Uni... function valid_coco_panoptic_annotations (line 399) | def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Unio... function load_image (line 403) | def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": function validate_preprocess_arguments (line 436) | def validate_preprocess_arguments( class ImageFeatureExtractionMixin (line 488) | class ImageFeatureExtractionMixin: method _ensure_format_supported (line 493) | def _ensure_format_supported(self, image): method to_pil_image (line 500) | def to_pil_image(self, image, rescale=None): method convert_rgb (line 530) | def convert_rgb(self, image): method rescale (line 544) | def rescale(self, image: np.ndarray, scale: Union[float, int]) -> np.n... method to_numpy_array (line 551) | def to_numpy_array(self, image, rescale=None, channel_first=True): method expand_dims (line 583) | def expand_dims(self, image): method normalize (line 603) | def normalize(self, image, mean, std, rescale=False): method resize (line 648) | def resize(self, image, size, resample=None, default_to_square=True, m... method center_crop (line 715) | def center_crop(self, image, size): method flip_channel_order (line 790) | def flip_channel_order(self, image): method rotate (line 807) | def rotate(self, image, angle, resample=None, expand=0, center=None, t... function validate_kwargs (line 832) | def validate_kwargs(valid_processor_keys: list[str], captured_kwargs: li... class SizeDict (line 840) | class SizeDict: method __getitem__ (line 852) | def __getitem__(self, key): FILE: paddleformers/transformers/kimi_k2/configuration.py class KimiK2Config (line 20) | class KimiK2Config(PretrainedConfig): method __init__ (line 142) | def __init__( FILE: paddleformers/transformers/kimi_k2/modeling.py class KimiK2Provider (line 24) | class KimiK2Provider(GPTModelProvider): method __post_init__ (line 33) | def __post_init__(config): class KimiK2PretrainedModel (line 37) | class KimiK2PretrainedModel(PretrainedModel): method _gen_aoa_config (line 54) | def _gen_aoa_config(cls, config: KimiK2Config): class KimiK2ForCausalLM (line 86) | class KimiK2ForCausalLM(KimiK2PretrainedModel): method __new__ (line 89) | def __new__(cls, config, have_criterion=True): class KimiK2ForCausalLMPipe (line 126) | class KimiK2ForCausalLMPipe(KimiK2PretrainedModel, GeneralModelForCausal... method __new__ (line 129) | def __new__(cls, config): FILE: paddleformers/transformers/kimi_k2/tokenizer.py class KimiK2TikTokenTokenizer (line 31) | class KimiK2TikTokenTokenizer(PreTrainedTokenizer): method __init__ (line 74) | def __init__( method encode (line 152) | def encode(self, text: str, **kwargs) -> List[int]: method decode (line 204) | def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str: method _split_whitespaces_or_nonwhitespaces (line 223) | def _split_whitespaces_or_nonwhitespaces(s: str, max_consecutive_slice... method pre_tokenizer_process (line 246) | def pre_tokenizer_process(self, text: str) -> List[str]: method vocab_size (line 256) | def vocab_size(self) -> int: method get_vocab (line 259) | def get_vocab(self) -> Dict[str, int]: method _tokenize (line 262) | def _tokenize(self, text: str, **kwargs) -> List[str]: method _convert_token_to_id (line 265) | def _convert_token_to_id(self, token: str) -> int: method _convert_id_to_token (line 268) | def _convert_id_to_token(self, index: int) -> str: method clean_up_tokenization (line 272) | def clean_up_tokenization(out_string: str) -> str: method convert_tokens_to_string (line 275) | def convert_tokens_to_string(self, tokens: List[str]) -> str: method save_vocabulary (line 280) | def save_vocabulary(self, save_directory: str, filename_prefix: Option... FILE: paddleformers/transformers/kimi_k25/media_utils.py class VideoSpec (line 34) | class VideoSpec: method __post_init__ (line 43) | def __post_init__(self): class ImageInput (line 56) | class ImageInput(TypedDict): class VideoChunkInput (line 61) | class VideoChunkInput(TypedDict): function _read_video_decord (line 70) | def _read_video_decord( function _read_video_paddlecodec (line 121) | def _read_video_paddlecodec( function get_video_meta (line 217) | def get_video_meta(video_src: bytes | str | os.PathLike, accurate: bool ... function timestamp_as_str (line 230) | def timestamp_as_str(timestamp: float, timestamp_mode: str = "hh:mm:ss.f... function navit_resize_image (line 248) | def navit_resize_image( function navit_resize_video (line 297) | def navit_resize_video( function real_sample_fps_and_max_num_frames (line 333) | def real_sample_fps_and_max_num_frames( function _to_pil (line 348) | def _to_pil(data: str | bytes): function ensure_media_type (line 364) | def ensure_media_type(media: MediaInput) -> MediaInput: function image_in_tensor (line 378) | def image_in_tensor( function image_to_np (line 452) | def image_to_np( function navit_patchify (line 527) | def navit_patchify(pixel_values: paddle.Tensor, patch_size: int) -> dict... FILE: paddleformers/transformers/kimi_k25/processor.py class KimiK25Processor (line 22) | class KimiK25Processor(ProcessorMixin): method __init__ (line 41) | def __init__( method update_raw_text (line 53) | def update_raw_text(self, text: str, video_prompts: list[str]) -> str: method preprocess_medias (line 65) | def preprocess_medias(self, medias: list[dict], **kwargs) -> list[dict]: method __call__ (line 79) | def __call__( method _extract_medias_from_messages (line 126) | def _extract_medias_from_messages(messages: list[dict]) -> list[dict]: method apply_chat_template (line 156) | def apply_chat_template(self, messages, **kwargs): method batch_decode (line 159) | def batch_decode(self, *args, **kwargs): method decode (line 162) | def decode(self, *args, **kwargs): method model_input_names (line 166) | def model_input_names(self): FILE: paddleformers/transformers/kimi_k25/tokenizer.py class TikTokenTokenizer (line 34) | class TikTokenTokenizer(PreTrainedTokenizer): method __init__ (line 77) | def __init__( method encode (line 162) | def encode(self, text: str, allow_special_tokens: bool = True, **kwarg... method decode (line 223) | def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str: method _split_whitespaces_or_nonwhitespaces (line 242) | def _split_whitespaces_or_nonwhitespaces(s: str, max_consecutive_slice... method pre_tokenizer_process (line 265) | def pre_tokenizer_process(self, text: str) -> List[str]: method vocab_size (line 275) | def vocab_size(self) -> int: method get_vocab (line 278) | def get_vocab(self) -> Dict[str, int]: method _tokenize (line 281) | def _tokenize(self, text: str, **kwargs) -> List[str]: method _convert_token_to_id (line 284) | def _convert_token_to_id(self, token: str) -> int: method _convert_id_to_token (line 287) | def _convert_id_to_token(self, index: int) -> str: method clean_up_tokenization (line 291) | def clean_up_tokenization(out_string: str) -> str: method convert_tokens_to_string (line 294) | def convert_tokens_to_string(self, tokens: List[str]) -> str: method save_vocabulary (line 299) | def save_vocabulary(self, save_directory: str, filename_prefix: Option... method apply_chat_template (line 311) | def apply_chat_template( function deep_sort_dict (line 346) | def deep_sort_dict(obj: Any) -> Any: FILE: paddleformers/transformers/kimi_k25/tool_declaration_ts.py class _SchemaRegistry (line 30) | class _SchemaRegistry: method __init__ (line 33) | def __init__(self): method register_definitions (line 37) | def register_definitions(self, defs: dict[str, Any]): method resolve_ref (line 44) | def resolve_ref(self, ref: str) -> dict[str, Any]: function _format_description (line 58) | def _format_description(description: str, indent: str = "") -> str: class _BaseType (line 62) | class _BaseType: method __init__ (line 66) | def __init__( method to_typescript_style (line 75) | def to_typescript_style(self, indent: str = "") -> str: method format_docstring (line 78) | def format_docstring(self, indent: str) -> str: class _ParameterTypeScalar (line 89) | class _ParameterTypeScalar(_BaseType): method __init__ (line 92) | def __init__(self, type: str, extra_props: dict[str, Any] | None = None): method to_typescript_style (line 103) | def to_typescript_style(self, indent: str = "") -> str: class _ParameterTypeObject (line 110) | class _ParameterTypeObject(_BaseType): method __init__ (line 114) | def __init__(self, json_schema_object: dict[str, Any], registry: _Sche... method to_typescript_style (line 146) | def to_typescript_style(self, indent: str = "") -> str: class _ParameterTypeArray (line 182) | class _ParameterTypeArray(_BaseType): method __init__ (line 185) | def __init__(self, json_schema_object: dict[str, Any], registry: _Sche... method to_typescript_style (line 192) | def to_typescript_style(self, indent: str = "") -> str: class _ParameterTypeEnum (line 209) | class _ParameterTypeEnum(_BaseType): method __init__ (line 213) | def __init__(self, json_schema_object: dict[str, Any]): method to_typescript_style (line 242) | def to_typescript_style(self, indent: str = "") -> str: class _ParameterTypeAnyOf (line 246) | class _ParameterTypeAnyOf(_BaseType): method __init__ (line 249) | def __init__( method to_typescript_style (line 257) | def to_typescript_style(self, indent: str = "") -> str: class _ParameterTypeUnion (line 261) | class _ParameterTypeUnion(_BaseType): method __init__ (line 264) | def __init__(self, json_schema_object: dict[str, Any]): method to_typescript_style (line 278) | def to_typescript_style(self, indent: str = "") -> str: class _ParameterTypeRef (line 282) | class _ParameterTypeRef(_BaseType): method __init__ (line 286) | def __init__(self, json_schema_object: dict[str, Any], registry: _Sche... method to_typescript_style (line 298) | def to_typescript_style(self, indent: str = "") -> str: class _Parameter (line 314) | class _Parameter: method parse_extended (line 326) | def parse_extended(cls, attributes: dict[str, Any]) -> "_Parameter": method to_typescript_style (line 337) | def to_typescript_style(self, indent: str = "") -> str: function _parse_parameter_type (line 354) | def _parse_parameter_type( function _openai_function_to_typescript_style (line 387) | def _openai_function_to_typescript_style( function encode_tools_to_typescript_style (line 437) | def encode_tools_to_typescript_style( FILE: paddleformers/transformers/kimi_k25/vision_processor.py function resampling (line 46) | def resampling( class KimiK25VisionProcessor (line 60) | class KimiK25VisionProcessor(BaseImageProcessor): method __init__ (line 63) | def __init__( method media_tokens_calculator (line 72) | def media_tokens_calculator(self, media: MediaInput): method make_chunk_prompt (line 78) | def make_chunk_prompt(cls, timestamp_text: str) -> str: method split_video_chunks (line 81) | def split_video_chunks( method get_resize_config (line 116) | def get_resize_config(self, media_input: MediaInput) -> dict: method resize_image (line 163) | def resize_image( method preprocess (line 185) | def preprocess( method __repr__ (line 240) | def __repr__(self): method to_dict (line 243) | def to_dict(self) -> Dict[str, Any]: method from_dict (line 251) | def from_dict(cls, config_dict: Dict[str, Any], **kwargs): method to_json_string (line 262) | def to_json_string(self): FILE: paddleformers/transformers/kto_criterion.py class KTOCriterion (line 33) | class KTOCriterion(nn.Layer): method __init__ (line 36) | def __init__(self, config, kto_config=None, ignore_label=0, use_infohu... method _nested_gather (line 61) | def _nested_gather(self, tensors): method kto_logps (line 80) | def kto_logps(self, logits, response_labels, response_kl_labels, respo... method kto_loss (line 181) | def kto_loss( method forward (line 213) | def forward( FILE: paddleformers/transformers/legacy/tokenizer_utils.py function convert_to_unicode (line 80) | def convert_to_unicode(text): function whitespace_tokenize (line 96) | def whitespace_tokenize(text): function _is_whitespace (line 111) | def _is_whitespace(char): function _is_control (line 125) | def _is_control(char): function _is_punctuation (line 137) | def _is_punctuation(char): function _is_end_of_word (line 152) | def _is_end_of_word(text): function _is_start_of_word (line 158) | def _is_start_of_word(text): function _insert_one_token_to_ordered_list (line 164) | def _insert_one_token_to_ordered_list(token_list: List[str], new_token: ... function is_chinese_char (line 177) | def is_chinese_char(cp): function _is_nonnormalized_char (line 202) | def _is_nonnormalized_char(char): function _is_nonnormalized_numeric (line 217) | def _is_nonnormalized_numeric(char): function normalize_chars (line 231) | def normalize_chars(text): function _is_symbol (line 253) | def _is_symbol(char): function tokenize_special_chars (line 263) | def tokenize_special_chars(text): class Trie (line 282) | class Trie: method __init__ (line 288) | def __init__(self): method add (line 291) | def add(self, word: str): method split (line 320) | def split(self, text: str) -> List[str]: method cut_text (line 473) | def cut_text(self, text, offsets): function tokenize_chinese_chars (line 497) | def tokenize_chinese_chars(text): class ChatTemplate (line 518) | class ChatTemplate: method _compile_jinja_template (line 525) | def _compile_jinja_template(chat_template) -> Template: method render_conversation (line 549) | def render_conversation( method render_query (line 580) | def render_query(self, query: str, index: int = 0, context_data: Dict[... method _init_context_data (line 587) | def _init_context_data(self, context_data: Dict[str, Union[int, str]] ... method render_system (line 592) | def render_system(self, context_data: Dict[str, Union[int, str]] = {})... method __call__ (line 599) | def __call__(self, conversations: list[list[str]] | str, context_data:... method from_dict (line 632) | def from_dict(cls, config: dict): method from_file (line 636) | def from_file(cls, file: str): class ChatTemplateMixin (line 642) | class ChatTemplateMixin: method apply_chat_template (line 645) | def apply_chat_template( method _apply_chat_template_paddle (line 689) | def _apply_chat_template_paddle( method _apply_chat_template (line 707) | def _apply_chat_template( method encode_chat_inputs (line 741) | def encode_chat_inputs( method _encode_chat_inputs_paddle (line 772) | def _encode_chat_inputs_paddle(self, conversations: List[List[str, str... method _encode_chat_inputs_openai_format (line 797) | def _encode_chat_inputs_openai_format( method _encode_chat_inputs (line 839) | def _encode_chat_inputs( method _extract_non_learnable_parts (line 899) | def _extract_non_learnable_parts(self, origin_msg: List[Dict[str, str]... method from_pretrained (line 913) | def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): method init_chat_template (line 941) | def init_chat_template(self, chat_template: str | dict): method save_resources (line 966) | def save_resources(self, save_directory): class PretrainedTokenizer (line 977) | class PretrainedTokenizer(ChatTemplateMixin, PretrainedTokenizerBase): method _pre_init (line 1024) | def _pre_init(self, original_init, *args, **kwargs): method _build_special_tokens_map_extended (line 1050) | def _build_special_tokens_map_extended(self, **kwargs): method vocab_size (line 1067) | def vocab_size(self) -> int: method is_fast (line 1074) | def is_fast(self) -> bool: method get_added_vocab (line 1077) | def get_added_vocab(self) -> Dict[str, int]: method __len__ (line 1086) | def __len__(self): method _add_tokens (line 1092) | def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], ... method _create_trie (line 1156) | def _create_trie(self, unique_no_split_tokens): method prepare_for_tokenization (line 1165) | def prepare_for_tokenization(self, text, is_split_into_words=False, **... method tokenize (line 1188) | def tokenize(self, text: TextInput, **kwargs) -> List[str]: method _tokenize (line 1259) | def _tokenize(self, text, **kwargs): method convert_tokens_to_ids (line 1268) | def convert_tokens_to_ids(self, tokens): method _convert_token_to_id_with_added_voc (line 1281) | def _convert_token_to_id_with_added_voc(self, token): method _convert_token_to_id (line 1289) | def _convert_token_to_id(self, token): method convert_tokens_to_string (line 1293) | def convert_tokens_to_string(self, tokens): method convert_ids_to_tokens (line 1306) | def convert_ids_to_tokens(self, ids, skip_special_tokens=False): method _convert_id_to_token (line 1327) | def _convert_id_to_token(self, index): method load_vocabulary (line 1332) | def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_toke... method save_vocabulary (line 1364) | def save_vocabulary(filepath, vocab): method get_special_tokens_mask (line 1381) | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, alrea... method num_special_tokens_to_add (line 1408) | def num_special_tokens_to_add(self, pair): method _encode_plus (line 1423) | def _encode_plus( method _batch_encode_plus (line 1500) | def _batch_encode_plus( method _batch_prepare_for_model (line 1602) | def _batch_prepare_for_model( method _get_bert_like_offset_mapping (line 1762) | def _get_bert_like_offset_mapping(self, text: str): method get_offset_mapping (line 1845) | def get_offset_mapping(self, text: str, split_tokens: Optional[List[st... method _decode (line 1933) | def _decode( class BPETokenizer (line 1976) | class BPETokenizer(PretrainedTokenizer): class Encoder (line 2006) | class Encoder(object): method __init__ (line 2007) | def __init__(self, encoder, bpe_merges, errors="replace", special_to... method _bytes_to_unicode (line 2024) | def _bytes_to_unicode(self): method _get_pairs (line 2053) | def _get_pairs(self, word): method bpe (line 2064) | def bpe(self, token): method tokenize (line 2106) | def tokenize(self, text): method tokenize_old (line 2122) | def tokenize_old(self, text): method is_special_token (line 2125) | def is_special_token(self, tok): method tokenize_bpe (line 2136) | def tokenize_bpe(self, token): method encode (line 2145) | def encode(self, text): method decode (line 2151) | def decode(self, tokens): method __init__ (line 2175) | def __init__( method _tokenize (line 2194) | def _tokenize(self, text, is_sentencepiece=True): method _get_encoder (line 2205) | def _get_encoder(self, encoder_json_path, vocab_bpe_path): FILE: paddleformers/transformers/legacy/tokenizer_utils_base.py function import_protobuf_decode_error (line 63) | def import_protobuf_decode_error(error_message=""): class AddedToken (line 84) | class AddedToken: method __init__ (line 92) | def __init__( method __getstate__ (line 102) | def __getstate__(self): method __str__ (line 105) | def __str__(self): method __repr__ (line 108) | def __repr__(self) -> str: class EncodingFast (line 112) | class EncodingFast: class ExplicitEnum (line 118) | class ExplicitEnum(Enum): method _missing_ (line 124) | def _missing_(cls, value): class PaddingStrategy (line 130) | class PaddingStrategy(ExplicitEnum): class TensorType (line 141) | class TensorType(ExplicitEnum): function to_py_obj (line 169) | def to_py_obj(obj): function _is_numpy (line 185) | def _is_numpy(x): class TruncationStrategy (line 189) | class TruncationStrategy(ExplicitEnum): class CharSpan (line 201) | class CharSpan(NamedTuple): class TokenSpan (line 214) | class TokenSpan(NamedTuple): class BatchEncoding (line 227) | class BatchEncoding(UserDict): method __init__ (line 247) | def __init__( method n_sequences (line 270) | def n_sequences(self) -> Optional[int]: method is_fast (line 279) | def is_fast(self) -> bool: method __getitem__ (line 286) | def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]: method __getattr__ (line 303) | def __getattr__(self, item: str): method __getstate__ (line 309) | def __getstate__(self): method __setstate__ (line 312) | def __setstate__(self, state): method keys (line 319) | def keys(self): method values (line 322) | def values(self): method items (line 325) | def items(self): method encodings (line 333) | def encodings(self) -> Optional[List[EncodingFast]]: method tokens (line 340) | def tokens(self, batch_index: int = 0) -> List[str]: method sequence_ids (line 355) | def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: method words (line 376) | def words(self, batch_index: int = 0) -> List[Optional[int]]: method word_ids (line 397) | def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: method token_to_sequence (line 413) | def token_to_sequence(self, batch_or_token_index: int, token_index: Op... method token_to_word (line 452) | def token_to_word(self, batch_or_token_index: int, token_index: Option... method word_to_tokens (line 490) | def word_to_tokens( method token_to_chars (line 541) | def token_to_chars(self, batch_or_token_index: int, token_index: Optio... method char_to_token (line 577) | def char_to_token( method word_to_chars (line 618) | def word_to_chars( method char_to_word (line 663) | def char_to_word(self, batch_or_char_index: int, char_index: Optional[... method convert_to_tensors (line 702) | def convert_to_tensors( class SpecialTokensMixin (line 753) | class SpecialTokensMixin: method __init__ (line 791) | def __init__(self, verbose=True, **kwargs): method sanitize_special_tokens (line 824) | def sanitize_special_tokens(self) -> int: method add_special_tokens (line 836) | def add_special_tokens( method add_tokens (line 930) | def add_tokens( method _add_extra_special_tokens (line 976) | def _add_extra_special_tokens(cls, extra_sp_token: Union[str, AddedTok... method _add_tokens (line 980) | def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], ... method bos_token (line 984) | def bos_token(self) -> str: method eos_token (line 994) | def eos_token(self) -> str: method unk_token (line 1004) | def unk_token(self) -> str: method sep_token (line 1014) | def sep_token(self) -> str: method pad_token (line 1025) | def pad_token(self) -> str: method cls_token (line 1035) | def cls_token(self) -> str: method mask_token (line 1046) | def mask_token(self) -> str: method additional_special_tokens (line 1057) | def additional_special_tokens(self) -> List[str]: method bos_token (line 1068) | def bos_token(self, value): method eos_token (line 1072) | def eos_token(self, value): method unk_token (line 1076) | def unk_token(self, value): method sep_token (line 1080) | def sep_token(self, value): method pad_token (line 1084) | def pad_token(self, value): method cls_token (line 1088) | def cls_token(self, value): method mask_token (line 1092) | def mask_token(self, value): method additional_special_tokens (line 1096) | def additional_special_tokens(self, value): method bos_token_id (line 1100) | def bos_token_id(self) -> Optional[int]: method eos_token_id (line 1110) | def eos_token_id(self) -> Optional[int]: method unk_token_id (line 1120) | def unk_token_id(self) -> Optional[int]: method sep_token_id (line 1129) | def sep_token_id(self) -> Optional[int]: method pad_token_id (line 1139) | def pad_token_id(self) -> Optional[int]: method pad_token_type_id (line 1148) | def pad_token_type_id(self) -> int: method cls_token_id (line 1155) | def cls_token_id(self) -> Optional[int]: method mask_token_id (line 1167) | def mask_token_id(self) -> Optional[int]: method additional_special_tokens_ids (line 1177) | def additional_special_tokens_ids(self) -> List[int]: method bos_token_id (line 1185) | def bos_token_id(self, value): method eos_token_id (line 1189) | def eos_token_id(self, value): method unk_token_id (line 1193) | def unk_token_id(self, value): method sep_token_id (line 1197) | def sep_token_id(self, value): method pad_token_id (line 1201) | def pad_token_id(self, value): method cls_token_id (line 1205) | def cls_token_id(self, value): method mask_token_id (line 1209) | def mask_token_id(self, value): method additional_special_tokens_ids (line 1213) | def additional_special_tokens_ids(self, values): method special_tokens_map (line 1217) | def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: method special_tokens_map_extended (line 1242) | def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedTok... method all_special_tokens (line 1265) | def all_special_tokens(self) -> List[str]: method all_special_tokens_extended (line 1276) | def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: method all_special_ids (line 1296) | def all_special_ids(self) -> List[int]: class PretrainedTokenizerBase (line 1305) | class PretrainedTokenizerBase(SpecialTokensMixin): method __init__ (line 1387) | def __init__(self, **kwargs): method max_len_single_sentence (line 1428) | def max_len_single_sentence(self) -> int: method max_len_sentences_pair (line 1435) | def max_len_sentences_pair(self) -> int: method max_len_single_sentence (line 1442) | def max_len_single_sentence(self, value) -> int: method _switch_to_input_mode (line 1455) | def _switch_to_input_mode(self): method max_len_sentences_pair (line 1462) | def max_len_sentences_pair(self, value) -> int: method _set_processor_class (line 1475) | def _set_processor_class(self, processor_class: str): method __repr__ (line 1479) | def __repr__(self) -> str: method get_vocab (line 1489) | def get_vocab(self) -> Dict[str, int]: method from_pretrained (line 1502) | def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): method _from_pretrained (line 1623) | def _from_pretrained( method save_pretrained (line 1818) | def save_pretrained(self, save_directory, filename_prefix: Optional[st... method _save_pretrained (line 1914) | def _save_pretrained( method save_resources (line 1937) | def save_resources(self, save_directory): method save_to_hf_hub (line 1951) | def save_to_hf_hub( method save_to_aistudio (line 2011) | def save_to_aistudio( method tokenize (line 2059) | def tokenize(self, text: str, pair: Optional[str] = None, add_special_... method num_special_tokens_to_add (line 2079) | def num_special_tokens_to_add(self, pair: bool = False) -> int: method _get_padding_truncation_strategies (line 2082) | def _get_padding_truncation_strategies( method __call__ (line 2219) | def __call__( method encode (line 2492) | def encode( method encode_plus (line 2571) | def encode_plus( method _encode_plus (line 2645) | def _encode_plus( method batch_encode (line 2670) | def batch_encode( method _batch_encode_plus (line 2761) | def _batch_encode_plus( method pad (line 2793) | def pad( method create_token_type_ids_from_sequences (line 2982) | def create_token_type_ids_from_sequences( method build_inputs_with_special_tokens (line 3002) | def build_inputs_with_special_tokens( method build_offset_mapping_with_special_tokens (line 3022) | def build_offset_mapping_with_special_tokens(self, offset_mapping_0, o... method prepare_for_model (line 3042) | def prepare_for_model( method truncate_sequences (line 3197) | def truncate_sequences( method _pad (line 3316) | def _pad( method convert_tokens_to_string (line 3462) | def convert_tokens_to_string(self, tokens: List[str]) -> str: method decode_token (line 3475) | def decode_token( method batch_decode (line 3508) | def batch_decode( method decode (line 3541) | def decode( method _decode (line 3577) | def _decode( method get_special_tokens_mask (line 3586) | def get_special_tokens_mask( method clean_up_tokenization (line 3618) | def clean_up_tokenization(out_string: str) -> str: method _eventual_warn_about_too_long_sequence (line 3642) | def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_l... FILE: paddleformers/transformers/linear_utils.py class ColumnSequenceParallelLinearPass (line 43) | class ColumnSequenceParallelLinearPass(object): class RowSequenceParallelLinearPass (line 51) | class RowSequenceParallelLinearPass(object): FILE: paddleformers/transformers/llama/auto_dist_config.py function get_dist_config (line 18) | def get_dist_config(model, prefix=""): FILE: paddleformers/transformers/llama/modeling.py function rotate_half (line 40) | def rotate_half(x: paddle.Tensor) -> paddle.Tensor: function apply_rotary_pos_emb (line 50) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di... class LLamaAttention (line 71) | class LLamaAttention(nn.Layer): method __init__ (line 72) | def __init__(self, config: LlamaConfig, layer_idx: int): method forward (line 141) | def forward( class LlamaDecoderLayer (line 189) | class LlamaDecoderLayer(nn.Layer): method __init__ (line 190) | def __init__(self, config: LlamaConfig, layer_idx: int): method forward (line 213) | def forward( class LlamaRotaryEmbedding (line 248) | class LlamaRotaryEmbedding(nn.Layer): method __init__ (line 249) | def __init__(self, config): method compute_default_rope_parameters (line 269) | def compute_default_rope_parameters( method forward (line 294) | def forward(self, x, position_ids): class LlamaPretrainedModel (line 310) | class LlamaPretrainedModel(PretrainedModel): method _gen_aoa_config (line 324) | def _gen_aoa_config(cls, config: LlamaConfig): method _gen_inv_aoa_config (line 356) | def _gen_inv_aoa_config(cls, config: LlamaConfig): class LlamaModel (line 387) | class LlamaModel(LlamaPretrainedModel): method __init__ (line 388) | def __init__(self, config: LlamaConfig): method forward (line 414) | def forward( method recompute_training (line 523) | def recompute_training( class LlamaForCausalLM (line 553) | class LlamaForCausalLM(LlamaPretrainedModel): method __init__ (line 556) | def __init__(self, config: LlamaConfig): method forward (line 564) | def forward( method auto_dist_config (line 628) | def auto_dist_config(self, prefix=""): class LlamaForCausalLMPipe (line 633) | class LlamaForCausalLMPipe(GeneralModelForCausalLMPipe): FILE: paddleformers/transformers/masking_utils.py function prepare_sliding_window_startend_row_indices (line 22) | def prepare_sliding_window_startend_row_indices( function create_causal_masks_and_row_indices (line 54) | def create_causal_masks_and_row_indices( function create_causal_mask_and_row_indices (line 200) | def create_causal_mask_and_row_indices( function create_sliding_window_causal_mask_and_row_indices (line 277) | def create_sliding_window_causal_mask_and_row_indices( FILE: paddleformers/transformers/mc2_parallel_linear.py function is_mc2_valid (line 38) | def is_mc2_valid(): class MC2ColumnParallelCoreLinear (line 47) | class MC2ColumnParallelCoreLinear(PyLayer): method forward (line 49) | def forward(ctx, input_, weight, group): method backward (line 57) | def backward(ctx, dy): class MC2RowParallelCoreLinear (line 77) | class MC2RowParallelCoreLinear(PyLayer): method forward (line 79) | def forward(ctx, input_, weight, group): method backward (line 92) | def backward(ctx, dy): class MC2ColumnSeqParallelCoreLinear (line 104) | class MC2ColumnSeqParallelCoreLinear(PyLayer): method forward (line 106) | def forward(ctx, input_, weight, group): method backward (line 131) | def backward(ctx, grad_output): class MC2RowSeqParallelCoreLinear (line 161) | class MC2RowSeqParallelCoreLinear(PyLayer): method forward (line 163) | def forward(ctx, input_, weight, group): method backward (line 186) | def backward(ctx, grad_output): class MC2ColumnSeqParallelLinear (line 209) | class MC2ColumnSeqParallelLinear(ColumnSequenceParallelLinear): method forward (line 210) | def forward(self, x): class MC2RowSeqParallelLinear (line 215) | class MC2RowSeqParallelLinear(RowSequenceParallelLinear): method forward (line 216) | def forward(self, x): FILE: paddleformers/transformers/model_outputs.py function tuple_output (line 31) | def tuple_output(outputs: Tuple[Tensor], loss: Optional[Tensor] = None): function convert_encoder_output (line 45) | def convert_encoder_output(encoder_output): function layer_init_wrapper (line 61) | def layer_init_wrapper(func): function _transformer_encoder_layer_fwd (line 75) | def _transformer_encoder_layer_fwd(self, src, src_mask=None, cache=None,... function _transformer_decoder_layer_fwd (line 107) | def _transformer_decoder_layer_fwd( function _transformer_decoder_fwd (line 185) | def _transformer_decoder_fwd( function _transformer_encoder_fwd (line 270) | def _transformer_encoder_fwd( function _get_wrap_setattr (line 373) | def _get_wrap_setattr(cls): function is_tensor (line 392) | def is_tensor(x): class ModelOutput (line 399) | class ModelOutput(OrderedDict): method __post_init__ (line 413) | def __post_init__(self): method __delitem__ (line 465) | def __delitem__(self, *args, **kwargs): method setdefault (line 468) | def setdefault(self, *args, **kwargs): method pop (line 471) | def pop(self, *args, **kwargs): method update (line 474) | def update(self, *args, **kwargs): method __getitem__ (line 477) | def __getitem__(self, k): method __setattr__ (line 484) | def __setattr__(self, name, value): method __setitem__ (line 490) | def __setitem__(self, key, value): method to_tuple (line 496) | def to_tuple(self) -> Tuple[Any]: class BaseModelOutput (line 513) | class BaseModelOutput(ModelOutput): class BaseModelOutputWithNoAttention (line 539) | class BaseModelOutputWithNoAttention(ModelOutput): class BaseModelOutputWithPooling (line 558) | class BaseModelOutputWithPooling(ModelOutput): class BaseModelOutputWithPast (line 590) | class BaseModelOutputWithPast(ModelOutput): class BaseModelOutputWithPastAndCrossAttentions (line 629) | class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): class BaseModelOutputWithPastAndMTP (line 680) | class BaseModelOutputWithPastAndMTP(ModelOutput): class BaseModelOutputWithPoolingAndCrossAttentions (line 723) | class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): class SequenceClassifierOutput (line 772) | class SequenceClassifierOutput(ModelOutput): class TokenClassifierOutput (line 801) | class TokenClassifierOutput(ModelOutput): class QuestionAnsweringModelOutput (line 830) | class QuestionAnsweringModelOutput(ModelOutput): class MultipleChoiceModelOutput (line 862) | class MultipleChoiceModelOutput(ModelOutput): class MaskedLMOutput (line 893) | class MaskedLMOutput(ModelOutput): class CausalLMOutputWithPast (line 922) | class CausalLMOutputWithPast(ModelOutput): class CausalLMOutputWithCrossAttentions (line 959) | class CausalLMOutputWithCrossAttentions(ModelOutput): class Seq2SeqModelOutput (line 1003) | class Seq2SeqModelOutput(ModelOutput): class Seq2SeqLMOutput (line 1070) | class Seq2SeqLMOutput(ModelOutput): class Seq2SeqQuestionAnsweringModelOutput (line 1135) | class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): class Seq2SeqSequenceClassifierOutput (line 1196) | class Seq2SeqSequenceClassifierOutput(ModelOutput): class SequenceClassifierOutputWithPast (line 1254) | class SequenceClassifierOutputWithPast(ModelOutput): class BackboneOutput (line 1290) | class BackboneOutput(ModelOutput): class BaseModelOutputWithPoolingAndNoAttention (line 1317) | class BaseModelOutputWithPoolingAndNoAttention(ModelOutput): class ImageClassifierOutputWithNoAttention (line 1339) | class ImageClassifierOutputWithNoAttention(ModelOutput): class DepthEstimatorOutput (line 1360) | class DepthEstimatorOutput(ModelOutput): class SemanticSegmenterOutput (line 1390) | class SemanticSegmenterOutput(ModelOutput): class Seq2SeqSpectrogramOutput (line 1421) | class Seq2SeqSpectrogramOutput(ModelOutput): class MoEModelOutputWithPast (line 1481) | class MoEModelOutputWithPast(ModelOutput): class MoECausalLMOutputWithPast (line 1523) | class MoECausalLMOutputWithPast(ModelOutput): class MoECausalLMOutputWithPastAndMTP (line 1575) | class MoECausalLMOutputWithPastAndMTP(MoECausalLMOutputWithPast): FILE: paddleformers/transformers/model_provider.py class DistributedDataParallelConfig (line 59) | class DistributedDataParallelConfig: method __init__ (line 60) | def __init__(self): class ModelProviderMixin (line 64) | class ModelProviderMixin(abc.ABC, Generic[ModelT]): method provide (line 83) | def provide( method provide_distributed_model (line 102) | def provide_distributed_model( method initialize_model_parallel (line 201) | def initialize_model_parallel( method meta_model (line 232) | def meta_model(self) -> list[ModelT]: method pre_wrap_hook (line 241) | def pre_wrap_hook(self) -> Callable[[list[FleetLayer]], list[FleetLaye... method register_pre_wrap_hook (line 264) | def register_pre_wrap_hook( method post_wrap_hook (line 287) | def post_wrap_hook(self) -> Callable[[list[FleetLayer]], list[FleetLay... method register_post_wrap_hook (line 310) | def register_post_wrap_hook( function get_model (line 333) | def get_model( function _create_model (line 458) | def _create_model( function _print_num_params (line 526) | def _print_num_params(model: list[FleetLayer]) -> None: FILE: paddleformers/transformers/model_utils.py function fit_bf16_to_uint16_np (line 118) | def fit_bf16_to_uint16_np(tensor): function dy2st_nocheck_guard_context (line 124) | def dy2st_nocheck_guard_context(): function unwrap_optimizer (line 132) | def unwrap_optimizer(optimizer, optimizer_instances=()): function prune_linear_layer (line 153) | def prune_linear_layer(layer: nn.Linear, index: paddle.Tensor, dim: int ... function find_pruneable_heads_and_indices (line 184) | def find_pruneable_heads_and_indices( function apply_chunking_to_forward (line 208) | def apply_chunking_to_forward( function unwrap_model (line 275) | def unwrap_model(model, *args, **kwargs): function _add_variant (line 292) | def _add_variant(weights_name: str, variant=None) -> str: function dtype_guard (line 302) | def dtype_guard(dtype="float32"): function no_init_weights (line 315) | def no_init_weights(_enable=True): function get_parameter_dtype (line 331) | def get_parameter_dtype(parameter: nn.Layer) -> paddle.dtype: function _split_keys_evenly (line 351) | def _split_keys_evenly(keys: list, n: int) -> list: function _load_part_state_dict (line 377) | def _load_part_state_dict( function load_state_dict (line 510) | def load_state_dict( function prepare_safe_save_state_dict (line 621) | def prepare_safe_save_state_dict(state_dict, save_to_hf=True): function resolve_weight_file_from_hf_hub (line 632) | def resolve_weight_file_from_hf_hub( function register_base_model (line 678) | def register_base_model(cls): class BackboneMixin (line 707) | class BackboneMixin: method forward_with_filtered_kwargs (line 708) | def forward_with_filtered_kwargs(self, *args, **kwargs): function _partion_for_pipeline_mode (line 718) | def _partion_for_pipeline_mode(keys): function shard_checkpoint (line 745) | def shard_checkpoint( function load_sharded_checkpoint (line 857) | def load_sharded_checkpoint(model, folder, variant=None, strict=True, pr... function faster_set_state_dict (line 946) | def faster_set_state_dict(model, state_dict, model_state_dict=None, stri... function _load_state_dict_into_model (line 998) | def _load_state_dict_into_model(model_to_load, state_dict, start_prefix,... function _convert_state_dict_dtype_and_shape (line 1033) | def _convert_state_dict_dtype_and_shape(state_dict, model_to_load_state_... function _load_state_dict_into_meta_model (line 1054) | def _load_state_dict_into_meta_model( function _parse_size (line 1119) | def _parse_size(size_str: str) -> int: function clean_unrelated_safetensors (line 1144) | def clean_unrelated_safetensors(save_dir): function get_common_folder (line 1174) | def get_common_folder(file_list): class PretrainedModel (line 1184) | class PretrainedModel(Layer, GenerationMixin, ConversionMixin): method __init__ (line 1250) | def __init__(self, *args, **kwargs): method _post_init (line 1283) | def _post_init(self, original_init, *args, **kwargs): method _init_weights (line 1311) | def _init_weights(self, layer): method _initialize_weights (line 1317) | def _initialize_weights(self, layer): method init_weights (line 1326) | def init_weights(self): method _from_config (line 1343) | def _from_config(cls, config, **kwargs): method from_config (line 1365) | def from_config(cls, config, **kwargs): method set_inference_config (line 1376) | def set_inference_config(cls, config, predictor_args, **kwargs): method confirm_inference_model (line 1451) | def confirm_inference_model(cls, predictor_args, **kwargs): method base_model (line 1463) | def base_model(self): method model_name_list (line 1472) | def model_name_list(self): method can_generate (line 1480) | def can_generate(self) -> bool: method recompute_enable (line 1491) | def recompute_enable(self): method recompute_disable (line 1503) | def recompute_disable(self): method get_memory_footprint (line 1515) | def get_memory_footprint(self, return_buffers=True): method get_model_flops (line 1531) | def get_model_flops(self, *args, **kwargs): method get_hardware_flops (line 1537) | def get_hardware_flops(self, *args, **kwargs): method get_input_embeddings (line 1543) | def get_input_embeddings(self) -> nn.Embedding: method set_input_embeddings (line 1568) | def set_input_embeddings(self, value: Embedding): method get_output_embeddings (line 1592) | def get_output_embeddings(self) -> Optional[Embedding]: method set_output_embeddings (line 1604) | def set_output_embeddings(self, new_embeddings): method get_decoder (line 1611) | def get_decoder(self): method set_decoder (line 1626) | def set_decoder(self, decoder): method tie_weights (line 1637) | def tie_weights(self): method resize_position_embeddings (line 1677) | def resize_position_embeddings(self, new_num_position_embeddings: int): method constructed_from_pretrained_config (line 1692) | def constructed_from_pretrained_config(cls, init_func=None) -> bool: method save_model_config (line 1699) | def save_model_config(self, save_dir: str, **kwargs): method save_to_hf_hub (line 1710) | def save_to_hf_hub( method save_to_aistudio (line 1772) | def save_to_aistudio( method resize_token_embeddings (line 1836) | def resize_token_embeddings(self, new_num_tokens: Optional[int] = None... method _update_init_config (line 1868) | def _update_init_config(self, init_config: dict, key: str, value: Any): method _get_resized_embeddings (line 1885) | def _get_resized_embeddings( method __setattr__ (line 1936) | def __setattr__(self, name, value): method _resolve_model_file_path (line 1941) | def _resolve_model_file_path( method _load_pretrained_model (line 2178) | def _load_pretrained_model( method from_pretrained (line 2699) | def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): method _get_key_renaming_mapping (line 3110) | def _get_key_renaming_mapping( method save_pretrained (line 3140) | def save_pretrained( method merge_auto_dist_configs (line 3386) | def merge_auto_dist_configs(self, configs): method _generate_auto_dist_config (line 3455) | def _generate_auto_dist_config(self, auto_dist_degree): class PipelinePretrainedModel (line 3507) | class PipelinePretrainedModel(PretrainedModel): method __init_hook__ (line 3508) | def __init_hook__(self): method __init__ (line 3514) | def __init__(self, config, *args, **kwargs): method add_sequential_layer (line 3518) | def add_sequential_layer(self, layer_desc, name_prefix=""): method get_sequential_layers (line 3522) | def get_sequential_layers(self): method get_sequential_name_prefixes (line 3526) | def get_sequential_name_prefixes(self): method _set_pipeline_name_mapping (line 3530) | def _set_pipeline_name_mapping(self, mappings=None): method get_shardlayer_prefix (line 3595) | def get_shardlayer_prefix(self, name_splited, shared_layer_class=Share... method state_dict (line 3623) | def state_dict(self, *args, **kwargs): method sharded_state_dict (line 3636) | def sharded_state_dict(self, *args, **kwargs): method set_state_dict (line 3669) | def set_state_dict(self, state_dict, *args, **kwargs): function load_sharded_checkpoint_as_one (line 3684) | def load_sharded_checkpoint_as_one(folder, variant=None, return_numpy=Fa... function load_tp_checkpoint (line 3759) | def load_tp_checkpoint(folder, cls, config, return_numpy=False, convert_... function clean_model_class_name (line 3814) | def clean_model_class_name(class_name, suffixes_to_strip: Union[str, Lis... function save_full_param (line 3837) | def save_full_param( function replace_name_and_gen_index (line 3929) | def replace_name_and_gen_index(path, total_size, save_peft=False): class HFFormatFullParamSaver (line 4003) | class HFFormatFullParamSaver: method __init__ (line 4004) | def __init__( method get_full_param_iter (line 4025) | def get_full_param_iter(self): method determin_saver_based_group (line 4047) | def determin_saver_based_group(self): method save_checkpoint (line 4063) | def save_checkpoint(self, path, max_shard_size="16GB", save_peft=False): class EMAStateHFFormatFullParamSaver (line 4086) | class EMAStateHFFormatFullParamSaver(HFFormatFullParamSaver): method __init__ (line 4087) | def __init__( method get_full_param_iter (line 4110) | def get_full_param_iter(self): class SonicMoEHFFormatFullParamSaver (line 4138) | class SonicMoEHFFormatFullParamSaver(HFFormatFullParamSaver): method __init__ (line 4139) | def __init__( method deinterleave_gate_up_proj (line 4161) | def deinterleave_gate_up_proj(self, w, moe_intermediate_size): method get_full_param_iter (line 4172) | def get_full_param_iter(self): FILE: paddleformers/transformers/modeling_rope_utils.py function standardize_rope_params (line 25) | def standardize_rope_params(config, rope_theta: float | dict[str, float]... function dynamic_rope_update (line 81) | def dynamic_rope_update(rope_forward): function _compute_linear_scaling_rope_parameters (line 190) | def _compute_linear_scaling_rope_parameters( function _compute_dynamic_ntk_parameters (line 246) | def _compute_dynamic_ntk_parameters( function _compute_yarn_parameters (line 319) | def _compute_yarn_parameters( function _compute_longrope_parameters (line 458) | def _compute_longrope_parameters( function _compute_llama3_parameters (line 550) | def _compute_llama3_parameters( function _check_received_keys (line 648) | def _check_received_keys( function _validate_default_rope_parameters (line 677) | def _validate_default_rope_parameters( function _validate_linear_scaling_rope_parameters (line 686) | def _validate_linear_scaling_rope_parameters( function _validate_dynamic_scaling_rope_parameters (line 699) | def _validate_dynamic_scaling_rope_parameters( function _validate_yarn_parameters (line 713) | def _validate_yarn_parameters( function _validate_longrope_parameters (line 780) | def _validate_longrope_parameters(rope_parameters: dict, config: Pretrai... function _validate_llama3_parameters (line 828) | def _validate_llama3_parameters(rope_parameters: dict, config: Pretraine... function rope_config_validation (line 881) | def rope_config_validation(config: PretrainedConfig, ignore_keys: Option... FILE: paddleformers/transformers/modelscope_utils.py class UnauthorizedError (line 21) | class UnauthorizedError(Exception): class EntryNotFoundError (line 25) | class EntryNotFoundError(Exception): function _add_subfolder (line 29) | def _add_subfolder(weights_name: str, subfolder: Optional[str] = None) -... function modelscope_download (line 35) | def modelscope_download( FILE: paddleformers/transformers/moe_gate.py class MoEGateMixin (line 30) | class MoEGateMixin: method gate_score_func (line 31) | def gate_score_func(self, logits: paddle.Tensor) -> paddle.Tensor: method gumbel_rsample (line 54) | def gumbel_rsample(self, logits: paddle.Tensor) -> paddle.Tensor: method uniform_sample (line 58) | def uniform_sample(self, logits: paddle.Tensor) -> paddle.Tensor: method _one_hot_to_float (line 63) | def _one_hot_to_float(self, x, num_classes): method _one_hot_to_int64 (line 69) | def _one_hot_to_int64(self, x, num_classes): method _capacity (line 75) | def _capacity( method _cal_aux_loss (line 99) | def _cal_aux_loss(self, gates, mask): method _cal_seq_aux_loss (line 126) | def _cal_seq_aux_loss(self, probs, top_k, routing_map): method _cal_z_loss (line 163) | def _cal_z_loss(self, logits) -> paddle.Tensor: method _cal_orthogonal_loss (line 176) | def _cal_orthogonal_loss(self) -> paddle.Tensor: class PretrainedMoEGate (line 187) | class PretrainedMoEGate(nn.Layer, MoEGateMixin): method __init__ (line 188) | def __init__(self, config, num_experts, expert_hidden_size, **kwargs): method _priority (line 225) | def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle.... method _topk_greedy (line 247) | def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle.... method _topk_group_limited_greedy (line 262) | def _topk_group_limited_greedy( method _topk_noaux_tc (line 294) | def _topk_noaux_tc( method top1gating (line 334) | def top1gating( method top2gating (line 406) | def top2gating( method topkgating (line 482) | def topkgating( method topkgating_nodrop (line 564) | def topkgating_nodrop(self, gates: paddle.Tensor): FILE: paddleformers/transformers/moe_layer.py function dispatching (line 31) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity): function combining (line 71) | def combining(x, combine_weights, scatter_index): class _AllToAll (line 95) | class _AllToAll(paddle.autograd.PyLayer): method forward (line 97) | def forward( method backward (line 141) | def backward(ctx: Any, *grad_output: Tensor) -> Tuple[Tensor]: class MoELayer (line 154) | class MoELayer(nn.Layer): method __init__ (line 155) | def __init__( method _parse_moe_expert_parallel (line 216) | def _parse_moe_expert_parallel(self, moe_num_experts, expert_model_par... method _post_init (line 226) | def _post_init(self): method forward (line 237) | def forward( class MoEFlexTokenLayer (line 341) | class MoEFlexTokenLayer(nn.Layer): method __init__ (line 342) | def __init__(self, config, moe_num_experts, expert_class, expert_kwarg... method _post_init (line 370) | def _post_init(self): method expert_forward (line 381) | def expert_forward(self, dispatched_input, tokens_per_expert): method forward (line 397) | def forward(self, hidden_states: paddle.Tensor): method _parse_moe_expert_parallel (line 406) | def _parse_moe_expert_parallel(self, moe_num_experts, expert_model_par... FILE: paddleformers/transformers/moe_layer_auto.py function dispatching (line 31) | def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity): function combining (line 71) | def combining(x, combine_weights, scatter_index): class LocalGatePart1 (line 95) | class LocalGatePart1(LocalLayer): method __init__ (line 96) | def __init__(self, config, gate: PretrainedMoEGate, ipp=None): method forward (line 114) | def forward(self, hidden_state, gate_weight, e_score_correction_bias, ... class LocalGateAndDispatch (line 131) | class LocalGateAndDispatch(LocalLayer): method __init__ (line 132) | def __init__(self, gate: PretrainedMoEGate, ipp=None): method forward (line 145) | def forward(self, reshaped_input, scores): class LocalCombine (line 151) | class LocalCombine(LocalLayer): method __init__ (line 152) | def __init__(self, ipp=None): method forward (line 158) | def forward(self, combine_weights, expert_output, dtype="float32", out... class MoELayer (line 169) | class MoELayer(nn.Layer): method __init__ (line 170) | def __init__( method _redistribute_experts (line 212) | def _redistribute_experts(self, experts, moe_group: str): method _parse_moe_expert_parallel (line 221) | def _parse_moe_expert_parallel(self, moe_num_experts, config): method _post_init (line 238) | def _post_init(self): method expert_forward (line 249) | def expert_forward(self, dispatched_input): method forward (line 270) | def forward( FILE: paddleformers/transformers/ofa_utils.py function prepare_qkv_ofa (line 30) | def prepare_qkv_ofa(self, query, key, value, cache=None): function mha_ofa_forward (line 52) | def mha_ofa_forward(self, query, key, value, attn_mask=None, cache=None): function encoder_ofa_forward (line 96) | def encoder_ofa_forward( function encoder_layer_ofa_forward (line 127) | def encoder_layer_ofa_forward(self, src, src_mask=None, cache=None, outp... function reorder_head (line 151) | def reorder_head(layer, index): function reorder_neuron (line 189) | def reorder_neuron(layer, index, dim=0): function reorder_neuron_head (line 214) | def reorder_neuron_head(model, head_importance, neuron_importance): function compute_neuron_head_importance (line 230) | def compute_neuron_head_importance( FILE: paddleformers/transformers/paddleocr_vl/configuration.py class PaddleOCRVisionConfig (line 23) | class PaddleOCRVisionConfig(PretrainedConfig): method __init__ (line 28) | def __init__( class PaddleOCRVLConfig (line 76) | class PaddleOCRVLConfig(PretrainedConfig): method __init__ (line 81) | def __init__( method __getattribute__ (line 210) | def __getattribute__(self, key): FILE: paddleformers/transformers/paddleocr_vl/image_processor.py function is_scaled_image (line 44) | def is_scaled_image(image: np.ndarray) -> bool: function make_batched_images (line 55) | def make_batched_images(images) -> List[List[ImageInput]]: function adjust_size (line 78) | def adjust_size(size, patch_size): function smart_resize (line 85) | def smart_resize( class PaddleOCRVLImageProcessor (line 129) | class PaddleOCRVLImageProcessor(BaseImageProcessor): method __init__ (line 135) | def __init__( method set_pixels (line 169) | def set_pixels(self, min_pixels=None, max_pixels=None, msg=""): method get_smarted_resize (line 182) | def get_smarted_resize(self, height, width, min_pixels=None, max_pixel... method _preprocess (line 198) | def _preprocess( method preprocess (line 293) | def preprocess( FILE: paddleformers/transformers/paddleocr_vl/modeling.py function rotate_half (line 61) | def rotate_half(x): function _ensure_cos_sin_dim (line 68) | def _ensure_cos_sin_dim(cos, sin, dim_needed): function apply_multimodal_rotary_pos_emb (line 80) | def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqu... function apply_rotary_pos_emb_vision (line 96) | def apply_rotary_pos_emb_vision(q, k, cos, sin): function apply_fused_rope (line 109) | def apply_fused_rope(query_states, key_states, rope_theta): class PaddleOCRAttention (line 128) | class PaddleOCRAttention(nn.Layer): method __init__ (line 129) | def __init__(self, config: PaddleOCRVisionConfig): method forward (line 165) | def forward( class PaddleOCRVisionEmbeddings (line 216) | class PaddleOCRVisionEmbeddings(nn.Layer): method __init__ (line 217) | def __init__(self, config: PaddleOCRVisionConfig): method forward (line 245) | def forward( class PaddleOCRMLP (line 296) | class PaddleOCRMLP(nn.Layer): method __init__ (line 297) | def __init__(self, config: PaddleOCRVisionConfig): method forward (line 314) | def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: class PaddleOCREncoderLayer (line 321) | class PaddleOCREncoderLayer(nn.Layer): method __init__ (line 322) | def __init__(self, config: PaddleOCRVisionConfig): method forward (line 348) | def forward( class PaddleOCRVisionRotaryEmbedding (line 386) | class PaddleOCRVisionRotaryEmbedding(nn.Layer): method __init__ (line 387) | def __init__(self, dim: int, theta: float = 10000.0) -> None: method rope_init (line 393) | def rope_init(self): method forward (line 398) | def forward(self, seqlen: int) -> paddle.Tensor: class PaddleOCREncoder (line 404) | class PaddleOCREncoder(nn.Layer): method __init__ (line 405) | def __init__(self, config: PaddleOCRVisionConfig): method flatten_list (line 421) | def flatten_list(image_grid_thw): method get_position_ids_vectorized (line 431) | def get_position_ids_vectorized(image_grid_thw, dtype="int64"): method build_window_index (line 458) | def build_window_index(self, image_grid, window_size): method recompute_training (line 493) | def recompute_training( method forward (line 530) | def forward( class PaddleOCRVisionTransformer (line 647) | class PaddleOCRVisionTransformer(nn.Layer): method __init__ (line 648) | def __init__(self, config: PaddleOCRVisionConfig): method forward (line 664) | def forward( class PaddleOCRVisionPreTrainedModel (line 776) | class PaddleOCRVisionPreTrainedModel(PretrainedModel): class PaddleOCRVisionModel (line 791) | class PaddleOCRVisionModel(PaddleOCRVisionPreTrainedModel): method __init__ (line 795) | def __init__(self, config: PaddleOCRVisionConfig): method get_input_embeddings (line 800) | def get_input_embeddings(self) -> nn.Layer: method forward (line 803) | def forward( class Projector (line 834) | class Projector(nn.Layer): method __init__ (line 835) | def __init__(self, text_config: PaddleOCRVLConfig, vision_config: Padd... method forward (line 865) | def forward(self, image_features, image_grid_thw): class PaddleOCRRotaryEmbedding (line 972) | class PaddleOCRRotaryEmbedding(nn.Layer): method __init__ (line 973) | def __init__(self, config: PaddleOCRVLConfig): method compute_default_rope_parameters (line 990) | def compute_default_rope_parameters( method forward (line 1016) | def forward(self, x, position_ids): class Ernie4_5Attention (line 1034) | class Ernie4_5Attention(nn.Layer): method __init__ (line 1037) | def __init__(self, config, layer_idx=0): method forward (line 1105) | def forward( class Ernie4_5DecoderLayer (line 1189) | class Ernie4_5DecoderLayer(nn.Layer): method __init__ (line 1196) | def __init__(self, config, layer_idx): method forward (line 1235) | def forward( class Ernie4_5PretrainedModel (line 1304) | class Ernie4_5PretrainedModel(PretrainedModel): method _gen_aoa_config (line 1325) | def _gen_aoa_config(cls, config: PaddleOCRVLConfig): method _gen_inv_aoa_config (line 1407) | def _gen_inv_aoa_config(cls, config: PaddleOCRVLConfig): class Ernie4_5Model (line 1490) | class Ernie4_5Model(Ernie4_5PretrainedModel): method __init__ (line 1493) | def __init__(self, config: PaddleOCRVLConfig): method recompute_training (line 1520) | def recompute_training( method forward (line 1563) | def forward( class PaddleOCRVLCausalLMOutputWithPast (line 1724) | class PaddleOCRVLCausalLMOutputWithPast(ModelOutput): class PaddleOCRVLModel (line 1733) | class PaddleOCRVLModel(Ernie4_5PretrainedModel): method __init__ (line 1736) | def __init__(self, config: PaddleOCRVLConfig): class PaddleOCRVLForConditionalGeneration (line 1742) | class PaddleOCRVLForConditionalGeneration(Ernie4_5PretrainedModel, Gener... method __init__ (line 1749) | def __init__(self, config: PaddleOCRVLConfig): method get_input_embeddings (line 1764) | def get_input_embeddings(self): method set_input_embeddings (line 1767) | def set_input_embeddings(self, value): method get_output_embeddings (line 1770) | def get_output_embeddings(self): method set_output_embeddings (line 1773) | def set_output_embeddings(self, new_embeddings): method set_decoder (line 1776) | def set_decoder(self, decoder): method get_decoder (line 1779) | def get_decoder(self): method freeze_vision (line 1782) | def freeze_vision(self): method get_rope_index (line 1786) | def get_rope_index( method prepare_attention_mask_for_generation (line 1914) | def prepare_attention_mask_for_generation(self, input_ids, pad_token_i... method prepare_inputs_for_generation (line 1920) | def prepare_inputs_for_generation( method forward (line 1967) | def forward( FILE: paddleformers/transformers/phi3/configuration.py class Phi3Config (line 20) | class Phi3Config(PretrainedConfig): method __init__ (line 31) | def __init__( method _rope_parameters_adjustment (line 105) | def _rope_parameters_adjustment(self): method _rope_parameters_validation (line 115) | def _rope_parameters_validation(self): FILE: paddleformers/transformers/processing_utils.py class _LazyAutoProcessorMapping (line 68) | class _LazyAutoProcessorMapping(dict): method __getitem__ (line 81) | def __getitem__(self, key): method __contains__ (line 88) | def __contains__(self, key): method keys (line 91) | def keys(self): function _get_modality_for_attribute (line 105) | def _get_modality_for_attribute(attribute_name: str) -> str: class VideosKwargs (line 124) | class VideosKwargs(TypedDict, total=False): class ProcessingKwargs (line 206) | class ProcessingKwargs(ProcessingKwargs_hf): class AllKwargsForChatTemplate (line 213) | class AllKwargsForChatTemplate(AllKwargsForChatTemplate_hf): class MultiModalData (line 219) | class MultiModalData: method __contains__ (line 234) | def __contains__(self, key): method __getitem__ (line 237) | def __getitem__(self, key): class PaddleProcessorMixin (line 243) | class PaddleProcessorMixin: method __init__ (line 248) | def __init__(self, *args, **kwargs): method __call__ (line 251) | def __call__( method check_argument_for_proper_class (line 261) | def check_argument_for_proper_class(self, argument_name, argument): method to_dict (line 283) | def to_dict(self, legacy_serialization=True) -> dict[str, Any]: method to_json_string (line 344) | def to_json_string(self, legacy_serialization=True) -> str: method to_json_file (line 355) | def to_json_file(self, json_file_path: Union[str, os.PathLike], legacy... method save_pretrained (line 366) | def save_pretrained(self, save_directory, push_to_hub: bool = False, l... method get_processor_dict (line 480) | def get_processor_dict( method from_args_and_dict (line 638) | def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kw... method from_pretrained (line 691) | def from_pretrained( method get_attributes (line 701) | def get_attributes(cls): method _load_tokenizer_from_pretrained (line 718) | def _load_tokenizer_from_pretrained( method _get_arguments_from_pretrained (line 738) | def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path,... method get_possibly_dynamic_module (line 814) | def get_possibly_dynamic_module(module_name): method batch_decode (line 833) | def batch_decode(self, *args, **kwargs): method decode (line 842) | def decode(self, *args, **kwargs): method model_input_names (line 852) | def model_input_names(self): method apply_chat_template (line 860) | def apply_chat_template( function warp_processormixin (line 1058) | def warp_processormixin(hf_processormixin_class: ProcessorMixin_hf): class ProcessorMixin (line 1062) | class ProcessorMixin(PaddleProcessorMixin, ProcessorMixin_hf): method init (line 1063) | def init(self, *args, **kwargs):