SYMBOL INDEX (475 symbols across 44 files) FILE: data/prepare_data.py function is_git_lfs_installed (line 17) | def is_git_lfs_installed(): function is_huggingface_git_url (line 27) | def is_huggingface_git_url(url): function is_github_repo_url (line 36) | def is_github_repo_url(url): function is_s3_url (line 47) | def is_s3_url(url): function clone_git_repo (line 77) | def clone_git_repo(data_source, destination_dir): function download_from_s3 (line 111) | def download_from_s3(url, destination_dir, access_key_id = None, function download_from_url (line 185) | def download_from_url(url, destination_dir): function prepare_data (line 218) | def prepare_data(data_source, destination_dir, access_key_id=None, secre... function main (line 253) | def main(): FILE: inference/bot.py class StopWordsCriteria (line 18) | class StopWordsCriteria(StoppingCriteria): method __init__ (line 19) | def __init__(self, tokenizer, stop_words, stream_callback): method __call__ (line 26) | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTen... class ChatModel (line 47) | class ChatModel: method __init__ (line 51) | def __init__(self, model_name, gpu_id, max_memory): method do_inference (line 86) | def do_inference(self, prompt, max_new_tokens, do_sample, temperature,... class OpenChatKitShell (line 109) | class OpenChatKitShell(cmd.Cmd): method __init__ (line 113) | def __init__(self, gpu_id, model_name_or_path, max_tokens, sample, tem... method preloop (line 125) | def preloop(self): method precmd (line 136) | def precmd(self, line): method do_say (line 142) | def do_say(self, arg): method do_raw_say (line 163) | def do_raw_say(self, arg): method do_raw_prompt (line 174) | def do_raw_prompt(self, arg): method do_reset (line 177) | def do_reset(self, arg): method do_hyperparameters (line 181) | def do_hyperparameters(self, arg): method do_quit (line 190) | def do_quit(self, arg): function main (line 194) | def main(): FILE: inference/conversation.py function clean_response (line 11) | def clean_response(response): class Conversation (line 17) | class Conversation: method __init__ (line 18) | def __init__(self, human_id, bot_id): method push_context_turn (line 26) | def push_context_turn(self, context): method push_human_turn (line 30) | def push_human_turn(self, query): method push_model_response (line 34) | def push_model_response(self, response): method get_last_turn (line 44) | def get_last_turn(self): method get_raw_prompt (line 50) | def get_raw_prompt(self): method from_raw_prompt (line 54) | def from_raw_prompt(cls, value): FILE: pretrained/prepare_pretrained.py function prepare_pretrained (line 10) | def prepare_pretrained(save_path, model_name, offload_dir=None): function main (line 53) | def main(): FILE: retrieval/wikipedia.py function mean_pooling (line 16) | def mean_pooling(token_embeddings, mask): function cos_sim_2d (line 21) | def cos_sim_2d(x, y): class WikipediaIndex (line 27) | class WikipediaIndex: method __init__ (line 28) | def __init__(self): method search (line 42) | def search(self, query, k=1, w=5, w_th=0.5): FILE: tools/convert_to_hf_gptneox.py function create_empty_gptneox (line 14) | def create_empty_gptneox(config): function load_decentralized_checkpoint (line 33) | def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_... FILE: tools/convert_to_hf_llama.py function create_emtpy_llama (line 15) | def create_emtpy_llama(config): function load_decentralized_checkpoint (line 34) | def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_... FILE: tools/model_load_benchmark.py function benchmark (line 12) | def benchmark(model_dict: dict, device_name: str, repeat_infer: int): function main (line 132) | def main(input_file: str, output_file: str, device_name: str, repeat_inf... FILE: training/comm/comm_utils.py function get_lock (line 20) | def get_lock(): function get_data_parallel_comm (line 23) | def get_data_parallel_comm() -> NCCLCommunicator: function get_data_parallel_rank (line 28) | def get_data_parallel_rank() -> int: function get_data_parallel_world_size (line 33) | def get_data_parallel_world_size() -> int: function get_pipeline_parallel_comm (line 38) | def get_pipeline_parallel_comm() -> NCCLCommunicator: function get_pipeline_parallel_rank (line 43) | def get_pipeline_parallel_rank() -> int: function get_pipeline_parallel_world_size (line 48) | def get_pipeline_parallel_world_size() -> int: function get_megatron_tensor_parallel_comm (line 53) | def get_megatron_tensor_parallel_comm() -> NCCLCommunicator: function get_megatron_tensor_parallel_rank (line 58) | def get_megatron_tensor_parallel_rank() -> int: function get_megatron_tensor_parallel_world_size (line 63) | def get_megatron_tensor_parallel_world_size() -> int: function default_init (line 68) | def default_init(args): function init_communicators (line 84) | def init_communicators(args): function reinit_dp_communicator (line 159) | def reinit_dp_communicator(args): FILE: training/comm/nccl_backend.py function _type_torch_to_cupy (line 9) | def _type_torch_to_cupy(torch_type: torch.dtype): class NCCLCommunicator (line 24) | class NCCLCommunicator: method __init__ (line 25) | def __init__(self, method barrier (line 51) | def barrier(): method store_set (line 54) | def store_set(self, key, value): method store_get (line 57) | def store_get(self, key): method send (line 60) | def send(self, method recv (line 73) | def recv(self, method broadcast (line 87) | def broadcast(self, method reduce (line 99) | def reduce(self, method all_reduce (line 114) | def all_reduce(self, method scatter (line 127) | def scatter(self, method gather (line 147) | def gather(self, method all_to_all (line 167) | def all_to_all(self, method all_gather (line 178) | def all_gather(self, method all_reduce_opt (line 190) | def all_reduce_opt(self, FILE: training/comm/torch_backend.py class TorchCommunicator (line 5) | class TorchCommunicator: method __init__ (line 7) | def __init__(self, method barrier (line 18) | def barrier(self): method send (line 21) | def send(self, method recv (line 31) | def recv(self, method isend (line 43) | def isend(self, method irecv (line 54) | def irecv(self, method broadcast (line 67) | def broadcast(self, method reduce (line 78) | def reduce(self, method all_reduce (line 85) | def all_reduce(self, method gather (line 93) | def gather(self, method all_to_all (line 100) | def all_to_all(self, method all_gather (line 106) | def all_gather(self, FILE: training/data_parallel/dist_dp_allreduce.py class AllReduceDP (line 6) | class AllReduceDP: method __init__ (line 7) | def __init__(self, args, device, module: torch.nn.Module, optimizer: t... method _compute_total_para_num (line 51) | def _compute_total_para_num(self): method profile_mark_allreduce_start (line 60) | def profile_mark_allreduce_start(self, name=None): method profile_mark_allreduce_end (line 67) | def profile_mark_allreduce_end(self, name=None): method profile_mark_optimizer_step_start (line 72) | def profile_mark_optimizer_step_start(self): method _allreduce_gradients (line 76) | def _allreduce_gradients(self): method optimizer_step (line 93) | def optimizer_step(self): method set_time_stamp (line 101) | def set_time_stamp(self, init_time_stamp, init_event): method get_ts (line 105) | def get_ts(self, event): method profiling_data_parallel (line 108) | def profiling_data_parallel(self, init_time_stamp, init_event): FILE: training/data_parallel/dist_dp_central_ps.py class CentralPSDP (line 6) | class CentralPSDP: method __init__ (line 7) | def __init__(self, args, device, module: torch.nn.Module, optimizer: t... method _compute_total_para_num (line 58) | def _compute_total_para_num(self): method profile_mark_reduce_start (line 67) | def profile_mark_reduce_start(self, name=None): method profile_mark_reduce_end (line 74) | def profile_mark_reduce_end(self, name=None): method profile_mark_optimizer_step_start (line 81) | def profile_mark_optimizer_step_start(self): method profile_mark_broadcast_start (line 85) | def profile_mark_broadcast_start(self, name=None): method profile_mark_broadcast_end (line 92) | def profile_mark_broadcast_end(self, name=None): method _reduce_gradients (line 97) | def _reduce_gradients(self): method _broadcast_reduced_gradients (line 111) | def _broadcast_reduced_gradients(self): method optimizer_step (line 125) | def optimizer_step(self): method set_time_stamp (line 134) | def set_time_stamp(self, init_time_stamp, init_event): method get_ts (line 138) | def get_ts(self, event): method profiling_data_parallel (line 141) | def profiling_data_parallel(self, init_time_stamp, init_event): FILE: training/data_parallel/dist_dp_local.py class LocalDP (line 7) | class LocalDP: method __init__ (line 8) | def __init__(self, args, device, module: torch.nn.Module, optimizer: t... method _compute_total_para_num (line 54) | def _compute_total_para_num(self): method profile_mark_allreduce_start (line 63) | def profile_mark_allreduce_start(self, name=None): method profile_mark_allreduce_end (line 70) | def profile_mark_allreduce_end(self, name=None): method profile_mark_optimizer_step_start (line 75) | def profile_mark_optimizer_step_start(self): method allreduce_parameters (line 79) | def allreduce_parameters(self): method rollback_parameters (line 103) | def rollback_parameters(self): method optimizer_step (line 113) | def optimizer_step(self): method set_time_stamp (line 123) | def set_time_stamp(self, init_time_stamp, init_event): method get_ts (line 127) | def get_ts(self, event): method profiling_data_parallel (line 130) | def profiling_data_parallel(self, init_time_stamp, init_event): FILE: training/data_parallel/dist_dp_sharded_ps.py class ShardedPSDP (line 9) | class ShardedPSDP: method __init__ (line 10) | def __init__(self, args, device, module: torch.nn.Module, optimizer: t... method _compute_total_para_num (line 50) | def _compute_total_para_num(self): method _declare_grad_buffer (line 59) | def _declare_grad_buffer(self): method profile_mark_sync_grad_start (line 66) | def profile_mark_sync_grad_start(self): method profile_mark_allreduce_end (line 70) | def profile_mark_allreduce_end(self): method profile_mark_optimizer_step_start (line 73) | def profile_mark_optimizer_step_start(self): method _sync_gradients (line 77) | def _sync_gradients(self): method optimizer_step (line 87) | def optimizer_step(self): method set_time_stamp (line 95) | def set_time_stamp(self, init_time_stamp, init_event): method get_ts (line 99) | def get_ts(self, event): method profiling_data_parallel (line 102) | def profiling_data_parallel(self, init_time_stamp, init_event): FILE: training/data_parallel/dist_dp_utils.py function get_dp_module (line 6) | def get_dp_module(args, device, module, optimizer): FILE: training/data_parallel/flatten_utils.py function _assert_contiguous (line 4) | def _assert_contiguous(tensors): function flatten_params (line 12) | def flatten_params(param_set, chunk=None): function flatten_tensors (line 55) | def flatten_tensors(tensor_set, chunk=None): FILE: training/dist_clm_train.py function test_loop (line 24) | def test_loop(args, pipe, device, test_data_loader): function train_loop (line 76) | def train_loop(args, pipe, device, train_data_loader, test_data_loader, ... function calculate_training_steps (line 264) | def calculate_training_steps(args, train_data_loader) -> int: function main (line 325) | def main(): FILE: training/dist_prefixlm_train.py function test_loop (line 21) | def test_loop(args, pipe, device, test_data_loader): function train_loop (line 25) | def train_loop(args, pipe, device, train_data_loader, test_data_loader): function main (line 190) | def main(): FILE: training/lora/example/redpajama-incite-chat-3b.py function print_trainable_parameters (line 37) | def print_trainable_parameters(model): FILE: training/modules/deberta_modules.py function make_log_bucket_position (line 15) | def make_log_bucket_position(relative_pos, bucket_size, max_position): function build_relative_position (line 23) | def build_relative_position(query_size, key_size, bucket_size=-1, max_po... class DisentangledSelfAttention (line 35) | class DisentangledSelfAttention(nn.Module): method __init__ (line 37) | def __init__(self, config): method transpose_for_scores (line 75) | def transpose_for_scores(self, x, attention_heads): method forward (line 80) | def forward( method disentangled_attention_bias (line 135) | def disentangled_attention_bias(self, query_layer, key_layer, relative... class DebertaV2Layers (line 222) | class DebertaV2Layers(_DebertaV2Encoder): method __init__ (line 223) | def __init__(self, config, first_block=False): method get_rel_pos (line 261) | def get_rel_pos(self, hidden_states, query_states=None, relative_pos=N... method forward (line 269) | def forward( class DebertaClassificationHead (line 322) | class DebertaClassificationHead(nn.Module): method __init__ (line 323) | def __init__(self, config): method forward (line 335) | def forward(self, hidden_states, input_ids=None): FILE: training/modules/dist_deberta_pp_module.py class DebertaStageBase (line 5) | class DebertaStageBase(nn.Module): method __init__ (line 6) | def __init__(self, args, config): method _create_first_layer (line 11) | def _create_first_layer(self): method _create_last_layer (line 14) | def _create_last_layer(self): method _create_transformer_layers (line 17) | def _create_transformer_layers(self, first_block=False): class DebertaStageFirst (line 21) | class DebertaStageFirst(DebertaStageBase): method __init__ (line 22) | def __init__(self, args, config, device): method forward (line 28) | def forward(self, x, token_type_ids=None, attention_mask=None): class DebertaStageMiddle (line 40) | class DebertaStageMiddle(DebertaStageBase): method __init__ (line 41) | def __init__(self, args, config, device): method forward (line 46) | def forward(self, x, attention_mask=None): class DebertaStageLast (line 55) | class DebertaStageLast(DebertaStageBase): method __init__ (line 56) | def __init__(self, args, config, device): method forward (line 62) | def forward(self, x, attention_mask=None, input_ids=None): FILE: training/modules/dist_gpt_fsdp_module.py class GPTTransformerFsdpLayer (line 10) | class GPTTransformerFsdpLayer(torch.nn.Module): method __init__ (line 11) | def __init__(self, model_dim, head_num, feedforward_dim=2048, layer_no... method forward (line 32) | def forward(self, x: torch.Tensor) -> torch.Tensor: class GPTGlueFsdpModel (line 44) | class GPTGlueFsdpModel(torch.nn.Module): method __init__ (line 45) | def __init__(self, args, vocab_size, num_classes, use_checkpoint=True): method forward (line 56) | def forward(self, input_ids, position_ids): class GPTFsdpStageBase (line 62) | class GPTFsdpStageBase(torch.nn.Module): method __init__ (line 63) | def __init__(self, args, num_stage_layers, vocab_size, num_classes, us... method _create_first_layer (line 76) | def _create_first_layer(self): method _create_last_layer (line 84) | def _create_last_layer(self): method _create_fsdp_transformer_layer (line 92) | def _create_fsdp_transformer_layer(self): class GPTFsdpStageFirst (line 97) | class GPTFsdpStageFirst(GPTFsdpStageBase): method __init__ (line 98) | def __init__(self, args, num_stage_layers, vocab_size, num_classes, de... method forward (line 107) | def forward(self, x): class GPTFsdpStageMiddle (line 112) | class GPTFsdpStageMiddle(GPTFsdpStageBase): method __init__ (line 113) | def __init__(self, args, num_stage_layers, vocab_size, num_classes, de... method forward (line 122) | def forward(self, x): class GPTFsdpStageLast (line 127) | class GPTFsdpStageLast(GPTFsdpStageBase): method __init__ (line 128) | def __init__(self, args, num_stage_layers, vocab_size, num_classes, de... method forward (line 138) | def forward(self, x): FILE: training/modules/dist_gpt_pp_module.py class GPTStageBase (line 8) | class GPTStageBase(nn.Module): method __init__ (line 9) | def __init__(self, args, config): method _create_first_layer (line 45) | def _create_first_layer(self): method _create_last_layer (line 60) | def _create_last_layer(self): method _create_transformer_layer (line 75) | def _create_transformer_layer(self, layer_idx=0): class GPTStageFull (line 92) | class GPTStageFull(GPTStageBase): method __init__ (line 93) | def __init__(self, args, config, device): method forward (line 105) | def forward(self, x, **kargs): class GPTStageFirst (line 111) | class GPTStageFirst(GPTStageBase): method __init__ (line 112) | def __init__(self, args, config, device): method forward (line 120) | def forward(self, x, **kargs): class GPTStageMiddle (line 128) | class GPTStageMiddle(GPTStageBase): method __init__ (line 129) | def __init__(self, args, config, device): method forward (line 137) | def forward(self, x, **kargs): class GPTStageLast (line 145) | class GPTStageLast(GPTStageBase): method __init__ (line 146) | def __init__(self, args, config, device): method forward (line 162) | def forward(self, x, **kargs): FILE: training/modules/hf_gpt2_modules.py function gpt_loss_func (line 22) | def gpt_loss_func(input, target): class GPTEmbeddings (line 30) | class GPTEmbeddings(nn.Module): method __init__ (line 31) | def __init__(self, config): method forward (line 40) | def forward(self, input_ids, **kargs): class GPTAttention (line 61) | class GPTAttention(_GPT2Attention): method _attn (line 63) | def _attn(self, query, key, value, attention_mask=None, head_mask=None... method forward (line 110) | def forward( class GPTBlock (line 165) | class GPTBlock(_GPT2Block): method __init__ (line 166) | def __init__(self, config, layer_idx=None, use_checkpoint=True): method forward (line 194) | def forward(self, x: torch.Tensor, prefix_masks=None, **kargs) -> torc... class GPTModel (line 214) | class GPTModel(_GPT2Model): method __init__ (line 215) | def __init__(self, config): method forward (line 236) | def forward(self, input_ids, attention_mask=None, **kargs): class GPTLMHead (line 270) | class GPTLMHead(nn.Module): method __init__ (line 271) | def __init__(self, config): method forward (line 276) | def forward(self, x, **kargs): class GPTLMHeadModel (line 281) | class GPTLMHeadModel(_GPT2LMHeadModel): method __init__ (line 283) | def __init__(self, config): class GPTClassificationHead (line 296) | class GPTClassificationHead(nn.Module): method __init__ (line 297) | def __init__(self, config): method forward (line 303) | def forward(self, hidden_states, input_ids=None): class GPTForClassification (line 317) | class GPTForClassification(_GPT2ForSequenceClassification): method __init__ (line 319) | def __init__(self, config): FILE: training/modules/hf_gptj_modules.py function gpt_loss_func (line 23) | def gpt_loss_func(input, target): function fixed_pos_embedding (line 31) | def fixed_pos_embedding(x, seq_dim=1, seq_len=None): class GPTJMLP (line 40) | class GPTJMLP(_GPTJMLP): method __init__ (line 41) | def __init__(self, intermediate_size, config, device='cpu'): # in MLP... class GPTJAttention (line 52) | class GPTJAttention(_GPTJAttention): method __init__ (line 54) | def __init__(self, config, device='cpu'): method _attn (line 87) | def _attn( method forward (line 138) | def forward( class GPTEmbeddings (line 214) | class GPTEmbeddings(nn.Module): method __init__ (line 215) | def __init__(self, config, device='cpu'): method from_pretrained (line 223) | def from_pretrained(cls, model_path, config=None): method forward (line 236) | def forward(self, input_ids, *args, **kargs): class GPTBlock (line 245) | class GPTBlock(_GPTJBlock): method __init__ (line 246) | def __init__(self, config, *args, use_checkpoint=True, device='cpu', *... method from_pretrained (line 265) | def from_pretrained(cls, model_path, config=None, layer_index=None): method forward (line 280) | def forward(self, x: torch.Tensor, prefix_masks=None, layer_past=None,... class GPTLMHead (line 318) | class GPTLMHead(nn.Module): method __init__ (line 319) | def __init__(self, config, device='cpu'): method from_pretrained (line 325) | def from_pretrained(cls, model_path, config=None): method forward (line 338) | def forward(self, x, **kargs): FILE: training/modules/hf_gptneox_modules.py class FlashAttentionV2 (line 31) | class FlashAttentionV2(nn.Module): method __init__ (line 41) | def __init__(self, softmax_scale=None, attention_dropout=0.0): method forward (line 46) | def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens... function rotate_half (line 73) | def rotate_half(x): function apply_rotary_pos_emb (line 80) | def apply_rotary_pos_emb(q, k, cos, sin, offset=0): class GPTNeoXAttention (line 98) | class GPTNeoXAttention(_GPTNeoXAttention): method __init__ (line 100) | def __init__(self, config): method forward (line 130) | def forward( method _attn (line 228) | def _attn(self, query, key, value, attention_mask=None, head_mask=None): class GPTEmbeddings (line 281) | class GPTEmbeddings(nn.Module): method __init__ (line 283) | def __init__(self, config): method from_pretrained (line 291) | def from_pretrained(cls, model_path, config=None): method forward (line 307) | def forward(self, input_ids, *args, **kargs): class GPTBlock (line 316) | class GPTBlock(_GPTNeoXBlock): method __init__ (line 318) | def __init__(self, config, *args, use_checkpoint=True, **kargs): method from_pretrained (line 355) | def from_pretrained(cls, model_path, config=None, layer_index=None): method forward (line 373) | def forward(self, class GPTLMHead (line 423) | class GPTLMHead(nn.Module): method __init__ (line 425) | def __init__(self, config): method from_pretrained (line 434) | def from_pretrained(cls, model_path, config=None): method forward (line 450) | def forward(self, x, *args, **kargs): FILE: training/modules/hf_opt_modules.py function _make_causal_mask (line 15) | def _make_causal_mask( function _expand_mask (line 38) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option... function _prepare_decoder_attention_mask (line 51) | def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_... class GPTEmbeddings (line 72) | class GPTEmbeddings(nn.Module): method __init__ (line 73) | def __init__(self, config, device='cpu'): method from_pretrained (line 86) | def from_pretrained(cls, model_path, config=None): method forward (line 99) | def forward(self, input_ids, past_layer=None, mask=None, **kargs): class OPTAttention (line 143) | class OPTAttention(_OPTAttention): method __init__ (line 144) | def __init__( method _shape (line 172) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 175) | def forward( class GPTBlock (line 295) | class GPTBlock(OPTDecoderLayer): method __init__ (line 296) | def __init__(self, config, *args, use_checkpoint=True, device='cpu', *... method from_pretrained (line 362) | def from_pretrained(cls, model_path, config=None, layer_index=None): method forward (line 377) | def forward(self, x: torch.Tensor, layer_past=None, mask=None, *args, ... class GPTLMHead (line 446) | class GPTLMHead(nn.Module): method __init__ (line 447) | def __init__(self, config, device='cpu'): method from_pretrained (line 463) | def from_pretrained(cls, model_path, config=None): method forward (line 476) | def forward(self, x, input_ids=None, *args, **kargs): FILE: training/modules/llama_modules.py class RotaryEmbedding (line 43) | class RotaryEmbedding(torch.nn.Module): method __init__ (line 61) | def __init__( method _compute_inv_freq (line 109) | def _compute_inv_freq(self, device=None): method _update_cos_sin_cache (line 118) | def _update_cos_sin_cache(self, seqlen, device=None, dtype=None): method forward (line 169) | def forward( class FlashAttentionV2 (line 237) | class FlashAttentionV2(nn.Module): method __init__ (line 248) | def __init__(self, softmax_scale=None, attention_dropout=0.0): method forward (line 253) | def forward( function _make_causal_mask (line 300) | def _make_causal_mask( function _make_causal_mask_device (line 321) | def _make_causal_mask_device( function _expand_mask (line 351) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option... function _prepare_decoder_attention_mask (line 367) | def _prepare_decoder_attention_mask( function rmsnorm_func (line 396) | def rmsnorm_func(hidden_states, weight, variance_epsilon): class RMSNorm (line 404) | class RMSNorm(nn.Module): method __init__ (line 405) | def __init__(self, hidden_size, eps=1e-6): method forward (line 417) | def forward(self, hidden_states): class LlamaMLP (line 421) | class LlamaMLP(nn.Module): method __init__ (line 422) | def __init__( method forward (line 434) | def forward(self, x): class LlamaAttention (line 438) | class LlamaAttention(nn.Module): method __init__ (line 441) | def __init__( method _shape (line 510) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): method forward (line 517) | def forward( class LlamaDecoderLayer (line 559) | class LlamaDecoderLayer(nn.Module): method __init__ (line 560) | def __init__(self, config: LlamaConfig): method forward (line 578) | def forward( class GPTEmbeddings (line 632) | class GPTEmbeddings(nn.Module): method __init__ (line 633) | def __init__(self, config, device="cpu"): method forward (line 643) | def forward( class GPTLMHead (line 654) | class GPTLMHead(nn.Module): method __init__ (line 655) | def __init__(self, config, device="cpu"): method forward (line 663) | def forward( class GPTBlock (line 676) | class GPTBlock(nn.Module): method __init__ (line 677) | def __init__(self, config: LlamaConfig, *args, **kargs): method forward (line 724) | def forward( FILE: training/modules/task_modules.py class GlueClassification (line 4) | class GlueClassification(torch.nn.Module): method __init__ (line 5) | def __init__(self, model_dim, num_classes): method forward (line 12) | def forward(self, hidden_states, pooler_index=0): FILE: training/modules/tokenizer.py function build_tokenizer (line 4) | def build_tokenizer(args): function build_gpt2_tokenizer (line 10) | def build_gpt2_tokenizer(args): function build_deberta_tokenizer (line 15) | def build_deberta_tokenizer(args): FILE: training/modules/utils.py function gpt_loss_func (line 10) | def gpt_loss_func(input, target): FILE: training/optimizer/grad_scalar.py class GradScaler (line 7) | class GradScaler(ABC): method __init__ (line 8) | def __init__(self, initial_scale, device=None): method scale (line 15) | def scale(self): method inv_scale (line 19) | def inv_scale(self): method update (line 23) | def update(self, found_inf): method state_dict (line 27) | def state_dict(self): method load_state_dict (line 31) | def load_state_dict(self, state_dict): class ConstantGradScaler (line 35) | class ConstantGradScaler(GradScaler): method update (line 37) | def update(self, found_inf): method state_dict (line 40) | def state_dict(self): method load_state_dict (line 43) | def load_state_dict(self, state_dict): class DynamicGradScaler (line 47) | class DynamicGradScaler(GradScaler): method __init__ (line 49) | def __init__(self, initial_scale, min_scale, method update (line 79) | def update(self, found_inf): method state_dict (line 102) | def state_dict(self): method load_state_dict (line 109) | def load_state_dict(self, state_dict): FILE: training/optimizer/optimizer.py function _has_overflow_serial (line 7) | def _has_overflow_serial(grads): function _zero_grad_group (line 40) | def _zero_grad_group(group, set_to_none): class Fp16Optimizer (line 62) | class Fp16Optimizer: method __init__ (line 64) | def __init__(self, optimizer, grad_scaler, device, offload=False): method zero_grad (line 113) | def zero_grad(self, set_to_none=True): method get_loss_scale (line 120) | def get_loss_scale(self): method _copy_model_grads_to_optimizer_grads (line 123) | def _copy_model_grads_to_optimizer_grads(self): method _unscale_optimizer_grads_and_check_for_nan (line 138) | def _unscale_optimizer_grads_and_check_for_nan(self): method _get_model_and_optimizer_params_data_float16_deprecated (line 157) | def _get_model_and_optimizer_params_data_float16_deprecated(self): method _copy_optimizer_params_to_model_params (line 166) | def _copy_optimizer_params_to_model_params(self): method _copy_model_params_to_optimizer_params (line 179) | def _copy_model_params_to_optimizer_params(self): method reload_model_params (line 191) | def reload_model_params(self): method step (line 195) | def step(self): method scale (line 216) | def scale(self, z): method unscale (line 219) | def unscale(self, z): method state_dict (line 222) | def state_dict(self): method load_state_dict (line 225) | def load_state_dict(self, state_dict): function get_fp16_optimizer (line 229) | def get_fp16_optimizer(args, optimizer, device): FILE: training/pipeline_parallel/dist_gpipe_pipeline_async.py function get_parameter_names (line 16) | def get_parameter_names(model, forbidden_layer_types): function create_optimizer (line 32) | def create_optimizer(model, optimizer_type, weight_decay=0.01, learning_... class GpipeAsync (line 67) | class GpipeAsync: method __init__ (line 78) | def __init__(self, args, config, device, use_dp=False, method _compute_micro_batch_size (line 225) | def _compute_micro_batch_size(self): method zero_input_grad (line 236) | def zero_input_grad(self): method profile_mark_forward_comp_start (line 242) | def profile_mark_forward_comp_start(self, i): method profile_mark_forward_recv_start (line 247) | def profile_mark_forward_recv_start(self, i): method profile_mark_forward_send_start (line 252) | def profile_mark_forward_send_start(self, i): method profile_mark_forward_send_end (line 257) | def profile_mark_forward_send_end(self, i): method profile_mark_backward_comp_start (line 262) | def profile_mark_backward_comp_start(self, i): method profile_mark_backward_recv_start (line 267) | def profile_mark_backward_recv_start(self, i): method profile_mark_backward_send_start (line 272) | def profile_mark_backward_send_start(self, i): method profile_mark_backward_send_end (line 277) | def profile_mark_backward_send_end(self, i): method get_ts (line 282) | def get_ts(self, event): method forward_stage (line 285) | def forward_stage(self, input_data=None, aux_input_data=None): method profiling_forward_stage (line 388) | def profiling_forward_stage(self): method backward_stage (line 417) | def backward_stage(self, cached_output_micro_batches: List[torch.Tenso... method profiling_backward_stage (line 521) | def profiling_backward_stage(self): method save_on_disk (line 549) | def save_on_disk(self, path): method optimizer_step (line 553) | def optimizer_step(self): method profiling_optimizer_step (line 574) | def profiling_optimizer_step(self): method export_profiling_result (line 587) | def export_profiling_result(self, filename): method sgd_iter (line 591) | def sgd_iter(self, input_=None, target=None, method infer_stage (line 656) | def infer_stage(self, input_data=None, aux_input_data=None, method infer_iter (line 736) | def infer_iter(self, input_=None, target=None, FILE: training/pipeline_parallel/dist_pp_utils.py function get_pp_module (line 4) | def get_pp_module(args, config, device, use_dp): FILE: training/tasks/data_loaders/data_utils.py function random_chunk (line 32) | def random_chunk(li, min_chunk=1, max_chunk=5): class UL2RProcessor (line 42) | class UL2RProcessor: method __init__ (line 48) | def __init__(self, tokenizer, seq_length=1024): method preprocess_tokens_s2s (line 59) | def preprocess_tokens_s2s(self, tokens): method preprocess_tokens_nlg (line 76) | def preprocess_tokens_nlg(self, tokens): method preprocess_tokens_nlu (line 98) | def preprocess_tokens_nlu(self, tokens): method preprocess_ul2r (line 136) | def preprocess_ul2r(self, inputs): method preprocess_random (line 146) | def preprocess_random(self, inputs): method __call__ (line 168) | def __call__(self, inputs): class StreamDataset (line 175) | class StreamDataset(IterableDataset): method __init__ (line 177) | def __init__(self, data, tokenizer, seq_length=1024, doc_separator=Non... method state_dict (line 187) | def state_dict(self): method load_state_dict (line 190) | def load_state_dict(self, state_dict): method get_sequence (line 193) | def get_sequence(self): method get_stream (line 208) | def get_stream(self): method __iter__ (line 214) | def __iter__(self): class StreamDatasetList (line 220) | class StreamDatasetList(IterableDataset): method __init__ (line 221) | def __init__(self, task_names, datasets, sample_probs, tokenizer, seq_... method state_dict (line 234) | def state_dict(self): method load_state_dict (line 237) | def load_state_dict(self, state_dict): method get_sequence (line 240) | def get_sequence(self): method get_stream (line 268) | def get_stream(self): method __iter__ (line 271) | def __iter__(self): method tokenize_function (line 276) | def tokenize_function(self, examples): method get_dataset_token_count (line 285) | def get_dataset_token_count(self) -> int: method get_dataset_example_count (line 314) | def get_dataset_example_count(self) -> int: function name_to_dataset (line 329) | def name_to_dataset(task, tokenizer, args): function name_to_dataset_eval (line 343) | def name_to_dataset_eval(task, tokenizer, args): function get_train_data_loader (line 352) | def get_train_data_loader(args, tokenizer, num_workers=1, state_dict=None): function get_eval_data_loader (line 407) | def get_eval_data_loader(args, tokenizer, num_workers=1, state_dict=None): function get_ul2r_train_data_loader (line 434) | def get_ul2r_train_data_loader(args, tokenizer, num_workers=1, state_dic... FILE: training/tasks/data_loaders/prosocial.py class StreamDataset (line 14) | class StreamDataset(IterableDataset): method __init__ (line 15) | def __init__(self, dataset, tokenizer, seq_length=1024): method state_dict (line 25) | def state_dict(self): method load_state_dict (line 30) | def load_state_dict(self, state_dict): method get_sequence (line 34) | def get_sequence(self): method get_stream (line 66) | def get_stream(self): method __iter__ (line 69) | def __iter__(self): FILE: training/utils/dist_args_utils.py function add_device_arguments (line 1) | def add_device_arguments(parser): function add_torch_distributed_arguments (line 12) | def add_torch_distributed_arguments(parser): function add_task_arguments (line 29) | def add_task_arguments(parser): function add_model_arguments (line 46) | def add_model_arguments(parser): function add_training_hyper_parameter_arguments (line 57) | def add_training_hyper_parameter_arguments(parser): function add_mixed_precision_arguments (line 72) | def add_mixed_precision_arguments(parser): function add_parallel_schema_arguments (line 90) | def add_parallel_schema_arguments(parser): function get_model_arguments_str (line 99) | def get_model_arguments_str(args): function get_dist_arguments_str (line 103) | def get_dist_arguments_str(args, add_rank=True): function get_learning_arguments_str (line 111) | def get_learning_arguments_str(args): function get_mixed_precision_arguments_str (line 115) | def get_mixed_precision_arguments_str(args): FILE: training/utils/dist_checkpoint_utils.py function load_checkpoint (line 11) | def load_checkpoint(pipe, args): function save_checkpoint (line 64) | def save_checkpoint(pipe, args) -> str: function save_stream_dataloader_state_dict (line 107) | def save_stream_dataloader_state_dict(dataloader, pipe, args): function load_stream_dataloader_state_dict (line 121) | def load_stream_dataloader_state_dict(dataloader, pipe, args): FILE: training/utils/dist_debug_utils.py function print_cuda_memory (line 4) | def print_cuda_memory(args, info: str, device=None): function print_multi_cuda_memory (line 12) | def print_multi_cuda_memory(args, info: str): FILE: training/utils/event_report.py class EventReporter (line 28) | class EventReporter: method __init__ (line 75) | def __init__(self, host=None, auth_token=None, job_id=None): method is_enabled (line 80) | def is_enabled(self) -> bool: method report (line 114) | def report(self, object, message, event_type, function add_entry_reporter_arguments (line 188) | def add_entry_reporter_arguments(parser): function main (line 195) | def main(): FILE: training/utils/logging_utils.py function init_train_logger (line 19) | def init_train_logger(args): function train_log (line 46) | def train_log(x, *args, **kargs): FILE: training/utils/upload_manager.py class UploadManager (line 11) | class UploadManager: method __init__ (line 12) | def __init__(self, aws_endpoint_url: str, aws_access_key_id: str, method add_task (line 43) | def add_task(self, directory: str, checkpoint_upload_prefix: str, step... method wait (line 58) | def wait(self): method _report_event (line 62) | def _report_event(self, **kwargs): method _wait_for_file_write_to_finish (line 66) | def _wait_for_file_write_to_finish(self, file_path: str, wait_start_ti... method _execute_task (line 81) | def _execute_task(self, directory, s3_bucket, s3_key_prefix, step: int): function add_aws_arguments (line 184) | def add_aws_arguments(parser: argparse.ArgumentParser): function aws_process_args (line 191) | def aws_process_args(args: argparse.Namespace, required: bool = False): function main (line 207) | def main():