SYMBOL INDEX (475 symbols across 44 files)

FILE: data/prepare_data.py
  function is_git_lfs_installed (line 17) | def is_git_lfs_installed():
  function is_huggingface_git_url (line 27) | def is_huggingface_git_url(url):
  function is_github_repo_url (line 36) | def is_github_repo_url(url):
  function is_s3_url (line 47) | def is_s3_url(url):
  function clone_git_repo (line 77) | def clone_git_repo(data_source, destination_dir):
  function download_from_s3 (line 111) | def download_from_s3(url, destination_dir, access_key_id = None,
  function download_from_url (line 185) | def download_from_url(url, destination_dir):
  function prepare_data (line 218) | def prepare_data(data_source, destination_dir, access_key_id=None, secre...
  function main (line 253) | def main():

FILE: inference/bot.py
  class StopWordsCriteria (line 18) | class StopWordsCriteria(StoppingCriteria):
    method __init__ (line 19) | def __init__(self, tokenizer, stop_words, stream_callback):
    method __call__ (line 26) | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTen...
  class ChatModel (line 47) | class ChatModel:
    method __init__ (line 51) | def __init__(self, model_name, gpu_id, max_memory):
    method do_inference (line 86) | def do_inference(self, prompt, max_new_tokens, do_sample, temperature,...
  class OpenChatKitShell (line 109) | class OpenChatKitShell(cmd.Cmd):
    method __init__ (line 113) | def __init__(self, gpu_id, model_name_or_path, max_tokens, sample, tem...
    method preloop (line 125) | def preloop(self):
    method precmd (line 136) | def precmd(self, line):
    method do_say (line 142) | def do_say(self, arg):
    method do_raw_say (line 163) | def do_raw_say(self, arg):
    method do_raw_prompt (line 174) | def do_raw_prompt(self, arg):
    method do_reset (line 177) | def do_reset(self, arg):
    method do_hyperparameters (line 181) | def do_hyperparameters(self, arg):
    method do_quit (line 190) | def do_quit(self, arg):
  function main (line 194) | def main():

FILE: inference/conversation.py
  function clean_response (line 11) | def clean_response(response):
  class Conversation (line 17) | class Conversation:
    method __init__ (line 18) | def __init__(self, human_id, bot_id):
    method push_context_turn (line 26) | def push_context_turn(self, context):
    method push_human_turn (line 30) | def push_human_turn(self, query):
    method push_model_response (line 34) | def push_model_response(self, response):
    method get_last_turn (line 44) | def get_last_turn(self):
    method get_raw_prompt (line 50) | def get_raw_prompt(self):
    method from_raw_prompt (line 54) | def from_raw_prompt(cls, value):

FILE: pretrained/prepare_pretrained.py
  function prepare_pretrained (line 10) | def prepare_pretrained(save_path, model_name, offload_dir=None):
  function main (line 53) | def main():

FILE: retrieval/wikipedia.py
  function mean_pooling (line 16) | def mean_pooling(token_embeddings, mask):
  function cos_sim_2d (line 21) | def cos_sim_2d(x, y):
  class WikipediaIndex (line 27) | class WikipediaIndex:
    method __init__ (line 28) | def __init__(self):
    method search (line 42) | def search(self, query, k=1, w=5, w_th=0.5):

FILE: tools/convert_to_hf_gptneox.py
  function create_empty_gptneox (line 14) | def create_empty_gptneox(config):
  function load_decentralized_checkpoint (line 33) | def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_...

FILE: tools/convert_to_hf_llama.py
  function create_emtpy_llama (line 15) | def create_emtpy_llama(config):
  function load_decentralized_checkpoint (line 34) | def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_...

FILE: tools/model_load_benchmark.py
  function benchmark (line 12) | def benchmark(model_dict: dict, device_name: str, repeat_infer: int):
  function main (line 132) | def main(input_file: str, output_file: str, device_name: str, repeat_inf...

FILE: training/comm/comm_utils.py
  function get_lock (line 20) | def get_lock():
  function get_data_parallel_comm (line 23) | def get_data_parallel_comm() -> NCCLCommunicator:
  function get_data_parallel_rank (line 28) | def get_data_parallel_rank() -> int:
  function get_data_parallel_world_size (line 33) | def get_data_parallel_world_size() -> int:
  function get_pipeline_parallel_comm (line 38) | def get_pipeline_parallel_comm() -> NCCLCommunicator:
  function get_pipeline_parallel_rank (line 43) | def get_pipeline_parallel_rank() -> int:
  function get_pipeline_parallel_world_size (line 48) | def get_pipeline_parallel_world_size() -> int:
  function get_megatron_tensor_parallel_comm (line 53) | def get_megatron_tensor_parallel_comm() -> NCCLCommunicator:
  function get_megatron_tensor_parallel_rank (line 58) | def get_megatron_tensor_parallel_rank() -> int:
  function get_megatron_tensor_parallel_world_size (line 63) | def get_megatron_tensor_parallel_world_size() -> int:
  function default_init (line 68) | def default_init(args):
  function init_communicators (line 84) | def init_communicators(args):
  function reinit_dp_communicator (line 159) | def reinit_dp_communicator(args):

FILE: training/comm/nccl_backend.py
  function _type_torch_to_cupy (line 9) | def _type_torch_to_cupy(torch_type: torch.dtype):
  class NCCLCommunicator (line 24) | class NCCLCommunicator:
    method __init__ (line 25) | def __init__(self,
    method barrier (line 51) | def barrier():
    method store_set (line 54) | def store_set(self, key, value):
    method store_get (line 57) | def store_get(self, key):
    method send (line 60) | def send(self,
    method recv (line 73) | def recv(self,
    method broadcast (line 87) | def broadcast(self,
    method reduce (line 99) | def reduce(self,
    method all_reduce (line 114) | def all_reduce(self,
    method scatter (line 127) | def scatter(self,
    method gather (line 147) | def gather(self,
    method all_to_all (line 167) | def all_to_all(self,
    method all_gather (line 178) | def all_gather(self,
    method all_reduce_opt (line 190) | def all_reduce_opt(self,

FILE: training/comm/torch_backend.py
  class TorchCommunicator (line 5) | class TorchCommunicator:
    method __init__ (line 7) | def __init__(self,
    method barrier (line 18) | def barrier(self):
    method send (line 21) | def send(self,
    method recv (line 31) | def recv(self,
    method isend (line 43) | def isend(self,
    method irecv (line 54) | def irecv(self,
    method broadcast (line 67) | def broadcast(self,
    method reduce (line 78) | def reduce(self,
    method all_reduce (line 85) | def all_reduce(self,
    method gather (line 93) | def gather(self,
    method all_to_all (line 100) | def all_to_all(self,
    method all_gather (line 106) | def all_gather(self,

FILE: training/data_parallel/dist_dp_allreduce.py
  class AllReduceDP (line 6) | class AllReduceDP:
    method __init__ (line 7) | def __init__(self, args, device, module: torch.nn.Module, optimizer: t...
    method _compute_total_para_num (line 51) | def _compute_total_para_num(self):
    method profile_mark_allreduce_start (line 60) | def profile_mark_allreduce_start(self, name=None):
    method profile_mark_allreduce_end (line 67) | def profile_mark_allreduce_end(self, name=None):
    method profile_mark_optimizer_step_start (line 72) | def profile_mark_optimizer_step_start(self):
    method _allreduce_gradients (line 76) | def _allreduce_gradients(self):
    method optimizer_step (line 93) | def optimizer_step(self):
    method set_time_stamp (line 101) | def set_time_stamp(self, init_time_stamp, init_event):
    method get_ts (line 105) | def get_ts(self, event):
    method profiling_data_parallel (line 108) | def profiling_data_parallel(self, init_time_stamp, init_event):

FILE: training/data_parallel/dist_dp_central_ps.py
  class CentralPSDP (line 6) | class CentralPSDP:
    method __init__ (line 7) | def __init__(self, args, device, module: torch.nn.Module, optimizer: t...
    method _compute_total_para_num (line 58) | def _compute_total_para_num(self):
    method profile_mark_reduce_start (line 67) | def profile_mark_reduce_start(self, name=None):
    method profile_mark_reduce_end (line 74) | def profile_mark_reduce_end(self, name=None):
    method profile_mark_optimizer_step_start (line 81) | def profile_mark_optimizer_step_start(self):
    method profile_mark_broadcast_start (line 85) | def profile_mark_broadcast_start(self, name=None):
    method profile_mark_broadcast_end (line 92) | def profile_mark_broadcast_end(self, name=None):
    method _reduce_gradients (line 97) | def _reduce_gradients(self):
    method _broadcast_reduced_gradients (line 111) | def _broadcast_reduced_gradients(self):
    method optimizer_step (line 125) | def optimizer_step(self):
    method set_time_stamp (line 134) | def set_time_stamp(self, init_time_stamp, init_event):
    method get_ts (line 138) | def get_ts(self, event):
    method profiling_data_parallel (line 141) | def profiling_data_parallel(self, init_time_stamp, init_event):

FILE: training/data_parallel/dist_dp_local.py
  class LocalDP (line 7) | class LocalDP:
    method __init__ (line 8) | def __init__(self, args, device, module: torch.nn.Module, optimizer: t...
    method _compute_total_para_num (line 54) | def _compute_total_para_num(self):
    method profile_mark_allreduce_start (line 63) | def profile_mark_allreduce_start(self, name=None):
    method profile_mark_allreduce_end (line 70) | def profile_mark_allreduce_end(self, name=None):
    method profile_mark_optimizer_step_start (line 75) | def profile_mark_optimizer_step_start(self):
    method allreduce_parameters (line 79) | def allreduce_parameters(self):
    method rollback_parameters (line 103) | def rollback_parameters(self):
    method optimizer_step (line 113) | def optimizer_step(self):
    method set_time_stamp (line 123) | def set_time_stamp(self, init_time_stamp, init_event):
    method get_ts (line 127) | def get_ts(self, event):
    method profiling_data_parallel (line 130) | def profiling_data_parallel(self, init_time_stamp, init_event):

FILE: training/data_parallel/dist_dp_sharded_ps.py
  class ShardedPSDP (line 9) | class ShardedPSDP:
    method __init__ (line 10) | def __init__(self, args, device, module: torch.nn.Module, optimizer: t...
    method _compute_total_para_num (line 50) | def _compute_total_para_num(self):
    method _declare_grad_buffer (line 59) | def _declare_grad_buffer(self):
    method profile_mark_sync_grad_start (line 66) | def profile_mark_sync_grad_start(self):
    method profile_mark_allreduce_end (line 70) | def profile_mark_allreduce_end(self):
    method profile_mark_optimizer_step_start (line 73) | def profile_mark_optimizer_step_start(self):
    method _sync_gradients (line 77) | def _sync_gradients(self):
    method optimizer_step (line 87) | def optimizer_step(self):
    method set_time_stamp (line 95) | def set_time_stamp(self, init_time_stamp, init_event):
    method get_ts (line 99) | def get_ts(self, event):
    method profiling_data_parallel (line 102) | def profiling_data_parallel(self, init_time_stamp, init_event):

FILE: training/data_parallel/dist_dp_utils.py
  function get_dp_module (line 6) | def get_dp_module(args, device, module, optimizer):

FILE: training/data_parallel/flatten_utils.py
  function _assert_contiguous (line 4) | def _assert_contiguous(tensors):
  function flatten_params (line 12) | def flatten_params(param_set, chunk=None):
  function flatten_tensors (line 55) | def flatten_tensors(tensor_set, chunk=None):

FILE: training/dist_clm_train.py
  function test_loop (line 24) | def test_loop(args, pipe, device, test_data_loader):
  function train_loop (line 76) | def train_loop(args, pipe, device, train_data_loader, test_data_loader, ...
  function calculate_training_steps (line 264) | def calculate_training_steps(args, train_data_loader) -> int:
  function main (line 325) | def main():

FILE: training/dist_prefixlm_train.py
  function test_loop (line 21) | def test_loop(args, pipe, device, test_data_loader):
  function train_loop (line 25) | def train_loop(args, pipe, device, train_data_loader, test_data_loader):
  function main (line 190) | def main():

FILE: training/lora/example/redpajama-incite-chat-3b.py
  function print_trainable_parameters (line 37) | def print_trainable_parameters(model):

FILE: training/modules/deberta_modules.py
  function make_log_bucket_position (line 15) | def make_log_bucket_position(relative_pos, bucket_size, max_position):
  function build_relative_position (line 23) | def build_relative_position(query_size, key_size, bucket_size=-1, max_po...
  class DisentangledSelfAttention (line 35) | class DisentangledSelfAttention(nn.Module):
    method __init__ (line 37) | def __init__(self, config):
    method transpose_for_scores (line 75) | def transpose_for_scores(self, x, attention_heads):
    method forward (line 80) | def forward(
    method disentangled_attention_bias (line 135) | def disentangled_attention_bias(self, query_layer, key_layer, relative...
  class DebertaV2Layers (line 222) | class DebertaV2Layers(_DebertaV2Encoder):
    method __init__ (line 223) | def __init__(self, config, first_block=False):
    method get_rel_pos (line 261) | def get_rel_pos(self, hidden_states, query_states=None, relative_pos=N...
    method forward (line 269) | def forward(
  class DebertaClassificationHead (line 322) | class DebertaClassificationHead(nn.Module):
    method __init__ (line 323) | def __init__(self, config):
    method forward (line 335) | def forward(self, hidden_states, input_ids=None):

FILE: training/modules/dist_deberta_pp_module.py
  class DebertaStageBase (line 5) | class DebertaStageBase(nn.Module):
    method __init__ (line 6) | def __init__(self, args, config):
    method _create_first_layer (line 11) | def _create_first_layer(self):
    method _create_last_layer (line 14) | def _create_last_layer(self):
    method _create_transformer_layers (line 17) | def _create_transformer_layers(self, first_block=False):
  class DebertaStageFirst (line 21) | class DebertaStageFirst(DebertaStageBase):
    method __init__ (line 22) | def __init__(self, args, config, device):
    method forward (line 28) | def forward(self, x, token_type_ids=None, attention_mask=None):
  class DebertaStageMiddle (line 40) | class DebertaStageMiddle(DebertaStageBase):
    method __init__ (line 41) | def __init__(self, args, config, device):
    method forward (line 46) | def forward(self, x, attention_mask=None):
  class DebertaStageLast (line 55) | class DebertaStageLast(DebertaStageBase):
    method __init__ (line 56) | def __init__(self, args, config, device):
    method forward (line 62) | def forward(self, x, attention_mask=None, input_ids=None):

FILE: training/modules/dist_gpt_fsdp_module.py
  class GPTTransformerFsdpLayer (line 10) | class GPTTransformerFsdpLayer(torch.nn.Module):
    method __init__ (line 11) | def __init__(self, model_dim, head_num, feedforward_dim=2048, layer_no...
    method forward (line 32) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class GPTGlueFsdpModel (line 44) | class GPTGlueFsdpModel(torch.nn.Module):
    method __init__ (line 45) | def __init__(self, args, vocab_size, num_classes, use_checkpoint=True):
    method forward (line 56) | def forward(self, input_ids, position_ids):
  class GPTFsdpStageBase (line 62) | class GPTFsdpStageBase(torch.nn.Module):
    method __init__ (line 63) | def __init__(self, args, num_stage_layers, vocab_size, num_classes, us...
    method _create_first_layer (line 76) | def _create_first_layer(self):
    method _create_last_layer (line 84) | def _create_last_layer(self):
    method _create_fsdp_transformer_layer (line 92) | def _create_fsdp_transformer_layer(self):
  class GPTFsdpStageFirst (line 97) | class GPTFsdpStageFirst(GPTFsdpStageBase):
    method __init__ (line 98) | def __init__(self, args, num_stage_layers, vocab_size, num_classes, de...
    method forward (line 107) | def forward(self, x):
  class GPTFsdpStageMiddle (line 112) | class GPTFsdpStageMiddle(GPTFsdpStageBase):
    method __init__ (line 113) | def __init__(self, args, num_stage_layers, vocab_size, num_classes, de...
    method forward (line 122) | def forward(self, x):
  class GPTFsdpStageLast (line 127) | class GPTFsdpStageLast(GPTFsdpStageBase):
    method __init__ (line 128) | def __init__(self, args, num_stage_layers, vocab_size, num_classes, de...
    method forward (line 138) | def forward(self, x):

FILE: training/modules/dist_gpt_pp_module.py
  class GPTStageBase (line 8) | class GPTStageBase(nn.Module):
    method __init__ (line 9) | def __init__(self, args, config):
    method _create_first_layer (line 45) | def _create_first_layer(self):
    method _create_last_layer (line 60) | def _create_last_layer(self):
    method _create_transformer_layer (line 75) | def _create_transformer_layer(self, layer_idx=0):
  class GPTStageFull (line 92) | class GPTStageFull(GPTStageBase):
    method __init__ (line 93) | def __init__(self, args, config, device):
    method forward (line 105) | def forward(self, x, **kargs):
  class GPTStageFirst (line 111) | class GPTStageFirst(GPTStageBase):
    method __init__ (line 112) | def __init__(self, args, config, device):
    method forward (line 120) | def forward(self, x, **kargs):
  class GPTStageMiddle (line 128) | class GPTStageMiddle(GPTStageBase):
    method __init__ (line 129) | def __init__(self, args, config, device):
    method forward (line 137) | def forward(self, x, **kargs):
  class GPTStageLast (line 145) | class GPTStageLast(GPTStageBase):
    method __init__ (line 146) | def __init__(self, args, config, device):
    method forward (line 162) | def forward(self, x, **kargs):

FILE: training/modules/hf_gpt2_modules.py
  function gpt_loss_func (line 22) | def gpt_loss_func(input, target):
  class GPTEmbeddings (line 30) | class GPTEmbeddings(nn.Module):
    method __init__ (line 31) | def __init__(self, config):
    method forward (line 40) | def forward(self, input_ids, **kargs):
  class GPTAttention (line 61) | class GPTAttention(_GPT2Attention):
    method _attn (line 63) | def _attn(self, query, key, value, attention_mask=None, head_mask=None...
    method forward (line 110) | def forward(
  class GPTBlock (line 165) | class GPTBlock(_GPT2Block):
    method __init__ (line 166) | def __init__(self, config, layer_idx=None, use_checkpoint=True):
    method forward (line 194) | def forward(self, x: torch.Tensor, prefix_masks=None, **kargs) -> torc...
  class GPTModel (line 214) | class GPTModel(_GPT2Model):
    method __init__ (line 215) | def __init__(self, config):
    method forward (line 236) | def forward(self, input_ids, attention_mask=None, **kargs):
  class GPTLMHead (line 270) | class GPTLMHead(nn.Module):
    method __init__ (line 271) | def __init__(self, config):
    method forward (line 276) | def forward(self, x, **kargs):
  class GPTLMHeadModel (line 281) | class GPTLMHeadModel(_GPT2LMHeadModel):
    method __init__ (line 283) | def __init__(self, config):
  class GPTClassificationHead (line 296) | class GPTClassificationHead(nn.Module):
    method __init__ (line 297) | def __init__(self, config):
    method forward (line 303) | def forward(self, hidden_states, input_ids=None):
  class GPTForClassification (line 317) | class GPTForClassification(_GPT2ForSequenceClassification):
    method __init__ (line 319) | def __init__(self, config):

FILE: training/modules/hf_gptj_modules.py
  function gpt_loss_func (line 23) | def gpt_loss_func(input, target):
  function fixed_pos_embedding (line 31) | def fixed_pos_embedding(x, seq_dim=1, seq_len=None):
  class GPTJMLP (line 40) | class GPTJMLP(_GPTJMLP):
    method __init__ (line 41) | def __init__(self, intermediate_size, config, device='cpu'):  # in MLP...
  class GPTJAttention (line 52) | class GPTJAttention(_GPTJAttention):
    method __init__ (line 54) | def __init__(self, config, device='cpu'):
    method _attn (line 87) | def _attn(
    method forward (line 138) | def forward(
  class GPTEmbeddings (line 214) | class GPTEmbeddings(nn.Module):
    method __init__ (line 215) | def __init__(self, config, device='cpu'):
    method from_pretrained (line 223) | def from_pretrained(cls, model_path, config=None):
    method forward (line 236) | def forward(self, input_ids, *args, **kargs):
  class GPTBlock (line 245) | class GPTBlock(_GPTJBlock):
    method __init__ (line 246) | def __init__(self, config, *args, use_checkpoint=True, device='cpu', *...
    method from_pretrained (line 265) | def from_pretrained(cls, model_path, config=None, layer_index=None):
    method forward (line 280) | def forward(self, x: torch.Tensor, prefix_masks=None, layer_past=None,...
  class GPTLMHead (line 318) | class GPTLMHead(nn.Module):
    method __init__ (line 319) | def __init__(self, config, device='cpu'):
    method from_pretrained (line 325) | def from_pretrained(cls, model_path, config=None):
    method forward (line 338) | def forward(self, x, **kargs):

FILE: training/modules/hf_gptneox_modules.py
  class FlashAttentionV2 (line 31) | class FlashAttentionV2(nn.Module):
    method __init__ (line 41) | def __init__(self, softmax_scale=None, attention_dropout=0.0):
    method forward (line 46) | def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens...
  function rotate_half (line 73) | def rotate_half(x):
  function apply_rotary_pos_emb (line 80) | def apply_rotary_pos_emb(q, k, cos, sin, offset=0):
  class GPTNeoXAttention (line 98) | class GPTNeoXAttention(_GPTNeoXAttention):
    method __init__ (line 100) | def __init__(self, config):
    method forward (line 130) | def forward(
    method _attn (line 228) | def _attn(self, query, key, value, attention_mask=None, head_mask=None):
  class GPTEmbeddings (line 281) | class GPTEmbeddings(nn.Module):
    method __init__ (line 283) | def __init__(self, config):
    method from_pretrained (line 291) | def from_pretrained(cls, model_path, config=None):
    method forward (line 307) | def forward(self, input_ids, *args, **kargs):
  class GPTBlock (line 316) | class GPTBlock(_GPTNeoXBlock):
    method __init__ (line 318) | def __init__(self, config, *args, use_checkpoint=True, **kargs):
    method from_pretrained (line 355) | def from_pretrained(cls, model_path, config=None, layer_index=None):
    method forward (line 373) | def forward(self,
  class GPTLMHead (line 423) | class GPTLMHead(nn.Module):
    method __init__ (line 425) | def __init__(self, config):
    method from_pretrained (line 434) | def from_pretrained(cls, model_path, config=None):
    method forward (line 450) | def forward(self, x, *args, **kargs):

FILE: training/modules/hf_opt_modules.py
  function _make_causal_mask (line 15) | def _make_causal_mask(
  function _expand_mask (line 38) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
  function _prepare_decoder_attention_mask (line 51) | def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_...
  class GPTEmbeddings (line 72) | class GPTEmbeddings(nn.Module):
    method __init__ (line 73) | def __init__(self, config, device='cpu'):
    method from_pretrained (line 86) | def from_pretrained(cls, model_path, config=None):
    method forward (line 99) | def forward(self, input_ids, past_layer=None, mask=None, **kargs):
  class OPTAttention (line 143) | class OPTAttention(_OPTAttention):
    method __init__ (line 144) | def __init__(
    method _shape (line 172) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 175) | def forward(
  class GPTBlock (line 295) | class GPTBlock(OPTDecoderLayer):
    method __init__ (line 296) | def __init__(self, config, *args, use_checkpoint=True, device='cpu', *...
    method from_pretrained (line 362) | def from_pretrained(cls, model_path, config=None, layer_index=None):
    method forward (line 377) | def forward(self, x: torch.Tensor, layer_past=None, mask=None, *args, ...
  class GPTLMHead (line 446) | class GPTLMHead(nn.Module):
    method __init__ (line 447) | def __init__(self, config, device='cpu'):
    method from_pretrained (line 463) | def from_pretrained(cls, model_path, config=None):
    method forward (line 476) | def forward(self, x, input_ids=None, *args, **kargs):

FILE: training/modules/llama_modules.py
  class RotaryEmbedding (line 43) | class RotaryEmbedding(torch.nn.Module):
    method __init__ (line 61) | def __init__(
    method _compute_inv_freq (line 109) | def _compute_inv_freq(self, device=None):
    method _update_cos_sin_cache (line 118) | def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
    method forward (line 169) | def forward(
  class FlashAttentionV2 (line 237) | class FlashAttentionV2(nn.Module):
    method __init__ (line 248) | def __init__(self, softmax_scale=None, attention_dropout=0.0):
    method forward (line 253) | def forward(
  function _make_causal_mask (line 300) | def _make_causal_mask(
  function _make_causal_mask_device (line 321) | def _make_causal_mask_device(
  function _expand_mask (line 351) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
  function _prepare_decoder_attention_mask (line 367) | def _prepare_decoder_attention_mask(
  function rmsnorm_func (line 396) | def rmsnorm_func(hidden_states, weight, variance_epsilon):
  class RMSNorm (line 404) | class RMSNorm(nn.Module):
    method __init__ (line 405) | def __init__(self, hidden_size, eps=1e-6):
    method forward (line 417) | def forward(self, hidden_states):
  class LlamaMLP (line 421) | class LlamaMLP(nn.Module):
    method __init__ (line 422) | def __init__(
    method forward (line 434) | def forward(self, x):
  class LlamaAttention (line 438) | class LlamaAttention(nn.Module):
    method __init__ (line 441) | def __init__(
    method _shape (line 510) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 517) | def forward(
  class LlamaDecoderLayer (line 559) | class LlamaDecoderLayer(nn.Module):
    method __init__ (line 560) | def __init__(self, config: LlamaConfig):
    method forward (line 578) | def forward(
  class GPTEmbeddings (line 632) | class GPTEmbeddings(nn.Module):
    method __init__ (line 633) | def __init__(self, config, device="cpu"):
    method forward (line 643) | def forward(
  class GPTLMHead (line 654) | class GPTLMHead(nn.Module):
    method __init__ (line 655) | def __init__(self, config, device="cpu"):
    method forward (line 663) | def forward(
  class GPTBlock (line 676) | class GPTBlock(nn.Module):
    method __init__ (line 677) | def __init__(self, config: LlamaConfig, *args, **kargs):
    method forward (line 724) | def forward(

FILE: training/modules/task_modules.py
  class GlueClassification (line 4) | class GlueClassification(torch.nn.Module):
    method __init__ (line 5) | def __init__(self, model_dim, num_classes):
    method forward (line 12) | def forward(self, hidden_states, pooler_index=0):

FILE: training/modules/tokenizer.py
  function build_tokenizer (line 4) | def build_tokenizer(args):
  function build_gpt2_tokenizer (line 10) | def build_gpt2_tokenizer(args):
  function build_deberta_tokenizer (line 15) | def build_deberta_tokenizer(args):

FILE: training/modules/utils.py
  function gpt_loss_func (line 10) | def gpt_loss_func(input, target):

FILE: training/optimizer/grad_scalar.py
  class GradScaler (line 7) | class GradScaler(ABC):
    method __init__ (line 8) | def __init__(self, initial_scale, device=None):
    method scale (line 15) | def scale(self):
    method inv_scale (line 19) | def inv_scale(self):
    method update (line 23) | def update(self, found_inf):
    method state_dict (line 27) | def state_dict(self):
    method load_state_dict (line 31) | def load_state_dict(self, state_dict):
  class ConstantGradScaler (line 35) | class ConstantGradScaler(GradScaler):
    method update (line 37) | def update(self, found_inf):
    method state_dict (line 40) | def state_dict(self):
    method load_state_dict (line 43) | def load_state_dict(self, state_dict):
  class DynamicGradScaler (line 47) | class DynamicGradScaler(GradScaler):
    method __init__ (line 49) | def __init__(self, initial_scale, min_scale,
    method update (line 79) | def update(self, found_inf):
    method state_dict (line 102) | def state_dict(self):
    method load_state_dict (line 109) | def load_state_dict(self, state_dict):

FILE: training/optimizer/optimizer.py
  function _has_overflow_serial (line 7) | def _has_overflow_serial(grads):
  function _zero_grad_group (line 40) | def _zero_grad_group(group, set_to_none):
  class Fp16Optimizer (line 62) | class Fp16Optimizer:
    method __init__ (line 64) | def __init__(self, optimizer, grad_scaler, device, offload=False):
    method zero_grad (line 113) | def zero_grad(self, set_to_none=True):
    method get_loss_scale (line 120) | def get_loss_scale(self):
    method _copy_model_grads_to_optimizer_grads (line 123) | def _copy_model_grads_to_optimizer_grads(self):
    method _unscale_optimizer_grads_and_check_for_nan (line 138) | def _unscale_optimizer_grads_and_check_for_nan(self):
    method _get_model_and_optimizer_params_data_float16_deprecated (line 157) | def _get_model_and_optimizer_params_data_float16_deprecated(self):
    method _copy_optimizer_params_to_model_params (line 166) | def _copy_optimizer_params_to_model_params(self):
    method _copy_model_params_to_optimizer_params (line 179) | def _copy_model_params_to_optimizer_params(self):
    method reload_model_params (line 191) | def reload_model_params(self):
    method step (line 195) | def step(self):
    method scale (line 216) | def scale(self, z):
    method unscale (line 219) | def unscale(self, z):
    method state_dict (line 222) | def state_dict(self):
    method load_state_dict (line 225) | def load_state_dict(self, state_dict):
  function get_fp16_optimizer (line 229) | def get_fp16_optimizer(args, optimizer, device):

FILE: training/pipeline_parallel/dist_gpipe_pipeline_async.py
  function get_parameter_names (line 16) | def get_parameter_names(model, forbidden_layer_types):
  function create_optimizer (line 32) | def create_optimizer(model, optimizer_type, weight_decay=0.01, learning_...
  class GpipeAsync (line 67) | class GpipeAsync:
    method __init__ (line 78) | def __init__(self, args, config, device, use_dp=False,
    method _compute_micro_batch_size (line 225) | def _compute_micro_batch_size(self):
    method zero_input_grad (line 236) | def zero_input_grad(self):
    method profile_mark_forward_comp_start (line 242) | def profile_mark_forward_comp_start(self, i):
    method profile_mark_forward_recv_start (line 247) | def profile_mark_forward_recv_start(self, i):
    method profile_mark_forward_send_start (line 252) | def profile_mark_forward_send_start(self, i):
    method profile_mark_forward_send_end (line 257) | def profile_mark_forward_send_end(self, i):
    method profile_mark_backward_comp_start (line 262) | def profile_mark_backward_comp_start(self, i):
    method profile_mark_backward_recv_start (line 267) | def profile_mark_backward_recv_start(self, i):
    method profile_mark_backward_send_start (line 272) | def profile_mark_backward_send_start(self, i):
    method profile_mark_backward_send_end (line 277) | def profile_mark_backward_send_end(self, i):
    method get_ts (line 282) | def get_ts(self, event):
    method forward_stage (line 285) | def forward_stage(self, input_data=None, aux_input_data=None):
    method profiling_forward_stage (line 388) | def profiling_forward_stage(self):
    method backward_stage (line 417) | def backward_stage(self, cached_output_micro_batches: List[torch.Tenso...
    method profiling_backward_stage (line 521) | def profiling_backward_stage(self):
    method save_on_disk (line 549) | def save_on_disk(self, path):
    method optimizer_step (line 553) | def optimizer_step(self):
    method profiling_optimizer_step (line 574) | def profiling_optimizer_step(self):
    method export_profiling_result (line 587) | def export_profiling_result(self, filename):
    method sgd_iter (line 591) | def sgd_iter(self, input_=None, target=None,
    method infer_stage (line 656) | def infer_stage(self, input_data=None, aux_input_data=None,
    method infer_iter (line 736) | def infer_iter(self, input_=None, target=None,

FILE: training/pipeline_parallel/dist_pp_utils.py
  function get_pp_module (line 4) | def get_pp_module(args, config, device, use_dp):

FILE: training/tasks/data_loaders/data_utils.py
  function random_chunk (line 32) | def random_chunk(li, min_chunk=1, max_chunk=5):
  class UL2RProcessor (line 42) | class UL2RProcessor:
    method __init__ (line 48) | def __init__(self, tokenizer, seq_length=1024):
    method preprocess_tokens_s2s (line 59) | def preprocess_tokens_s2s(self, tokens):
    method preprocess_tokens_nlg (line 76) | def preprocess_tokens_nlg(self, tokens):
    method preprocess_tokens_nlu (line 98) | def preprocess_tokens_nlu(self, tokens):
    method preprocess_ul2r (line 136) | def preprocess_ul2r(self, inputs):
    method preprocess_random (line 146) | def preprocess_random(self, inputs):
    method __call__ (line 168) | def __call__(self, inputs):
  class StreamDataset (line 175) | class StreamDataset(IterableDataset):
    method __init__ (line 177) | def __init__(self, data, tokenizer, seq_length=1024, doc_separator=Non...
    method state_dict (line 187) | def state_dict(self):
    method load_state_dict (line 190) | def load_state_dict(self, state_dict):
    method get_sequence (line 193) | def get_sequence(self):
    method get_stream (line 208) | def get_stream(self):
    method __iter__ (line 214) | def __iter__(self):
  class StreamDatasetList (line 220) | class StreamDatasetList(IterableDataset):
    method __init__ (line 221) | def __init__(self, task_names, datasets, sample_probs, tokenizer, seq_...
    method state_dict (line 234) | def state_dict(self):
    method load_state_dict (line 237) | def load_state_dict(self, state_dict):
    method get_sequence (line 240) | def get_sequence(self):
    method get_stream (line 268) | def get_stream(self):
    method __iter__ (line 271) | def __iter__(self):
    method tokenize_function (line 276) | def tokenize_function(self, examples):
    method get_dataset_token_count (line 285) | def get_dataset_token_count(self) -> int:
    method get_dataset_example_count (line 314) | def get_dataset_example_count(self) -> int:
  function name_to_dataset (line 329) | def name_to_dataset(task, tokenizer, args):
  function name_to_dataset_eval (line 343) | def name_to_dataset_eval(task, tokenizer, args):
  function get_train_data_loader (line 352) | def get_train_data_loader(args, tokenizer, num_workers=1, state_dict=None):
  function get_eval_data_loader (line 407) | def get_eval_data_loader(args, tokenizer, num_workers=1, state_dict=None):
  function get_ul2r_train_data_loader (line 434) | def get_ul2r_train_data_loader(args, tokenizer, num_workers=1, state_dic...

FILE: training/tasks/data_loaders/prosocial.py
  class StreamDataset (line 14) | class StreamDataset(IterableDataset):
    method __init__ (line 15) | def __init__(self, dataset, tokenizer, seq_length=1024):
    method state_dict (line 25) | def state_dict(self):
    method load_state_dict (line 30) | def load_state_dict(self, state_dict):
    method get_sequence (line 34) | def get_sequence(self):
    method get_stream (line 66) | def get_stream(self):
    method __iter__ (line 69) | def __iter__(self):

FILE: training/utils/dist_args_utils.py
  function add_device_arguments (line 1) | def add_device_arguments(parser):
  function add_torch_distributed_arguments (line 12) | def add_torch_distributed_arguments(parser):
  function add_task_arguments (line 29) | def add_task_arguments(parser):
  function add_model_arguments (line 46) | def add_model_arguments(parser):
  function add_training_hyper_parameter_arguments (line 57) | def add_training_hyper_parameter_arguments(parser):
  function add_mixed_precision_arguments (line 72) | def add_mixed_precision_arguments(parser):
  function add_parallel_schema_arguments (line 90) | def add_parallel_schema_arguments(parser):
  function get_model_arguments_str (line 99) | def get_model_arguments_str(args):
  function get_dist_arguments_str (line 103) | def get_dist_arguments_str(args, add_rank=True):
  function get_learning_arguments_str (line 111) | def get_learning_arguments_str(args):
  function get_mixed_precision_arguments_str (line 115) | def get_mixed_precision_arguments_str(args):

FILE: training/utils/dist_checkpoint_utils.py
  function load_checkpoint (line 11) | def load_checkpoint(pipe, args):
  function save_checkpoint (line 64) | def save_checkpoint(pipe, args) -> str:
  function save_stream_dataloader_state_dict (line 107) | def save_stream_dataloader_state_dict(dataloader, pipe, args):
  function load_stream_dataloader_state_dict (line 121) | def load_stream_dataloader_state_dict(dataloader, pipe, args):

FILE: training/utils/dist_debug_utils.py
  function print_cuda_memory (line 4) | def print_cuda_memory(args, info: str, device=None):
  function print_multi_cuda_memory (line 12) | def print_multi_cuda_memory(args, info: str):

FILE: training/utils/event_report.py
  class EventReporter (line 28) | class EventReporter:
    method __init__ (line 75) | def __init__(self, host=None, auth_token=None, job_id=None):
    method is_enabled (line 80) | def is_enabled(self) -> bool:
    method report (line 114) | def report(self, object, message, event_type,
  function add_entry_reporter_arguments (line 188) | def add_entry_reporter_arguments(parser):
  function main (line 195) | def main():

FILE: training/utils/logging_utils.py
  function init_train_logger (line 19) | def init_train_logger(args):
  function train_log (line 46) | def train_log(x, *args, **kargs):

FILE: training/utils/upload_manager.py
  class UploadManager (line 11) | class UploadManager:
    method __init__ (line 12) | def __init__(self, aws_endpoint_url: str, aws_access_key_id: str,
    method add_task (line 43) | def add_task(self, directory: str, checkpoint_upload_prefix: str, step...
    method wait (line 58) | def wait(self):
    method _report_event (line 62) | def _report_event(self, **kwargs):
    method _wait_for_file_write_to_finish (line 66) | def _wait_for_file_write_to_finish(self, file_path: str, wait_start_ti...
    method _execute_task (line 81) | def _execute_task(self, directory, s3_bucket, s3_key_prefix, step: int):
  function add_aws_arguments (line 184) | def add_aws_arguments(parser: argparse.ArgumentParser):
  function aws_process_args (line 191) | def aws_process_args(args: argparse.Namespace, required: bool = False):
  function main (line 207) | def main():