SYMBOL INDEX (450 symbols across 35 files)

FILE: abacus.py
  class Abacus (line 5) | class Abacus(torch.nn.Module):
    method __init__ (line 11) | def __init__(self, digit_tokens, embedding_dim, max_seq_length=1024, m...
    method helper (line 24) | def helper(self, mask, device):
    method forward (line 53) | def forward(self, input_ids):

FILE: arithmetic_eval_quicker.py
  function grid_plotter (line 21) | def grid_plotter(data, type="accs", name='_large', extra_path=None):
  function index_hints_helper (line 44) | def index_hints_helper(num, tokenizer):
  function grid_logic (line 54) | def grid_logic(cfg):
  function main (line 199) | def main(cfg):
  function launch (line 509) | def launch(cfg):

FILE: cramming/__init__.py
  function get_config (line 23) | def get_config(overrides=[]):
  function get_model_config (line 31) | def get_model_config(arch="hf-bert-tiny", overrides=[]):
  function get_backend_config (line 39) | def get_backend_config(backend="torch-default", overrides=[]):

FILE: cramming/architectures/attention.py
  function get_attention_mechanism (line 11) | def get_attention_mechanism(idx, hidden_size, cfg_attention, norm_fn: to...
  class Identity (line 33) | class Identity(torch.nn.Module):
    method __init__ (line 39) | def __init__(self, hidden_size):
    method forward (line 43) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor...
  class RandomNoise (line 46) | class RandomNoise(torch.nn.Module):
    method __init__ (line 52) | def __init__(self, hidden_size):
    method forward (line 56) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor...
  class BertAttentionWrapper (line 60) | class BertAttentionWrapper(BertSelfAttention):
    method __init__ (line 66) | def __init__(self, hidden_size, cfg_attention):
    method forward (line 81) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor...
  class SelfAttentionPyTorch (line 85) | class SelfAttentionPyTorch(torch.nn.Module):
    method __init__ (line 91) | def __init__(self, hidden_size, cfg_attention):
    method forward (line 102) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor...
  class SeqFirstSelfAttentionPyTorch (line 113) | class SeqFirstSelfAttentionPyTorch(torch.nn.Module):
    method __init__ (line 119) | def __init__(self, hidden_size, cfg_attention):
    method forward (line 130) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor...
  class SeqFirstSelfAttention (line 141) | class SeqFirstSelfAttention(torch.nn.MultiheadAttention):
    method __init__ (line 155) | def __init__(self, hidden_size: int, cfg_attention, norm_module=torch....
    method attention (line 213) | def attention(self, query_layer, key_layer, value_layer, attention_mas...
    method forward (line 262) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor...
  class FourierMixing (line 302) | class FourierMixing(torch.nn.Module):
    method __init__ (line 311) | def __init__(self, hidden_size, cfg_attention):
    method forward (line 323) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor...
  class TorchSoftmax (line 348) | class TorchSoftmax(torch.nn.Module):
    method __init__ (line 349) | def __init__(self, seq_op_in_fp32=False):
    method forward (line 353) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
  class TorchShaped (line 363) | class TorchShaped(torch.nn.Module):
    method __init__ (line 366) | def __init__(self, seq_op_in_fp32=False, hidden_size=768):
    method forward (line 371) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
  class SwinCosine (line 384) | class SwinCosine(torch.nn.Module):
    method __init__ (line 387) | def __init__(self, seq_op_in_fp32=False, tau=0.1, eps=1e-8):
    method forward (line 393) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
  class TorchNormalize (line 408) | class TorchNormalize(torch.nn.Module):
    method __init__ (line 409) | def __init__(self, num_attention_heads=1, seq_op_in_fp32=False):
    method forward (line 416) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
  class ScaledIdentity (line 430) | class ScaledIdentity(torch.nn.Module):
    method __init__ (line 431) | def __init__(self, seq_op_in_fp32):
    method forward (line 435) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
  class Cumsum (line 443) | class Cumsum(torch.nn.Module):
    method __init__ (line 444) | def __init__(self, seq_op_in_fp32):
    method forward (line 448) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
  class CumsumExp (line 456) | class CumsumExp(torch.nn.Module):
    method __init__ (line 457) | def __init__(self, seq_op_in_fp32):
    method forward (line 461) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):

FILE: cramming/architectures/components.py
  class EmbeddingComponent (line 14) | class EmbeddingComponent(torch.nn.Module):
    method __init__ (line 16) | def __init__(self, cfg_embedding, norm, norm_eps):
    method forward (line 40) | def forward(self, input_ids):
  class PredictionHeadComponent (line 54) | class PredictionHeadComponent(torch.nn.Module):
    method __init__ (line 55) | def __init__(self, cfg_arch):
    method forward (line 67) | def forward(self, hidden_states):
  class NormalizedResidualConnection (line 72) | class NormalizedResidualConnection(torch.nn.Module):
    method __init__ (line 75) | def __init__(self, input_dim, cfg_arch, output_dim=None, dropout=0.0):
    method _simple_residual (line 108) | def _simple_residual(self, residual, layer, states, *args, **kwargs):
    method _prenormalization_residual (line 111) | def _prenormalization_residual(self, residual, layer, states, *args, *...
    method _postnormalization_residual (line 114) | def _postnormalization_residual(self, residual, layer, states, *args, ...
    method _deepnorm_residual (line 117) | def _deepnorm_residual(self, residual, layer, states, *args, **kwargs):
    method _prenorm_equalized_residual (line 120) | def _prenorm_equalized_residual(self, residual, layer, states, *args, ...
    method _sandwich_residual (line 123) | def _sandwich_residual(self, residual, layer, states, *args, **kwargs):
    method forward (line 126) | def forward(self, residual: torch.Tensor, layer_callable: torch.nn.Mod...
  function _get_norm_fn (line 136) | def _get_norm_fn(norm_name):
  function _get_nonlin_fn (line 150) | def _get_nonlin_fn(nonlin_name, use_gating=True):
  class GLU (line 169) | class GLU(torch.nn.Module):
    method __init__ (line 175) | def __init__(self, sub_activation):
    method forward (line 179) | def forward(self, inputs):
  class ScaleNorm (line 184) | class ScaleNorm(torch.nn.Module):
    method __init__ (line 191) | def __init__(self, hidden_size: int, eps: float = 1e-5, elementwise_af...
    method forward (line 199) | def forward(self, inputs):
  class RMSNorm (line 204) | class RMSNorm(torch.nn.Module):
    method __init__ (line 207) | def __init__(self, hidden_size: int, eps: float = 1e-6, elementwise_af...
    method _legacy_forward (line 215) | def _legacy_forward(self, inputs):
    method _norm (line 219) | def _norm(self, x):
    method forward (line 223) | def forward(self, x):
  function get_causal_attention_mask (line 228) | def get_causal_attention_mask(input_ids) -> torch.Tensor:
  function get_extended_attention_mask (line 239) | def get_extended_attention_mask(attention_mask: torch.Tensor, input_shap...
  function _init_module (line 292) | def _init_module(module, init_method="normal", init_std=0.02, hidden_siz...

FILE: cramming/architectures/construction.py
  function construct_model (line 14) | def construct_model(cfg_arch, tokenizer):

FILE: cramming/architectures/crammed_depthrecurrent.py
  class crammedDepthRecurrentConfig (line 21) | class crammedDepthRecurrentConfig(PretrainedConfig):
    method __init__ (line 24) | def __init__(self, cfg_arch_container: dict = {}, **kwargs):
  function construct_crammed_recurrent (line 29) | def construct_crammed_recurrent(cfg_arch, vocab_size, equals_token):
  class FFNComponent (line 44) | class FFNComponent(torch.nn.Module):
    method __init__ (line 51) | def __init__(self, hidden_size, intermed_size, cfg_arch, output_size=N...
    method forward (line 66) | def forward(self, hidden_states):
  class TransformerLayer (line 70) | class TransformerLayer(torch.nn.Module):
    method __init__ (line 73) | def __init__(self, idx, cfg_arch):
    method forward (line 85) | def forward(self, states, attention_mask: Optional[torch.Tensor] = None):
  class TransformerBlock (line 91) | class TransformerBlock(torch.nn.Module):
    method __init__ (line 94) | def __init__(self, layers, cfg_arch):
    method forward (line 104) | def forward(self, states, injected_state, attention_mask: Optional[tor...
  class TransposedAdapter (line 120) | class TransposedAdapter(torch.nn.Linear):  # steal init
    method __init__ (line 121) | def __init__(self, embedding_dim, hidden_size, original_adapter, tie_w...
    method forward (line 132) | def forward(self, inputs):
  class ScriptableRecurrentLM (line 136) | class ScriptableRecurrentLM(PreTrainedModel):
    method __init__ (line 141) | def __init__(self, config):
    method forward (line 172) | def forward(self, input_ids: torch.Tensor, num_steps_no_grad: int = No...
    method initialize_state (line 194) | def initialize_state(self, hidden_states):
  class ScriptableRecurrentLMReplicaConcat (line 210) | class ScriptableRecurrentLMReplicaConcat(PreTrainedModel):
    method __init__ (line 216) | def __init__(self, config):
    method apply_recurrent_block (line 258) | def apply_recurrent_block(self, hidden_states, injected_state, attenti...
    method forward (line 264) | def forward(self, input_ids: torch.Tensor, num_steps_no_grad: int = No...
    method initialize_state (line 286) | def initialize_state(self, hidden_states):
  function _generate (line 304) | def _generate(self, input_ids, token_limit=100, temperature=1.0, steps_a...
  class ScriptableRecurrentLMForPreTraining (line 372) | class ScriptableRecurrentLMForPreTraining(PreTrainedModel):
    method __init__ (line 377) | def __init__(self, config):
    method _init_weights (line 396) | def _init_weights(self, module=None):
    method forward (line 407) | def forward(self, input_ids: torch.Tensor, *args, **kwargs):
    method _generate (line 424) | def _generate(self, input_ids, token_limit=100, temperature=0.7, steps...
  class ScriptableRecurrentLMBPTT (line 428) | class ScriptableRecurrentLMBPTT(PreTrainedModel):
    method __init__ (line 433) | def __init__(self, config, equals_token):
    method _init_weights (line 463) | def _init_weights(self, module=None):
    method set_max_recurrences_for_training (line 474) | def set_max_recurrences_for_training(self, new_max):
    method forward (line 479) | def forward(self, input_ids: torch.Tensor, *args, **kwargs):
    method forward_progressive (line 493) | def forward_progressive(self, input_ids):
    method prog_model_call_with_masking (line 517) | def prog_model_call_with_masking(self, input_ids, n, k):
    method _generate (line 548) | def _generate(self, input_ids, token_limit=100, temperature=1.0, steps...

FILE: cramming/architectures/crammed_transformer.py
  class crammedTransformerConfig (line 21) | class crammedTransformerConfig(PretrainedConfig):
    method __init__ (line 24) | def __init__(self, cfg_arch_container: dict = {}, **kwargs):
  function construct_crammed_transformer (line 29) | def construct_crammed_transformer(cfg_arch, vocab_size):
  class FFNComponent (line 39) | class FFNComponent(torch.nn.Module):
    method __init__ (line 46) | def __init__(self, hidden_size, intermed_size, cfg_arch, output_size=N...
    method forward (line 61) | def forward(self, hidden_states):
  class TransformerLayer (line 65) | class TransformerLayer(torch.nn.Module):
    method __init__ (line 68) | def __init__(self, idx, cfg_arch):
    method forward (line 80) | def forward(self, states, attention_mask: Optional[torch.Tensor] = None):
  class ScriptableLM (line 86) | class ScriptableLM(PreTrainedModel):
    method __init__ (line 91) | def __init__(self, config):
    method forward (line 106) | def forward(self, input_ids: torch.Tensor):
  class ScriptableLMForPreTraining (line 124) | class ScriptableLMForPreTraining(PreTrainedModel):
    method __init__ (line 129) | def __init__(self, config):
    method _init_weights (line 141) | def _init_weights(self, module=None):
    method forward (line 152) | def forward(self, input_ids: torch.Tensor, *args, **kwargs):

FILE: cramming/architectures/embeddings.py
  class PositionalEmbedding (line 11) | class PositionalEmbedding(torch.nn.Module):
    method __init__ (line 13) | def __init__(self, demb):
    method forward (line 21) | def forward(self, pos_seq, bsz=None):
  class RandomNoise (line 35) | class RandomNoise(torch.nn.Module):
    method __init__ (line 37) | def __init__(self, embedding_dim, max_seq_length=5000):
    method forward (line 41) | def forward(self, input_ids):
  class RPE (line 45) | class RPE(torch.nn.Module):
    method __init__ (line 52) | def __init__(self, d_model, num_heads, max_len=1024, dropout=0.1):
    method forward (line 68) | def forward(self, x):
    method skew (line 105) | def skew(self, QEr):
  class SinusoidalPositional (line 118) | class SinusoidalPositional(torch.nn.Module):
    method __init__ (line 125) | def __init__(self, embedding_dim, max_seq_length=5000):
    method forward (line 137) | def forward(self, input_ids):
  class ScaledSinosoidal (line 150) | class ScaledSinosoidal(SinusoidalPositional):
    method __init__ (line 153) | def __init__(self, embedding_dim, max_seq_length):
    method forward (line 157) | def forward(self, input_ids):
  class LearnablePositional (line 170) | class LearnablePositional(torch.nn.Module):
    method __init__ (line 173) | def __init__(self, embedding_dim, max_seq_length=1024):
    method forward (line 178) | def forward(self, input_ids):
  class LearnablePositionalRand (line 184) | class LearnablePositionalRand(torch.nn.Module):
    method __init__ (line 187) | def __init__(self, embedding_dim, max_seq_length=1024):
    method forward (line 193) | def forward(self, input_ids):
  class Rotary (line 206) | class Rotary(torch.nn.Module):
    method __init__ (line 207) | def __init__(self, dim, base=10000, def_seq_length=128, seq_dim: int =...
    method get_cos_sin_cache (line 230) | def get_cos_sin_cache(self, x: torch.Tensor):
    method _get_cos_sin (line 239) | def _get_cos_sin(self):
    method forward (line 248) | def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor):
    method single_forward (line 253) | def single_forward(self, inputs: torch.Tensor):
    method rotate_half (line 258) | def rotate_half(self, x: torch.Tensor):
  class RotarySanityCheck (line 262) | class RotarySanityCheck(torch.nn.Module):
    method __init__ (line 265) | def __init__(self, dim, base=10000, def_seq_length=128, seq_dim: int =...
    method get_cos_sin_cache (line 276) | def get_cos_sin_cache(self, x: torch.Tensor):
    method _get_cos_sin (line 285) | def _get_cos_sin(self):
    method forward (line 294) | def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor):
    method rotate_half (line 300) | def rotate_half(self, x: torch.Tensor):
    method single_forward (line 305) | def single_forward(self, inputs: torch.Tensor):
  class RotaryEleutherAI (line 313) | class RotaryEleutherAI(torch.nn.Module):
    method __init__ (line 329) | def __init__(self, dim_model: int, *_, **__):
    method _update_cos_sin_tables (line 340) | def _update_cos_sin_tables(self, x: torch.Tensor, seq_dimension: int =...
    method forward (line 356) | def forward(self, q: torch.Tensor, k: torch.Tensor, seq_dimension: int...
    method rotate_half (line 365) | def rotate_half(self, x: torch.Tensor):
    method apply_rotary_pos_emb (line 371) | def apply_rotary_pos_emb(self, x: torch.Tensor, cos: torch.Tensor, sin...
  class RotaryLLAMA (line 383) | class RotaryLLAMA(torch.nn.Module):
    method __init__ (line 386) | def __init__(self, hidden_per_head, base=10000, max_seq_length=512, se...
    method forward (line 392) | def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor):
    method apply_rotary_emb (line 395) | def apply_rotary_emb(self, xq: torch.Tensor, xk: torch.Tensor, freqs_c...
    method reshape_for_broadcast (line 404) | def reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tens...
    method precompute_freqs_cis (line 412) | def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
  class FIRE (line 419) | class FIRE(torch.nn.Module):
    method __init__ (line 420) | def __init__(self, num_heads=12, mlp_width=32, init_c=0.1, init_L=512....
    method forward (line 446) | def forward(self, seq_length, device):
  class Abacus (line 482) | class Abacus(torch.nn.Module):
    method __init__ (line 485) | def __init__(self, embedding_dim, max_seq_length=1024, max_k=99):
    method helper (line 491) | def helper(self, mask, device):
    method forward (line 517) | def forward(self, input_ids):

FILE: cramming/architectures/huggingface_interface.py
  function construct_huggingface_model (line 6) | def construct_huggingface_model(cfg_arch, vocab_size):

FILE: cramming/architectures/losses.py
  class CosineLoss (line 5) | class CosineLoss(torch.nn.Module):
    method __init__ (line 9) | def __init__(self, reduction: str = "mean", dim=-1, eps=1e-8) -> None:
    method forward (line 16) | def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
  class CrossEntropyWithZLoss (line 20) | class CrossEntropyWithZLoss(torch.nn.Module):
    method __init__ (line 27) | def __init__(self, ignore_index=-100, z_loss_factor=1e-4):
    method forward (line 33) | def forward(self, inputs, labels):
  class MSELoss (line 44) | class MSELoss(torch.nn.Module):
    method __init__ (line 49) | def __init__(self, ignore_index=-100):
    method forward (line 54) | def forward(self, inputs, labels):
    method _label_to_onehot (line 64) | def _label_to_onehot(target, M: float = 1.0, num_classes: int = 100):
  class MSELossFast (line 70) | class MSELossFast(torch.nn.Module):
    method __init__ (line 75) | def __init__(self, ignore_index=-100):
    method forward (line 80) | def forward(self, inputs, labels):
  class L1Loss (line 94) | class L1Loss(torch.nn.Module):
    method __init__ (line 99) | def __init__(self, ignore_index=-100):
    method forward (line 104) | def forward(self, inputs, labels):
    method _label_to_onehot (line 114) | def _label_to_onehot(target, M: float = 1.0, num_classes: int = 100):
  class SzegedyLoss (line 120) | class SzegedyLoss(torch.nn.Module):
    method __init__ (line 126) | def __init__(self, embedding_layer, ignore_index=-100, overrelaxation=...
    method forward (line 133) | def forward(self, inputs, labels):
  class FocalLoss (line 173) | class FocalLoss(torch.nn.Module):
    method __init__ (line 174) | def __init__(self, gamma: float = 5.0, size_average: bool = True, igno...
    method forward (line 180) | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch....
  class IncorrectCrossEntropyLoss (line 191) | class IncorrectCrossEntropyLoss(torch.nn.CrossEntropyLoss):
    method forward (line 194) | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch....

FILE: cramming/architectures/sanity_check.py
  class SanityCheckforPreTraining (line 6) | class SanityCheckforPreTraining(torch.nn.Module):
    method __init__ (line 9) | def __init__(self, width, vocab_size):
    method forward (line 14) | def forward(

FILE: cramming/backend/optimizers/optimizer_modifiers.py
  class MetaOptimizer (line 14) | class MetaOptimizer(torch.optim.Optimizer):
    method __init__ (line 17) | def __init__(self, optimizer):
    method __getstate__ (line 21) | def __getstate__(self):
    method __setstate__ (line 24) | def __setstate__(self, state):
    method __repr__ (line 27) | def __repr__(self):
    method __getattr__ (line 30) | def __getattr__(self, name):
    method step (line 35) | def step(self, closure=None):
  class LARS (line 39) | class LARS(MetaOptimizer):
    method __init__ (line 66) | def __init__(self, optimizer, trust_coefficient=0.02, clip=False, eps=...
    method step (line 73) | def step(self, closure=None):
  class SAM (line 132) | class SAM(MetaOptimizer):
    method __init__ (line 133) | def __init__(self, base_optimizer_instance, rho=0.05):
    method first_step (line 141) | def first_step(self, zero_grad=False):
    method second_step (line 157) | def second_step(self, zero_grad=False):
    method step (line 170) | def step(self, closure=None):
    method _grad_norm (line 180) | def _grad_norm(self):

FILE: cramming/backend/optimizers/progressive_batching.py
  class ProgressiveBatching (line 17) | class ProgressiveBatching(MetaOptimizer):
    method __init__ (line 18) | def __init__(self, optimizer, progress_rule="norm-based", theta=0.9, m...
    method step (line 33) | def step(self):
    method inner_product_test (line 63) | def inner_product_test(self):
    method norm_test (line 85) | def norm_test(self):
    method cosine_test (line 100) | def cosine_test(self):
    method coefficient_of_variation (line 119) | def coefficient_of_variation(self):
    method update_sample_statistics (line 136) | def update_sample_statistics(self):
    method reset_sample_statistics (line 147) | def reset_sample_statistics(self):
    method copy_mean_grad (line 157) | def copy_mean_grad(self):

FILE: cramming/backend/optimizers/schedulers.py
  function get_schedule_fn (line 10) | def get_schedule_fn(cfg_train, elapsed_time: float=0.0, true_budget: flo...
  class DumbScheduler (line 213) | class DumbScheduler:
    method __init__ (line 214) | def __init__(self, *args, **kwargs):
    method step (line 217) | def step(self, *args, **kwargs):
    method _initial_step (line 220) | def _initial_step(self):
    method state_dict (line 225) | def state_dict(self):
    method load_state_dict (line 228) | def load_state_dict(self, state_dict):
    method get_last_lr (line 231) | def get_last_lr(self):
    method get_lr (line 235) | def get_lr(self):
    method print_lr (line 238) | def print_lr(self, is_verbose, group, lr, epoch=None):
  function get_inverse_sqrt_scheduler (line 245) | def get_inverse_sqrt_scheduler(optimizer, num_warmup_steps, num_cooldown...
  function get_one_cycle (line 276) | def get_one_cycle(optimizer, num_training_steps):
  function get_ramp (line 288) | def get_ramp(optimizer, num_cooldown_steps, num_training_steps):
  function _get_fake_step (line 302) | def _get_fake_step(current_step, initial_time, hour_budget, num_training...
  function get_budget_inv_sqrt_scheduler (line 312) | def get_budget_inv_sqrt_scheduler(optimizer, hour_budget, num_warmup_ste...
  function get_budget_constant_scheduler (line 335) | def get_budget_constant_scheduler(optimizer, hour_budget, num_warmup_ste...
  function get_budget_linear_schedule_with_warmup (line 351) | def get_budget_linear_schedule_with_warmup(optimizer, hour_budget, num_w...
  function get_budget_cosine_schedule_with_warmup (line 364) | def get_budget_cosine_schedule_with_warmup(optimizer, hour_budget, num_w...
  function get_budget_cosine_half_cycles_with_warmup (line 378) | def get_budget_cosine_half_cycles_with_warmup(optimizer, hour_budget, nu...
  function get_budget_one_cycle (line 392) | def get_budget_one_cycle(optimizer, hour_budget, num_training_steps, ela...
  function get_budget_multi_cycle (line 406) | def get_budget_multi_cycle(optimizer, hour_budget, num_training_steps, n...
  function get_budget_ramp (line 421) | def get_budget_ramp(optimizer, hour_budget, num_cooldown_steps, num_trai...
  function get_budget_inv_cosine_schedule (line 436) | def get_budget_inv_cosine_schedule(optimizer, hour_budget, num_cooldown_...
  function get_budget_triangle (line 454) | def get_budget_triangle(optimizer, hour_budget, num_training_steps, base...
  function get_budget_dive (line 471) | def get_budget_dive(optimizer, hour_budget, num_training_steps, num_warm...
  function get_budget_polynomial_decay_with_warmup (line 487) | def get_budget_polynomial_decay_with_warmup(optimizer, hour_budget, num_...

FILE: cramming/backend/prepare_backend.py
  function load_backend (line 9) | def load_backend(model, tokenizer, cfg_train, cfg_impl, setup=_default_s...

FILE: cramming/backend/torch_default.py
  function initialize_torch (line 41) | def initialize_torch(model, tokenizer, cfg_train, cfg_impl, setup=_defau...
  class TorchEngine (line 55) | class TorchEngine(torch.nn.Module):
    method __init__ (line 58) | def __init__(self, model, cfg_train, cfg_impl, setup=_default_setup, s...
    method get_true_budget (line 125) | def get_true_budget(self):
    method step (line 131) | def step(self, batch: dict[str, torch.Tensor]):
    method to_device (line 137) | def to_device(self, batch: dict[str, torch.Tensor], keys: list[str] = ...
    method forward (line 146) | def forward(self, *inputs, **kwargs):
    method backward (line 153) | def backward(self, loss):
    method forward_inference (line 160) | def forward_inference(self, *inputs, **kwargs):
    method dynamic_generation (line 168) | def dynamic_generation(self, *inputs, temperature=0.7, token_limit=100...
    method optimizer_step (line 193) | def optimizer_step(self):
    method set_train_batch_size (line 209) | def set_train_batch_size(self, batch_size):
    method schedule_batch_size (line 214) | def schedule_batch_size(self):
    method record_batch_size (line 235) | def record_batch_size(self):
    method record_tokens_per_step (line 241) | def record_tokens_per_step(self):
    method retrieve_model_state_dict (line 246) | def retrieve_model_state_dict(self):
    method _init_distributed (line 261) | def _init_distributed(self, model):
    method load_checkpoint (line 273) | def load_checkpoint(self, cfg_arch, file, skip_optim_state=False) -> D...
    method load_metadata (line 332) | def load_metadata(self, metadata: Dict[str, Any]):
    method save_training_checkpoint (line 337) | def save_training_checkpoint(self, checkpoint_directory: str, checkpoi...
    method save_final_model (line 360) | def save_final_model(self, base_directory, identifier, tokenizer, cfg_...
    method save_model (line 378) | def save_model(
    method push_to_hub (line 420) | def push_to_hub(self, tokenizer, cfg, dryrun=False):
  function _load_optimizer (line 476) | def _load_optimizer(model, cfg_train, cfg_impl, elapsed_time=0.0, true_b...

FILE: cramming/backend/utils.py
  function group_parameters (line 14) | def group_parameters(model, cfg_train):
  function get_model_engine_tokenizer_dataloaders (line 32) | def get_model_engine_tokenizer_dataloaders(cfg, setup, train_eval: bool ...
  function load_model_checkpoint (line 113) | def load_model_checkpoint(model, model_dir, forward_only_model_with_skip...

FILE: cramming/data/arithmetic_tokenizers.py
  class CustomCharLevelTokenizerForAddingPadding (line 11) | class CustomCharLevelTokenizerForAddingPadding(PreTrainedTokenizer):
    method __init__ (line 13) | def __init__(self, **kwargs):
    method vocab_size (line 46) | def vocab_size(self):
    method get_vocab (line 49) | def get_vocab(self):
    method _tokenize (line 52) | def _tokenize(self, text):
    method _convert_token_to_id (line 59) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 62) | def _convert_id_to_token(self, index):
    method __call__ (line 66) | def __call__(self, text, **kwargs):
    method decode (line 72) | def decode(self, token_ids, **kwargs):
  class CustomCharLevelTokenizerForAddingPaddingWithIndexHints (line 78) | class CustomCharLevelTokenizerForAddingPaddingWithIndexHints(PreTrainedT...
    method __init__ (line 80) | def __init__(self, **kwargs):
    method vocab_size (line 115) | def vocab_size(self):
    method get_vocab (line 118) | def get_vocab(self):
    method _tokenize (line 121) | def _tokenize(self, text):
    method _convert_token_to_id (line 128) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 131) | def _convert_id_to_token(self, index):
    method __call__ (line 135) | def __call__(self, text, **kwargs):
    method decode (line 141) | def decode(self, token_ids, **kwargs):
  class CustomCharLevelTokenizerSort (line 147) | class CustomCharLevelTokenizerSort(PreTrainedTokenizer):
    method __init__ (line 149) | def __init__(self, **kwargs):
    method vocab_size (line 190) | def vocab_size(self):
    method get_vocab (line 193) | def get_vocab(self):
    method _tokenize (line 196) | def _tokenize(self, text):
    method _convert_token_to_id (line 202) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 205) | def _convert_id_to_token(self, index):
    method __call__ (line 209) | def __call__(self, text, **kwargs):
    method decode (line 215) | def decode(self, token_ids, **kwargs):

FILE: cramming/data/curriculum_sorting.py
  function _sort_tokenized_dataset_by_unigram (line 10) | def _sort_tokenized_dataset_by_unigram(tokenized_dataset, tokenizer, num...
  function _sort_tokenized_dataset_by_token (line 53) | def _sort_tokenized_dataset_by_token(tokenized_dataset, tokenizer, targe...
  function _sort_tokenized_dataset_by_word_length (line 92) | def _sort_tokenized_dataset_by_word_length(tokenized_dataset, tokenizer,...

FILE: cramming/data/deduplicate.py
  function deduplicate_huggingface_dataset (line 40) | def deduplicate_huggingface_dataset(dataset, threshold=100, original_cwd...
  function _write_tmp_file (line 60) | def _write_tmp_file(dataset, dirname):
  function _make_suffix_array (line 69) | def _make_suffix_array(text_file, tmpdir, path_to_rust_code):
  function _finish_and_return_to_hf_dataset (line 139) | def _finish_and_return_to_hf_dataset(original_text_file, remove_file_cac...

FILE: cramming/data/pretraining_preparation.py
  function get_num_workers (line 41) | def get_num_workers(cfg_impl):
  function load_pretraining_corpus (line 50) | def load_pretraining_corpus(cfg_data, cfg_impl, data_dir: str = None):
  function load_tokenized_data (line 147) | def load_tokenized_data(tokenized_dataset_path):
  function convert_to_hf_dataset (line 151) | def convert_to_hf_dataset(tokenized_data):
  function preprocess_dataset (line 162) | def preprocess_dataset(cfg_data, download_path, num_threads=1, max_raw_c...
  function _move_stream_to_fixed_map (line 233) | def _move_stream_to_fixed_map(raw_data_streamed, max_entries_in_raw_data...
  function _huggingface_preprocessing (line 263) | def _huggingface_preprocessing(raw_dataset, tokenizer, cfg_data, num_thr...
  function _load_fake_dataset (line 357) | def _load_fake_dataset(cfg_data, details, path=None):
  function _concatenate_entries (line 366) | def _concatenate_entries(dataset, num_entries_in_group, num_threads):
  function raw_dataset_preprocessing (line 402) | def raw_dataset_preprocessing(raw_dataset, num_threads, cfg_data):
  function main_process_first (line 454) | def main_process_first():
  function _load_from_hub (line 478) | def _load_from_hub(cfg_data, data_path):
  function prepare_dataloaders (line 498) | def prepare_dataloaders(datasets, tokenizer, cfg_train, cfg_impl) -> Dic...
  function prepare_pretraining_dataloader (line 506) | def prepare_pretraining_dataloader(dataset, tokenizer, cfg_train, cfg_im...
  function prepare_validation_dataloader (line 554) | def prepare_validation_dataloader(dataset, tokenizer, cfg_impl):
  class FastDataCollatorForLanguageModeling (line 586) | class FastDataCollatorForLanguageModeling(transformers.DataCollatorForLa...
    method __init__ (line 587) | def __init__(self, *args, create_labels_entry=False, **kwargs):
    method torch_call (line 592) | def torch_call(self, examples):
  class InfiniteDataLoader (line 616) | class InfiniteDataLoader(torch.utils.data.DataLoader):
    method __init__ (line 619) | def __init__(self, *args, **kwargs):
    method __iter__ (line 625) | def __iter__(self):
    method __next__ (line 628) | def __next__(self):
    method set_epoch (line 640) | def set_epoch(self, epoch: int):
  class RuntimeInfiniteDataLoader (line 643) | class RuntimeInfiniteDataLoader(torch.utils.data.DataLoader):
    method __init__ (line 646) | def __init__(self, tokenizer, device, *args, **kwargs):
    method get_arithmetic (line 661) | def get_arithmetic(self, n, m):
    method tokenize_batch (line 685) | def tokenize_batch(self, batch):
    method __iter__ (line 696) | def __iter__(self):
    method __next__ (line 699) | def __next__(self):

FILE: cramming/data/tokenizer_preparation.py
  function get_tokenizer (line 12) | def get_tokenizer(tokenizer_type: str):
  function load_tokenizer (line 27) | def load_tokenizer(tokenizer_path_or_name, seq_length=512, vocab_size=No...
  function construct_tokenizer (line 38) | def construct_tokenizer(raw_datasets, cfg_data, path, known_tokens=[]):
  function _download_tokenizer (line 48) | def _download_tokenizer(tokenizer_path_or_name, seq_length, cache_dir=No...
  function _get_sane_token_args (line 57) | def _get_sane_token_args():
  function _get_sane_normalizers (line 67) | def _get_sane_normalizers(force_english_keyboard=False, force_lowercase=...
  function _construct_tokenizer (line 85) | def _construct_tokenizer(raw_datasets, cfg_data, known_tokens=[]):

FILE: cramming/data/utils.py
  function checksum_config (line 17) | def checksum_config(cfg):
  function stage_dataset (line 26) | def stage_dataset(data_directory_path, local_staging_dir):
  function _get_size (line 65) | def _get_size(start_path="."):
  function detailed_OSError (line 78) | def detailed_OSError(e):

FILE: cramming/utils.py
  function main_launcher (line 37) | def main_launcher(cfg, main_fn, job_name=""):
  function get_cpus (line 78) | def get_cpus() -> int:
  function system_startup (line 90) | def system_startup(cfg):
  function is_main_process (line 184) | def is_main_process():
  function num_processes (line 188) | def num_processes():
  function find_pretrained_checkpoint (line 194) | def find_pretrained_checkpoint(checkpoint: str, local_checkpoint_folder:...
  function save_summary (line 256) | def save_summary(table_name, cfg, stats, local_time, setup, original_cwd...
  function save_to_table (line 334) | def save_to_table(out_dir, table_name, dryrun, **kwargs):
  function set_random_seed (line 367) | def set_random_seed(seed=233):
  function set_deterministic (line 378) | def set_deterministic():
  function avg_n_dicts (line 386) | def avg_n_dicts(dicts):
  function dump_metrics (line 406) | def dump_metrics(cfg, metrics):
  function _initialize_wandb (line 420) | def _initialize_wandb(setup, cfg):
  function wandb_log (line 440) | def wandb_log(stats, cfg):
  function flatten (line 448) | def flatten(d, parent_key="", sep="_"):
  function collect_system_metrics (line 460) | def collect_system_metrics(cfg, metrics, kWh_counter, setup):
  function get_kWh (line 479) | def get_kWh(kWh_counter, setup):
  function pathfinder (line 486) | def pathfinder(cfg):

FILE: create_data_split.py
  function generate_no_carry_addition (line 20) | def generate_no_carry_addition(n, m):
  function has_carry (line 31) | def has_carry(num1, num2):
  function generate_dataset (line 39) | def generate_dataset(dir_name, operation, n, m, num_examples, base_folde...
  function tokenize_and_save_dataset (line 128) | def tokenize_and_save_dataset(dataset, tokenizer, directory, test_split_...
  function character_histogram (line 190) | def character_histogram(dir_name, condense_white_space=False):
  function token_histogram (line 238) | def token_histogram(dir_name, tokenizer_type="normal"):
  function main_dataset_gen (line 296) | def main_dataset_gen(dir_name, op, n, m, num_samples, exact=False, keep_...
  function tokenize_main (line 305) | def tokenize_main(dir_name, tokenizer_type, test_split_ratio=0.05):
  function pick_char_set (line 334) | def pick_char_set(max_len):
  function hints_helper (line 346) | def hints_helper(num_str, chars):
  function bucket_method_gen (line 353) | def bucket_method_gen(n=3, m=3, operation='+', limit=1000, p=0, no_carry...
  function bucket_method_main (line 412) | def bucket_method_main(n, m, operation, limit, dir_name, p=0, no_carry_a...
  function uniform_distribution_sort_basic (line 437) | def uniform_distribution_sort_basic(maximum_number_of_digts, maximum_len...
  function bucket_uniform_distribution (line 467) | def bucket_uniform_distribution(maximum_number_of_digts, maximum_length,...
  function uniform_distribution_sort_main (line 476) | def uniform_distribution_sort_main(FLAGS, dir_name):
  function main (line 507) | def main():

FILE: create_pos_or_variants.py
  function one_hot_vector (line 6) | def one_hot_vector(length, index=None):
  function zero_vector (line 14) | def zero_vector(length):
  function main (line 19) | def main():

FILE: dataset_analysis.py
  function read_dataset (line 9) | def read_dataset(dir_name, condense_white_space=False):
  function remove_leading_zeros (line 26) | def remove_leading_zeros(match):
  function count_digits (line 30) | def count_digits(dataset, remove_formatting=False):
  function plot_pairs_heatmap (line 53) | def plot_pairs_heatmap(pairs, dir_name=".", remove_formatting=False):
  function line_plotter (line 73) | def line_plotter(data, name, dir_name=".", remove_formatting=False):
  function consecutive_digit_counts (line 89) | def consecutive_digit_counts(input_strings):
  function create_repetition_heatmap (line 119) | def create_repetition_heatmap(data, dir_name=".", remove_formatting=False):
  function main (line 134) | def main(dir_name):

FILE: load_local_model.py
  function main_load_process (line 23) | def main_load_process(cfg, setup):
  function launch (line 42) | def launch(cfg):

FILE: pretrain.py
  function main_training_process (line 17) | def main_training_process(cfg, setup):
  function get_time_elapsed (line 187) | def get_time_elapsed(start_time: float, additional_time: float = 0.0) ->...
  function check_checkpointing (line 190) | def check_checkpointing(data_idx: int, cfg_impl, last_save_time) -> bool:
  function check_deadline (line 196) | def check_deadline(launch_time, hour_limit, prev_budget: float = 0.0, ov...
  function check_early_termination (line 205) | def check_early_termination(start_time, loss, early_termination, prev_bu...
  function collect_stats (line 217) | def collect_stats(data_step, loss_vals, log_ppls, model_outputs, train_t...
  function validate (line 265) | def validate(model_engine, validloader, setup, cfg):
  function generate (line 313) | def generate(model_engine, tokenizer, example_prompts, token_limit=10, t...
  function flag_communication (line 331) | def flag_communication(training_allowed):
  function launch (line 345) | def launch(cfg):

FILE: pretty_plotter.py
  function find_file (line 12) | def find_file(starting_directory, target_file):
  function grid_plotter (line 18) | def grid_plotter(data, type="accs", path="", title=None, rect_size=20, u...
  function main (line 44) | def main():

FILE: pretty_plotter_big.py
  function grid_plotter (line 14) | def grid_plotter(data, type="accs", path="", title=None, rect_size=20):
  function main (line 40) | def main():

FILE: pretty_plotter_sort.py
  function grid_plotter (line 8) | def grid_plotter(data, title="", path=None):
  function run (line 31) | def run(names, short_hand, base_dir, sort_plots_path):

FILE: sort_eval.py
  function grid_plotter (line 21) | def grid_plotter(data, type="accs", name='_large', extra_path=None):
  function grid_logic (line 44) | def grid_logic(cfg):
  function main (line 189) | def main(cfg):
  function launch (line 375) | def launch(cfg):

FILE: upload_processed_dataset.py
  function upload (line 18) | def upload(cfg, setup):
  function launch (line 81) | def launch(cfg):