SYMBOL INDEX (450 symbols across 35 files) FILE: abacus.py class Abacus (line 5) | class Abacus(torch.nn.Module): method __init__ (line 11) | def __init__(self, digit_tokens, embedding_dim, max_seq_length=1024, m... method helper (line 24) | def helper(self, mask, device): method forward (line 53) | def forward(self, input_ids): FILE: arithmetic_eval_quicker.py function grid_plotter (line 21) | def grid_plotter(data, type="accs", name='_large', extra_path=None): function index_hints_helper (line 44) | def index_hints_helper(num, tokenizer): function grid_logic (line 54) | def grid_logic(cfg): function main (line 199) | def main(cfg): function launch (line 509) | def launch(cfg): FILE: cramming/__init__.py function get_config (line 23) | def get_config(overrides=[]): function get_model_config (line 31) | def get_model_config(arch="hf-bert-tiny", overrides=[]): function get_backend_config (line 39) | def get_backend_config(backend="torch-default", overrides=[]): FILE: cramming/architectures/attention.py function get_attention_mechanism (line 11) | def get_attention_mechanism(idx, hidden_size, cfg_attention, norm_fn: to... class Identity (line 33) | class Identity(torch.nn.Module): method __init__ (line 39) | def __init__(self, hidden_size): method forward (line 43) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor... class RandomNoise (line 46) | class RandomNoise(torch.nn.Module): method __init__ (line 52) | def __init__(self, hidden_size): method forward (line 56) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor... class BertAttentionWrapper (line 60) | class BertAttentionWrapper(BertSelfAttention): method __init__ (line 66) | def __init__(self, hidden_size, cfg_attention): method forward (line 81) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor... class SelfAttentionPyTorch (line 85) | class SelfAttentionPyTorch(torch.nn.Module): method __init__ (line 91) | def __init__(self, hidden_size, cfg_attention): method forward (line 102) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor... class SeqFirstSelfAttentionPyTorch (line 113) | class SeqFirstSelfAttentionPyTorch(torch.nn.Module): method __init__ (line 119) | def __init__(self, hidden_size, cfg_attention): method forward (line 130) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor... class SeqFirstSelfAttention (line 141) | class SeqFirstSelfAttention(torch.nn.MultiheadAttention): method __init__ (line 155) | def __init__(self, hidden_size: int, cfg_attention, norm_module=torch.... method attention (line 213) | def attention(self, query_layer, key_layer, value_layer, attention_mas... method forward (line 262) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor... class FourierMixing (line 302) | class FourierMixing(torch.nn.Module): method __init__ (line 311) | def __init__(self, hidden_size, cfg_attention): method forward (line 323) | def forward(self, hidden_states, attention_mask: Optional[torch.Tensor... class TorchSoftmax (line 348) | class TorchSoftmax(torch.nn.Module): method __init__ (line 349) | def __init__(self, seq_op_in_fp32=False): method forward (line 353) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None): class TorchShaped (line 363) | class TorchShaped(torch.nn.Module): method __init__ (line 366) | def __init__(self, seq_op_in_fp32=False, hidden_size=768): method forward (line 371) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None): class SwinCosine (line 384) | class SwinCosine(torch.nn.Module): method __init__ (line 387) | def __init__(self, seq_op_in_fp32=False, tau=0.1, eps=1e-8): method forward (line 393) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None): class TorchNormalize (line 408) | class TorchNormalize(torch.nn.Module): method __init__ (line 409) | def __init__(self, num_attention_heads=1, seq_op_in_fp32=False): method forward (line 416) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None): class ScaledIdentity (line 430) | class ScaledIdentity(torch.nn.Module): method __init__ (line 431) | def __init__(self, seq_op_in_fp32): method forward (line 435) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None): class Cumsum (line 443) | class Cumsum(torch.nn.Module): method __init__ (line 444) | def __init__(self, seq_op_in_fp32): method forward (line 448) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None): class CumsumExp (line 456) | class CumsumExp(torch.nn.Module): method __init__ (line 457) | def __init__(self, seq_op_in_fp32): method forward (line 461) | def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None): FILE: cramming/architectures/components.py class EmbeddingComponent (line 14) | class EmbeddingComponent(torch.nn.Module): method __init__ (line 16) | def __init__(self, cfg_embedding, norm, norm_eps): method forward (line 40) | def forward(self, input_ids): class PredictionHeadComponent (line 54) | class PredictionHeadComponent(torch.nn.Module): method __init__ (line 55) | def __init__(self, cfg_arch): method forward (line 67) | def forward(self, hidden_states): class NormalizedResidualConnection (line 72) | class NormalizedResidualConnection(torch.nn.Module): method __init__ (line 75) | def __init__(self, input_dim, cfg_arch, output_dim=None, dropout=0.0): method _simple_residual (line 108) | def _simple_residual(self, residual, layer, states, *args, **kwargs): method _prenormalization_residual (line 111) | def _prenormalization_residual(self, residual, layer, states, *args, *... method _postnormalization_residual (line 114) | def _postnormalization_residual(self, residual, layer, states, *args, ... method _deepnorm_residual (line 117) | def _deepnorm_residual(self, residual, layer, states, *args, **kwargs): method _prenorm_equalized_residual (line 120) | def _prenorm_equalized_residual(self, residual, layer, states, *args, ... method _sandwich_residual (line 123) | def _sandwich_residual(self, residual, layer, states, *args, **kwargs): method forward (line 126) | def forward(self, residual: torch.Tensor, layer_callable: torch.nn.Mod... function _get_norm_fn (line 136) | def _get_norm_fn(norm_name): function _get_nonlin_fn (line 150) | def _get_nonlin_fn(nonlin_name, use_gating=True): class GLU (line 169) | class GLU(torch.nn.Module): method __init__ (line 175) | def __init__(self, sub_activation): method forward (line 179) | def forward(self, inputs): class ScaleNorm (line 184) | class ScaleNorm(torch.nn.Module): method __init__ (line 191) | def __init__(self, hidden_size: int, eps: float = 1e-5, elementwise_af... method forward (line 199) | def forward(self, inputs): class RMSNorm (line 204) | class RMSNorm(torch.nn.Module): method __init__ (line 207) | def __init__(self, hidden_size: int, eps: float = 1e-6, elementwise_af... method _legacy_forward (line 215) | def _legacy_forward(self, inputs): method _norm (line 219) | def _norm(self, x): method forward (line 223) | def forward(self, x): function get_causal_attention_mask (line 228) | def get_causal_attention_mask(input_ids) -> torch.Tensor: function get_extended_attention_mask (line 239) | def get_extended_attention_mask(attention_mask: torch.Tensor, input_shap... function _init_module (line 292) | def _init_module(module, init_method="normal", init_std=0.02, hidden_siz... FILE: cramming/architectures/construction.py function construct_model (line 14) | def construct_model(cfg_arch, tokenizer): FILE: cramming/architectures/crammed_depthrecurrent.py class crammedDepthRecurrentConfig (line 21) | class crammedDepthRecurrentConfig(PretrainedConfig): method __init__ (line 24) | def __init__(self, cfg_arch_container: dict = {}, **kwargs): function construct_crammed_recurrent (line 29) | def construct_crammed_recurrent(cfg_arch, vocab_size, equals_token): class FFNComponent (line 44) | class FFNComponent(torch.nn.Module): method __init__ (line 51) | def __init__(self, hidden_size, intermed_size, cfg_arch, output_size=N... method forward (line 66) | def forward(self, hidden_states): class TransformerLayer (line 70) | class TransformerLayer(torch.nn.Module): method __init__ (line 73) | def __init__(self, idx, cfg_arch): method forward (line 85) | def forward(self, states, attention_mask: Optional[torch.Tensor] = None): class TransformerBlock (line 91) | class TransformerBlock(torch.nn.Module): method __init__ (line 94) | def __init__(self, layers, cfg_arch): method forward (line 104) | def forward(self, states, injected_state, attention_mask: Optional[tor... class TransposedAdapter (line 120) | class TransposedAdapter(torch.nn.Linear): # steal init method __init__ (line 121) | def __init__(self, embedding_dim, hidden_size, original_adapter, tie_w... method forward (line 132) | def forward(self, inputs): class ScriptableRecurrentLM (line 136) | class ScriptableRecurrentLM(PreTrainedModel): method __init__ (line 141) | def __init__(self, config): method forward (line 172) | def forward(self, input_ids: torch.Tensor, num_steps_no_grad: int = No... method initialize_state (line 194) | def initialize_state(self, hidden_states): class ScriptableRecurrentLMReplicaConcat (line 210) | class ScriptableRecurrentLMReplicaConcat(PreTrainedModel): method __init__ (line 216) | def __init__(self, config): method apply_recurrent_block (line 258) | def apply_recurrent_block(self, hidden_states, injected_state, attenti... method forward (line 264) | def forward(self, input_ids: torch.Tensor, num_steps_no_grad: int = No... method initialize_state (line 286) | def initialize_state(self, hidden_states): function _generate (line 304) | def _generate(self, input_ids, token_limit=100, temperature=1.0, steps_a... class ScriptableRecurrentLMForPreTraining (line 372) | class ScriptableRecurrentLMForPreTraining(PreTrainedModel): method __init__ (line 377) | def __init__(self, config): method _init_weights (line 396) | def _init_weights(self, module=None): method forward (line 407) | def forward(self, input_ids: torch.Tensor, *args, **kwargs): method _generate (line 424) | def _generate(self, input_ids, token_limit=100, temperature=0.7, steps... class ScriptableRecurrentLMBPTT (line 428) | class ScriptableRecurrentLMBPTT(PreTrainedModel): method __init__ (line 433) | def __init__(self, config, equals_token): method _init_weights (line 463) | def _init_weights(self, module=None): method set_max_recurrences_for_training (line 474) | def set_max_recurrences_for_training(self, new_max): method forward (line 479) | def forward(self, input_ids: torch.Tensor, *args, **kwargs): method forward_progressive (line 493) | def forward_progressive(self, input_ids): method prog_model_call_with_masking (line 517) | def prog_model_call_with_masking(self, input_ids, n, k): method _generate (line 548) | def _generate(self, input_ids, token_limit=100, temperature=1.0, steps... FILE: cramming/architectures/crammed_transformer.py class crammedTransformerConfig (line 21) | class crammedTransformerConfig(PretrainedConfig): method __init__ (line 24) | def __init__(self, cfg_arch_container: dict = {}, **kwargs): function construct_crammed_transformer (line 29) | def construct_crammed_transformer(cfg_arch, vocab_size): class FFNComponent (line 39) | class FFNComponent(torch.nn.Module): method __init__ (line 46) | def __init__(self, hidden_size, intermed_size, cfg_arch, output_size=N... method forward (line 61) | def forward(self, hidden_states): class TransformerLayer (line 65) | class TransformerLayer(torch.nn.Module): method __init__ (line 68) | def __init__(self, idx, cfg_arch): method forward (line 80) | def forward(self, states, attention_mask: Optional[torch.Tensor] = None): class ScriptableLM (line 86) | class ScriptableLM(PreTrainedModel): method __init__ (line 91) | def __init__(self, config): method forward (line 106) | def forward(self, input_ids: torch.Tensor): class ScriptableLMForPreTraining (line 124) | class ScriptableLMForPreTraining(PreTrainedModel): method __init__ (line 129) | def __init__(self, config): method _init_weights (line 141) | def _init_weights(self, module=None): method forward (line 152) | def forward(self, input_ids: torch.Tensor, *args, **kwargs): FILE: cramming/architectures/embeddings.py class PositionalEmbedding (line 11) | class PositionalEmbedding(torch.nn.Module): method __init__ (line 13) | def __init__(self, demb): method forward (line 21) | def forward(self, pos_seq, bsz=None): class RandomNoise (line 35) | class RandomNoise(torch.nn.Module): method __init__ (line 37) | def __init__(self, embedding_dim, max_seq_length=5000): method forward (line 41) | def forward(self, input_ids): class RPE (line 45) | class RPE(torch.nn.Module): method __init__ (line 52) | def __init__(self, d_model, num_heads, max_len=1024, dropout=0.1): method forward (line 68) | def forward(self, x): method skew (line 105) | def skew(self, QEr): class SinusoidalPositional (line 118) | class SinusoidalPositional(torch.nn.Module): method __init__ (line 125) | def __init__(self, embedding_dim, max_seq_length=5000): method forward (line 137) | def forward(self, input_ids): class ScaledSinosoidal (line 150) | class ScaledSinosoidal(SinusoidalPositional): method __init__ (line 153) | def __init__(self, embedding_dim, max_seq_length): method forward (line 157) | def forward(self, input_ids): class LearnablePositional (line 170) | class LearnablePositional(torch.nn.Module): method __init__ (line 173) | def __init__(self, embedding_dim, max_seq_length=1024): method forward (line 178) | def forward(self, input_ids): class LearnablePositionalRand (line 184) | class LearnablePositionalRand(torch.nn.Module): method __init__ (line 187) | def __init__(self, embedding_dim, max_seq_length=1024): method forward (line 193) | def forward(self, input_ids): class Rotary (line 206) | class Rotary(torch.nn.Module): method __init__ (line 207) | def __init__(self, dim, base=10000, def_seq_length=128, seq_dim: int =... method get_cos_sin_cache (line 230) | def get_cos_sin_cache(self, x: torch.Tensor): method _get_cos_sin (line 239) | def _get_cos_sin(self): method forward (line 248) | def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor): method single_forward (line 253) | def single_forward(self, inputs: torch.Tensor): method rotate_half (line 258) | def rotate_half(self, x: torch.Tensor): class RotarySanityCheck (line 262) | class RotarySanityCheck(torch.nn.Module): method __init__ (line 265) | def __init__(self, dim, base=10000, def_seq_length=128, seq_dim: int =... method get_cos_sin_cache (line 276) | def get_cos_sin_cache(self, x: torch.Tensor): method _get_cos_sin (line 285) | def _get_cos_sin(self): method forward (line 294) | def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor): method rotate_half (line 300) | def rotate_half(self, x: torch.Tensor): method single_forward (line 305) | def single_forward(self, inputs: torch.Tensor): class RotaryEleutherAI (line 313) | class RotaryEleutherAI(torch.nn.Module): method __init__ (line 329) | def __init__(self, dim_model: int, *_, **__): method _update_cos_sin_tables (line 340) | def _update_cos_sin_tables(self, x: torch.Tensor, seq_dimension: int =... method forward (line 356) | def forward(self, q: torch.Tensor, k: torch.Tensor, seq_dimension: int... method rotate_half (line 365) | def rotate_half(self, x: torch.Tensor): method apply_rotary_pos_emb (line 371) | def apply_rotary_pos_emb(self, x: torch.Tensor, cos: torch.Tensor, sin... class RotaryLLAMA (line 383) | class RotaryLLAMA(torch.nn.Module): method __init__ (line 386) | def __init__(self, hidden_per_head, base=10000, max_seq_length=512, se... method forward (line 392) | def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor): method apply_rotary_emb (line 395) | def apply_rotary_emb(self, xq: torch.Tensor, xk: torch.Tensor, freqs_c... method reshape_for_broadcast (line 404) | def reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tens... method precompute_freqs_cis (line 412) | def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): class FIRE (line 419) | class FIRE(torch.nn.Module): method __init__ (line 420) | def __init__(self, num_heads=12, mlp_width=32, init_c=0.1, init_L=512.... method forward (line 446) | def forward(self, seq_length, device): class Abacus (line 482) | class Abacus(torch.nn.Module): method __init__ (line 485) | def __init__(self, embedding_dim, max_seq_length=1024, max_k=99): method helper (line 491) | def helper(self, mask, device): method forward (line 517) | def forward(self, input_ids): FILE: cramming/architectures/huggingface_interface.py function construct_huggingface_model (line 6) | def construct_huggingface_model(cfg_arch, vocab_size): FILE: cramming/architectures/losses.py class CosineLoss (line 5) | class CosineLoss(torch.nn.Module): method __init__ (line 9) | def __init__(self, reduction: str = "mean", dim=-1, eps=1e-8) -> None: method forward (line 16) | def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: class CrossEntropyWithZLoss (line 20) | class CrossEntropyWithZLoss(torch.nn.Module): method __init__ (line 27) | def __init__(self, ignore_index=-100, z_loss_factor=1e-4): method forward (line 33) | def forward(self, inputs, labels): class MSELoss (line 44) | class MSELoss(torch.nn.Module): method __init__ (line 49) | def __init__(self, ignore_index=-100): method forward (line 54) | def forward(self, inputs, labels): method _label_to_onehot (line 64) | def _label_to_onehot(target, M: float = 1.0, num_classes: int = 100): class MSELossFast (line 70) | class MSELossFast(torch.nn.Module): method __init__ (line 75) | def __init__(self, ignore_index=-100): method forward (line 80) | def forward(self, inputs, labels): class L1Loss (line 94) | class L1Loss(torch.nn.Module): method __init__ (line 99) | def __init__(self, ignore_index=-100): method forward (line 104) | def forward(self, inputs, labels): method _label_to_onehot (line 114) | def _label_to_onehot(target, M: float = 1.0, num_classes: int = 100): class SzegedyLoss (line 120) | class SzegedyLoss(torch.nn.Module): method __init__ (line 126) | def __init__(self, embedding_layer, ignore_index=-100, overrelaxation=... method forward (line 133) | def forward(self, inputs, labels): class FocalLoss (line 173) | class FocalLoss(torch.nn.Module): method __init__ (line 174) | def __init__(self, gamma: float = 5.0, size_average: bool = True, igno... method forward (line 180) | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.... class IncorrectCrossEntropyLoss (line 191) | class IncorrectCrossEntropyLoss(torch.nn.CrossEntropyLoss): method forward (line 194) | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.... FILE: cramming/architectures/sanity_check.py class SanityCheckforPreTraining (line 6) | class SanityCheckforPreTraining(torch.nn.Module): method __init__ (line 9) | def __init__(self, width, vocab_size): method forward (line 14) | def forward( FILE: cramming/backend/optimizers/optimizer_modifiers.py class MetaOptimizer (line 14) | class MetaOptimizer(torch.optim.Optimizer): method __init__ (line 17) | def __init__(self, optimizer): method __getstate__ (line 21) | def __getstate__(self): method __setstate__ (line 24) | def __setstate__(self, state): method __repr__ (line 27) | def __repr__(self): method __getattr__ (line 30) | def __getattr__(self, name): method step (line 35) | def step(self, closure=None): class LARS (line 39) | class LARS(MetaOptimizer): method __init__ (line 66) | def __init__(self, optimizer, trust_coefficient=0.02, clip=False, eps=... method step (line 73) | def step(self, closure=None): class SAM (line 132) | class SAM(MetaOptimizer): method __init__ (line 133) | def __init__(self, base_optimizer_instance, rho=0.05): method first_step (line 141) | def first_step(self, zero_grad=False): method second_step (line 157) | def second_step(self, zero_grad=False): method step (line 170) | def step(self, closure=None): method _grad_norm (line 180) | def _grad_norm(self): FILE: cramming/backend/optimizers/progressive_batching.py class ProgressiveBatching (line 17) | class ProgressiveBatching(MetaOptimizer): method __init__ (line 18) | def __init__(self, optimizer, progress_rule="norm-based", theta=0.9, m... method step (line 33) | def step(self): method inner_product_test (line 63) | def inner_product_test(self): method norm_test (line 85) | def norm_test(self): method cosine_test (line 100) | def cosine_test(self): method coefficient_of_variation (line 119) | def coefficient_of_variation(self): method update_sample_statistics (line 136) | def update_sample_statistics(self): method reset_sample_statistics (line 147) | def reset_sample_statistics(self): method copy_mean_grad (line 157) | def copy_mean_grad(self): FILE: cramming/backend/optimizers/schedulers.py function get_schedule_fn (line 10) | def get_schedule_fn(cfg_train, elapsed_time: float=0.0, true_budget: flo... class DumbScheduler (line 213) | class DumbScheduler: method __init__ (line 214) | def __init__(self, *args, **kwargs): method step (line 217) | def step(self, *args, **kwargs): method _initial_step (line 220) | def _initial_step(self): method state_dict (line 225) | def state_dict(self): method load_state_dict (line 228) | def load_state_dict(self, state_dict): method get_last_lr (line 231) | def get_last_lr(self): method get_lr (line 235) | def get_lr(self): method print_lr (line 238) | def print_lr(self, is_verbose, group, lr, epoch=None): function get_inverse_sqrt_scheduler (line 245) | def get_inverse_sqrt_scheduler(optimizer, num_warmup_steps, num_cooldown... function get_one_cycle (line 276) | def get_one_cycle(optimizer, num_training_steps): function get_ramp (line 288) | def get_ramp(optimizer, num_cooldown_steps, num_training_steps): function _get_fake_step (line 302) | def _get_fake_step(current_step, initial_time, hour_budget, num_training... function get_budget_inv_sqrt_scheduler (line 312) | def get_budget_inv_sqrt_scheduler(optimizer, hour_budget, num_warmup_ste... function get_budget_constant_scheduler (line 335) | def get_budget_constant_scheduler(optimizer, hour_budget, num_warmup_ste... function get_budget_linear_schedule_with_warmup (line 351) | def get_budget_linear_schedule_with_warmup(optimizer, hour_budget, num_w... function get_budget_cosine_schedule_with_warmup (line 364) | def get_budget_cosine_schedule_with_warmup(optimizer, hour_budget, num_w... function get_budget_cosine_half_cycles_with_warmup (line 378) | def get_budget_cosine_half_cycles_with_warmup(optimizer, hour_budget, nu... function get_budget_one_cycle (line 392) | def get_budget_one_cycle(optimizer, hour_budget, num_training_steps, ela... function get_budget_multi_cycle (line 406) | def get_budget_multi_cycle(optimizer, hour_budget, num_training_steps, n... function get_budget_ramp (line 421) | def get_budget_ramp(optimizer, hour_budget, num_cooldown_steps, num_trai... function get_budget_inv_cosine_schedule (line 436) | def get_budget_inv_cosine_schedule(optimizer, hour_budget, num_cooldown_... function get_budget_triangle (line 454) | def get_budget_triangle(optimizer, hour_budget, num_training_steps, base... function get_budget_dive (line 471) | def get_budget_dive(optimizer, hour_budget, num_training_steps, num_warm... function get_budget_polynomial_decay_with_warmup (line 487) | def get_budget_polynomial_decay_with_warmup(optimizer, hour_budget, num_... FILE: cramming/backend/prepare_backend.py function load_backend (line 9) | def load_backend(model, tokenizer, cfg_train, cfg_impl, setup=_default_s... FILE: cramming/backend/torch_default.py function initialize_torch (line 41) | def initialize_torch(model, tokenizer, cfg_train, cfg_impl, setup=_defau... class TorchEngine (line 55) | class TorchEngine(torch.nn.Module): method __init__ (line 58) | def __init__(self, model, cfg_train, cfg_impl, setup=_default_setup, s... method get_true_budget (line 125) | def get_true_budget(self): method step (line 131) | def step(self, batch: dict[str, torch.Tensor]): method to_device (line 137) | def to_device(self, batch: dict[str, torch.Tensor], keys: list[str] = ... method forward (line 146) | def forward(self, *inputs, **kwargs): method backward (line 153) | def backward(self, loss): method forward_inference (line 160) | def forward_inference(self, *inputs, **kwargs): method dynamic_generation (line 168) | def dynamic_generation(self, *inputs, temperature=0.7, token_limit=100... method optimizer_step (line 193) | def optimizer_step(self): method set_train_batch_size (line 209) | def set_train_batch_size(self, batch_size): method schedule_batch_size (line 214) | def schedule_batch_size(self): method record_batch_size (line 235) | def record_batch_size(self): method record_tokens_per_step (line 241) | def record_tokens_per_step(self): method retrieve_model_state_dict (line 246) | def retrieve_model_state_dict(self): method _init_distributed (line 261) | def _init_distributed(self, model): method load_checkpoint (line 273) | def load_checkpoint(self, cfg_arch, file, skip_optim_state=False) -> D... method load_metadata (line 332) | def load_metadata(self, metadata: Dict[str, Any]): method save_training_checkpoint (line 337) | def save_training_checkpoint(self, checkpoint_directory: str, checkpoi... method save_final_model (line 360) | def save_final_model(self, base_directory, identifier, tokenizer, cfg_... method save_model (line 378) | def save_model( method push_to_hub (line 420) | def push_to_hub(self, tokenizer, cfg, dryrun=False): function _load_optimizer (line 476) | def _load_optimizer(model, cfg_train, cfg_impl, elapsed_time=0.0, true_b... FILE: cramming/backend/utils.py function group_parameters (line 14) | def group_parameters(model, cfg_train): function get_model_engine_tokenizer_dataloaders (line 32) | def get_model_engine_tokenizer_dataloaders(cfg, setup, train_eval: bool ... function load_model_checkpoint (line 113) | def load_model_checkpoint(model, model_dir, forward_only_model_with_skip... FILE: cramming/data/arithmetic_tokenizers.py class CustomCharLevelTokenizerForAddingPadding (line 11) | class CustomCharLevelTokenizerForAddingPadding(PreTrainedTokenizer): method __init__ (line 13) | def __init__(self, **kwargs): method vocab_size (line 46) | def vocab_size(self): method get_vocab (line 49) | def get_vocab(self): method _tokenize (line 52) | def _tokenize(self, text): method _convert_token_to_id (line 59) | def _convert_token_to_id(self, token): method _convert_id_to_token (line 62) | def _convert_id_to_token(self, index): method __call__ (line 66) | def __call__(self, text, **kwargs): method decode (line 72) | def decode(self, token_ids, **kwargs): class CustomCharLevelTokenizerForAddingPaddingWithIndexHints (line 78) | class CustomCharLevelTokenizerForAddingPaddingWithIndexHints(PreTrainedT... method __init__ (line 80) | def __init__(self, **kwargs): method vocab_size (line 115) | def vocab_size(self): method get_vocab (line 118) | def get_vocab(self): method _tokenize (line 121) | def _tokenize(self, text): method _convert_token_to_id (line 128) | def _convert_token_to_id(self, token): method _convert_id_to_token (line 131) | def _convert_id_to_token(self, index): method __call__ (line 135) | def __call__(self, text, **kwargs): method decode (line 141) | def decode(self, token_ids, **kwargs): class CustomCharLevelTokenizerSort (line 147) | class CustomCharLevelTokenizerSort(PreTrainedTokenizer): method __init__ (line 149) | def __init__(self, **kwargs): method vocab_size (line 190) | def vocab_size(self): method get_vocab (line 193) | def get_vocab(self): method _tokenize (line 196) | def _tokenize(self, text): method _convert_token_to_id (line 202) | def _convert_token_to_id(self, token): method _convert_id_to_token (line 205) | def _convert_id_to_token(self, index): method __call__ (line 209) | def __call__(self, text, **kwargs): method decode (line 215) | def decode(self, token_ids, **kwargs): FILE: cramming/data/curriculum_sorting.py function _sort_tokenized_dataset_by_unigram (line 10) | def _sort_tokenized_dataset_by_unigram(tokenized_dataset, tokenizer, num... function _sort_tokenized_dataset_by_token (line 53) | def _sort_tokenized_dataset_by_token(tokenized_dataset, tokenizer, targe... function _sort_tokenized_dataset_by_word_length (line 92) | def _sort_tokenized_dataset_by_word_length(tokenized_dataset, tokenizer,... FILE: cramming/data/deduplicate.py function deduplicate_huggingface_dataset (line 40) | def deduplicate_huggingface_dataset(dataset, threshold=100, original_cwd... function _write_tmp_file (line 60) | def _write_tmp_file(dataset, dirname): function _make_suffix_array (line 69) | def _make_suffix_array(text_file, tmpdir, path_to_rust_code): function _finish_and_return_to_hf_dataset (line 139) | def _finish_and_return_to_hf_dataset(original_text_file, remove_file_cac... FILE: cramming/data/pretraining_preparation.py function get_num_workers (line 41) | def get_num_workers(cfg_impl): function load_pretraining_corpus (line 50) | def load_pretraining_corpus(cfg_data, cfg_impl, data_dir: str = None): function load_tokenized_data (line 147) | def load_tokenized_data(tokenized_dataset_path): function convert_to_hf_dataset (line 151) | def convert_to_hf_dataset(tokenized_data): function preprocess_dataset (line 162) | def preprocess_dataset(cfg_data, download_path, num_threads=1, max_raw_c... function _move_stream_to_fixed_map (line 233) | def _move_stream_to_fixed_map(raw_data_streamed, max_entries_in_raw_data... function _huggingface_preprocessing (line 263) | def _huggingface_preprocessing(raw_dataset, tokenizer, cfg_data, num_thr... function _load_fake_dataset (line 357) | def _load_fake_dataset(cfg_data, details, path=None): function _concatenate_entries (line 366) | def _concatenate_entries(dataset, num_entries_in_group, num_threads): function raw_dataset_preprocessing (line 402) | def raw_dataset_preprocessing(raw_dataset, num_threads, cfg_data): function main_process_first (line 454) | def main_process_first(): function _load_from_hub (line 478) | def _load_from_hub(cfg_data, data_path): function prepare_dataloaders (line 498) | def prepare_dataloaders(datasets, tokenizer, cfg_train, cfg_impl) -> Dic... function prepare_pretraining_dataloader (line 506) | def prepare_pretraining_dataloader(dataset, tokenizer, cfg_train, cfg_im... function prepare_validation_dataloader (line 554) | def prepare_validation_dataloader(dataset, tokenizer, cfg_impl): class FastDataCollatorForLanguageModeling (line 586) | class FastDataCollatorForLanguageModeling(transformers.DataCollatorForLa... method __init__ (line 587) | def __init__(self, *args, create_labels_entry=False, **kwargs): method torch_call (line 592) | def torch_call(self, examples): class InfiniteDataLoader (line 616) | class InfiniteDataLoader(torch.utils.data.DataLoader): method __init__ (line 619) | def __init__(self, *args, **kwargs): method __iter__ (line 625) | def __iter__(self): method __next__ (line 628) | def __next__(self): method set_epoch (line 640) | def set_epoch(self, epoch: int): class RuntimeInfiniteDataLoader (line 643) | class RuntimeInfiniteDataLoader(torch.utils.data.DataLoader): method __init__ (line 646) | def __init__(self, tokenizer, device, *args, **kwargs): method get_arithmetic (line 661) | def get_arithmetic(self, n, m): method tokenize_batch (line 685) | def tokenize_batch(self, batch): method __iter__ (line 696) | def __iter__(self): method __next__ (line 699) | def __next__(self): FILE: cramming/data/tokenizer_preparation.py function get_tokenizer (line 12) | def get_tokenizer(tokenizer_type: str): function load_tokenizer (line 27) | def load_tokenizer(tokenizer_path_or_name, seq_length=512, vocab_size=No... function construct_tokenizer (line 38) | def construct_tokenizer(raw_datasets, cfg_data, path, known_tokens=[]): function _download_tokenizer (line 48) | def _download_tokenizer(tokenizer_path_or_name, seq_length, cache_dir=No... function _get_sane_token_args (line 57) | def _get_sane_token_args(): function _get_sane_normalizers (line 67) | def _get_sane_normalizers(force_english_keyboard=False, force_lowercase=... function _construct_tokenizer (line 85) | def _construct_tokenizer(raw_datasets, cfg_data, known_tokens=[]): FILE: cramming/data/utils.py function checksum_config (line 17) | def checksum_config(cfg): function stage_dataset (line 26) | def stage_dataset(data_directory_path, local_staging_dir): function _get_size (line 65) | def _get_size(start_path="."): function detailed_OSError (line 78) | def detailed_OSError(e): FILE: cramming/utils.py function main_launcher (line 37) | def main_launcher(cfg, main_fn, job_name=""): function get_cpus (line 78) | def get_cpus() -> int: function system_startup (line 90) | def system_startup(cfg): function is_main_process (line 184) | def is_main_process(): function num_processes (line 188) | def num_processes(): function find_pretrained_checkpoint (line 194) | def find_pretrained_checkpoint(checkpoint: str, local_checkpoint_folder:... function save_summary (line 256) | def save_summary(table_name, cfg, stats, local_time, setup, original_cwd... function save_to_table (line 334) | def save_to_table(out_dir, table_name, dryrun, **kwargs): function set_random_seed (line 367) | def set_random_seed(seed=233): function set_deterministic (line 378) | def set_deterministic(): function avg_n_dicts (line 386) | def avg_n_dicts(dicts): function dump_metrics (line 406) | def dump_metrics(cfg, metrics): function _initialize_wandb (line 420) | def _initialize_wandb(setup, cfg): function wandb_log (line 440) | def wandb_log(stats, cfg): function flatten (line 448) | def flatten(d, parent_key="", sep="_"): function collect_system_metrics (line 460) | def collect_system_metrics(cfg, metrics, kWh_counter, setup): function get_kWh (line 479) | def get_kWh(kWh_counter, setup): function pathfinder (line 486) | def pathfinder(cfg): FILE: create_data_split.py function generate_no_carry_addition (line 20) | def generate_no_carry_addition(n, m): function has_carry (line 31) | def has_carry(num1, num2): function generate_dataset (line 39) | def generate_dataset(dir_name, operation, n, m, num_examples, base_folde... function tokenize_and_save_dataset (line 128) | def tokenize_and_save_dataset(dataset, tokenizer, directory, test_split_... function character_histogram (line 190) | def character_histogram(dir_name, condense_white_space=False): function token_histogram (line 238) | def token_histogram(dir_name, tokenizer_type="normal"): function main_dataset_gen (line 296) | def main_dataset_gen(dir_name, op, n, m, num_samples, exact=False, keep_... function tokenize_main (line 305) | def tokenize_main(dir_name, tokenizer_type, test_split_ratio=0.05): function pick_char_set (line 334) | def pick_char_set(max_len): function hints_helper (line 346) | def hints_helper(num_str, chars): function bucket_method_gen (line 353) | def bucket_method_gen(n=3, m=3, operation='+', limit=1000, p=0, no_carry... function bucket_method_main (line 412) | def bucket_method_main(n, m, operation, limit, dir_name, p=0, no_carry_a... function uniform_distribution_sort_basic (line 437) | def uniform_distribution_sort_basic(maximum_number_of_digts, maximum_len... function bucket_uniform_distribution (line 467) | def bucket_uniform_distribution(maximum_number_of_digts, maximum_length,... function uniform_distribution_sort_main (line 476) | def uniform_distribution_sort_main(FLAGS, dir_name): function main (line 507) | def main(): FILE: create_pos_or_variants.py function one_hot_vector (line 6) | def one_hot_vector(length, index=None): function zero_vector (line 14) | def zero_vector(length): function main (line 19) | def main(): FILE: dataset_analysis.py function read_dataset (line 9) | def read_dataset(dir_name, condense_white_space=False): function remove_leading_zeros (line 26) | def remove_leading_zeros(match): function count_digits (line 30) | def count_digits(dataset, remove_formatting=False): function plot_pairs_heatmap (line 53) | def plot_pairs_heatmap(pairs, dir_name=".", remove_formatting=False): function line_plotter (line 73) | def line_plotter(data, name, dir_name=".", remove_formatting=False): function consecutive_digit_counts (line 89) | def consecutive_digit_counts(input_strings): function create_repetition_heatmap (line 119) | def create_repetition_heatmap(data, dir_name=".", remove_formatting=False): function main (line 134) | def main(dir_name): FILE: load_local_model.py function main_load_process (line 23) | def main_load_process(cfg, setup): function launch (line 42) | def launch(cfg): FILE: pretrain.py function main_training_process (line 17) | def main_training_process(cfg, setup): function get_time_elapsed (line 187) | def get_time_elapsed(start_time: float, additional_time: float = 0.0) ->... function check_checkpointing (line 190) | def check_checkpointing(data_idx: int, cfg_impl, last_save_time) -> bool: function check_deadline (line 196) | def check_deadline(launch_time, hour_limit, prev_budget: float = 0.0, ov... function check_early_termination (line 205) | def check_early_termination(start_time, loss, early_termination, prev_bu... function collect_stats (line 217) | def collect_stats(data_step, loss_vals, log_ppls, model_outputs, train_t... function validate (line 265) | def validate(model_engine, validloader, setup, cfg): function generate (line 313) | def generate(model_engine, tokenizer, example_prompts, token_limit=10, t... function flag_communication (line 331) | def flag_communication(training_allowed): function launch (line 345) | def launch(cfg): FILE: pretty_plotter.py function find_file (line 12) | def find_file(starting_directory, target_file): function grid_plotter (line 18) | def grid_plotter(data, type="accs", path="", title=None, rect_size=20, u... function main (line 44) | def main(): FILE: pretty_plotter_big.py function grid_plotter (line 14) | def grid_plotter(data, type="accs", path="", title=None, rect_size=20): function main (line 40) | def main(): FILE: pretty_plotter_sort.py function grid_plotter (line 8) | def grid_plotter(data, title="", path=None): function run (line 31) | def run(names, short_hand, base_dir, sort_plots_path): FILE: sort_eval.py function grid_plotter (line 21) | def grid_plotter(data, type="accs", name='_large', extra_path=None): function grid_logic (line 44) | def grid_logic(cfg): function main (line 189) | def main(cfg): function launch (line 375) | def launch(cfg): FILE: upload_processed_dataset.py function upload (line 18) | def upload(cfg, setup): function launch (line 81) | def launch(cfg):