SYMBOL INDEX (343 symbols across 33 files) FILE: dev/gen_synthetic_data.py function sample_diversity_elements (line 312) | def sample_diversity_elements(rng): function generate_conversation (line 338) | def generate_conversation(idx: int): function validate_conversation (line 383) | def validate_conversation(messages): FILE: nanochat/checkpoint_manager.py function log0 (line 19) | def log0(message): function _patch_missing_config_keys (line 23) | def _patch_missing_config_keys(model_config_kwargs): function _patch_missing_keys (line 30) | def _patch_missing_keys(model_data, model_config): function save_checkpoint (line 42) | def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, me... function load_checkpoint (line 61) | def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False, ... function build_model (line 77) | def build_model(checkpoint_dir, step, device, phase): function find_largest_model (line 118) | def find_largest_model(checkpoints_dir): function find_last_step (line 138) | def find_last_step(checkpoint_dir): function load_model_from_dir (line 149) | def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, ... function load_model (line 164) | def load_model(source, *args, **kwargs): function load_optimizer_state (line 174) | def load_optimizer_state(source, device, rank, model_tag=None, step=None): FILE: nanochat/common.py function _detect_compute_dtype (line 17) | def _detect_compute_dtype(): class ColoredFormatter (line 33) | class ColoredFormatter(logging.Formatter): method format (line 45) | def format(self, record): function setup_default_logging (line 59) | def setup_default_logging(): function get_base_dir (line 70) | def get_base_dir(): function download_file_with_lock (line 81) | def download_file_with_lock(url, filename, postprocess_fn=None): function print0 (line 117) | def print0(s="",**kwargs): function print_banner (line 122) | def print_banner(): function is_ddp_requested (line 136) | def is_ddp_requested() -> bool: function is_ddp_initialized (line 143) | def is_ddp_initialized() -> bool: function get_dist_info (line 150) | def get_dist_info(): function autodetect_device_type (line 162) | def autodetect_device_type(): function compute_init (line 173) | def compute_init(device_type="cuda"): # cuda|cpu|mps function compute_cleanup (line 210) | def compute_cleanup(): class DummyWandb (line 215) | class DummyWandb: method __init__ (line 217) | def __init__(self): method log (line 219) | def log(self, *args, **kwargs): method finish (line 221) | def finish(self): function get_peak_flops (line 227) | def get_peak_flops(device_name: str) -> float: FILE: nanochat/core_eval.py function render_prompts_mc (line 17) | def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None): function render_prompts_schema (line 36) | def render_prompts_schema(item, continuation_delimiter, fewshot_examples... function render_prompts_lm (line 56) | def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None): function find_common_length (line 86) | def find_common_length(token_sequences, direction='left'): function stack_sequences (line 104) | def stack_sequences(tokens, pad_token_id): function batch_sequences_mc (line 113) | def batch_sequences_mc(tokenizer, prompts): function batch_sequences_schema (line 123) | def batch_sequences_schema(tokenizer, prompts): function batch_sequences_lm (line 133) | def batch_sequences_lm(tokenizer, prompts): function forward_model (line 145) | def forward_model(model, input_ids): function evaluate_example (line 168) | def evaluate_example(idx, model, tokenizer, data, device, task_meta): function evaluate_task (line 244) | def evaluate_task(model, tokenizer, data, device, task_meta): FILE: nanochat/dataloader.py function _document_batches (line 25) | def _document_batches(split, resume_state_dict, tokenizer_batch_size): function tokenizing_distributed_data_loader_with_state_bos_bestfit (line 74) | def tokenizing_distributed_data_loader_with_state_bos_bestfit( function tokenizing_distributed_data_loader_bos_bestfit (line 163) | def tokenizing_distributed_data_loader_bos_bestfit(*args, **kwargs): FILE: nanochat/dataset.py function list_parquet_files (line 32) | def list_parquet_files(data_dir=None, warn_on_legacy=False): function parquets_iter_batched (line 67) | def parquets_iter_batched(split, start=0, step=1): function download_single_file (line 84) | def download_single_file(index): FILE: nanochat/engine.py function timeout (line 26) | def timeout(duration, formula): function eval_with_timeout (line 35) | def eval_with_timeout(formula, max_time=3): function use_calculator (line 46) | def use_calculator(expr): class KVCache (line 82) | class KVCache: method __init__ (line 92) | def __init__(self, batch_size, num_heads, seq_len, head_dim, num_layer... method reset (line 106) | def reset(self): method get_pos (line 111) | def get_pos(self): method get_layer_cache (line 115) | def get_layer_cache(self, layer_idx): method advance (line 119) | def advance(self, num_tokens): method prefill (line 123) | def prefill(self, other): function sample_next_token (line 141) | def sample_next_token(logits, rng, temperature=1.0, top_k=None): class RowState (line 160) | class RowState: method __init__ (line 162) | def __init__(self, current_tokens=None): class Engine (line 169) | class Engine: method __init__ (line 171) | def __init__(self, model, tokenizer): method generate (line 176) | def generate(self, tokens, num_samples=1, max_tokens=None, temperature... method generate_batch (line 282) | def generate_batch(self, tokens, num_samples=1, **kwargs): FILE: nanochat/execution.py class ExecutionResult (line 38) | class ExecutionResult: method __repr__ (line 47) | def __repr__(self): function time_limit (line 65) | def time_limit(seconds: float): function capture_io (line 78) | def capture_io(): function create_tempdir (line 90) | def create_tempdir(): class TimeoutException (line 96) | class TimeoutException(Exception): class WriteOnlyStringIO (line 100) | class WriteOnlyStringIO(io.StringIO): method read (line 103) | def read(self, *args, **kwargs): method readline (line 106) | def readline(self, *args, **kwargs): method readlines (line 109) | def readlines(self, *args, **kwargs): method readable (line 112) | def readable(self, *args, **kwargs): class redirect_stdin (line 117) | class redirect_stdin(contextlib._RedirectStream): # type: ignore function chdir (line 122) | def chdir(root): function reliability_guard (line 134) | def reliability_guard(maximum_memory_bytes: Optional[int] = None): function _unsafe_execute (line 214) | def _unsafe_execute(code: str, timeout: float, maximum_memory_bytes: Opt... function execute_code (line 286) | def execute_code( FILE: nanochat/flash_attention.py function _load_flash_attention_3 (line 23) | def _load_flash_attention_3(): function _resolve_use_fa3 (line 48) | def _resolve_use_fa3(): function _sdpa_attention (line 69) | def _sdpa_attention(q, k, v, window_size, enable_gqa): function flash_attn_func (line 107) | def flash_attn_func(q, k, v, causal=False, window_size=(-1, -1)): function flash_attn_with_kvcache (line 131) | def flash_attn_with_kvcache(q, k_cache, v_cache, k=None, v=None, cache_s... FILE: nanochat/fp8.py function _to_fp8 (line 82) | def _to_fp8(x, fp8_dtype): function _to_col_major (line 110) | def _to_col_major(x): class _Float8Matmul (line 125) | class _Float8Matmul(torch.autograd.Function): method forward (line 133) | def forward(ctx, input_2d, weight): method backward (line 157) | def backward(ctx, grad_output): class Float8Linear (line 195) | class Float8Linear(nn.Linear): method forward (line 202) | def forward(self, input): method from_float (line 216) | def from_float(cls, mod): class Float8LinearConfig (line 230) | class Float8LinearConfig: method from_recipe_name (line 234) | def from_recipe_name(recipe_name): function convert_to_float8_training (line 243) | def convert_to_float8_training(module, *, config=None, module_filter_fn=... FILE: nanochat/gpt.py class GPTConfig (line 29) | class GPTConfig: function norm (line 42) | def norm(x): class Linear (line 45) | class Linear(nn.Linear): method forward (line 49) | def forward(self, x): function has_ve (line 53) | def has_ve(layer_idx, n_layer): function apply_rotary_emb (line 57) | def apply_rotary_emb(x, cos, sin): class CausalSelfAttention (line 65) | class CausalSelfAttention(nn.Module): method __init__ (line 66) | def __init__(self, config, layer_idx): method forward (line 82) | def forward(self, x, ve, cos_sin, window_size, kv_cache): class MLP (line 129) | class MLP(nn.Module): method __init__ (line 130) | def __init__(self, config): method forward (line 135) | def forward(self, x): class Block (line 142) | class Block(nn.Module): method __init__ (line 143) | def __init__(self, config, layer_idx): method forward (line 148) | def forward(self, x, ve, cos_sin, window_size, kv_cache): class GPT (line 154) | class GPT(nn.Module): method __init__ (line 155) | def __init__(self, config, pad_vocab_size_to=64): method init_weights (line 202) | def init_weights(self): method _precompute_rotary_embeddings (line 263) | def _precompute_rotary_embeddings(self, seq_len, head_dim, base=100000... method _compute_window_sizes (line 280) | def _compute_window_sizes(self, config): method get_device (line 309) | def get_device(self): method estimate_flops (line 312) | def estimate_flops(self): method num_scaling_params (line 340) | def num_scaling_params(self): method setup_optimizer (line 369) | def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matr... method forward (line 411) | def forward(self, idx, targets=None, kv_cache=None, loss_reduction='me... method generate (line 479) | def generate(self, tokens, max_tokens, temperature=1.0, top_k=None, se... FILE: nanochat/loss_eval.py function evaluate_bpb (line 9) | def evaluate_bpb(model, batches, steps, token_bytes): FILE: nanochat/optim.py function adamw_step_fused (line 21) | def adamw_step_fused( function muon_step_fused (line 91) | def muon_step_fused( class MuonAdamW (line 152) | class MuonAdamW(torch.optim.Optimizer): method __init__ (line 178) | def __init__(self, param_groups: list[dict]): method _step_adamw (line 194) | def _step_adamw(self, group: dict) -> None: method _step_muon (line 229) | def _step_muon(self, group: dict) -> None: method step (line 284) | def step(self): class DistMuonAdamW (line 297) | class DistMuonAdamW(torch.optim.Optimizer): method __init__ (line 355) | def __init__(self, param_groups: list[dict]): method _reduce_adamw (line 369) | def _reduce_adamw(self, group: dict, world_size: int) -> dict: method _reduce_muon (line 387) | def _reduce_muon(self, group: dict, world_size: int) -> dict: method _compute_adamw (line 408) | def _compute_adamw(self, group: dict, info: dict, gather_list: list, r... method _compute_muon (line 449) | def _compute_muon(self, group: dict, info: dict, gather_list: list, ra... method _finish_gathers (line 499) | def _finish_gathers(self, gather_list: list) -> None: method step (line 508) | def step(self): FILE: nanochat/report.py function run_command (line 15) | def run_command(cmd): function get_git_info (line 28) | def get_git_info(): function get_gpu_info (line 44) | def get_gpu_info(): function get_system_info (line 67) | def get_system_info(): function estimate_cost (line 89) | def estimate_cost(gpu_info, runtime_hours=None): function generate_header (line 120) | def generate_header(): function slugify (line 203) | def slugify(text): function extract (line 222) | def extract(section, keys): function extract_timestamp (line 233) | def extract_timestamp(content, prefix): class Report (line 244) | class Report: method __init__ (line 247) | def __init__(self, report_dir): method log (line 251) | def log(self, section, data): method generate (line 279) | def generate(self): method reset (line 371) | def reset(self): class DummyReport (line 394) | class DummyReport: method log (line 395) | def log(self, *args, **kwargs): method reset (line 397) | def reset(self, *args, **kwargs): function get_report (line 400) | def get_report(): FILE: nanochat/tokenizer.py class HuggingFaceTokenizer (line 39) | class HuggingFaceTokenizer: method __init__ (line 42) | def __init__(self, tokenizer): method from_pretrained (line 46) | def from_pretrained(cls, hf_path): method from_directory (line 52) | def from_directory(cls, tokenizer_dir): method train_from_iterator (line 59) | def train_from_iterator(cls, text_iterator, vocab_size): method get_vocab_size (line 95) | def get_vocab_size(self): method get_special_tokens (line 98) | def get_special_tokens(self): method id_to_token (line 103) | def id_to_token(self, id): method _encode_one (line 106) | def _encode_one(self, text, prepend=None, append=None, num_threads=None): method encode_special (line 121) | def encode_special(self, text): method get_bos_token_id (line 125) | def get_bos_token_id(self): method encode (line 136) | def encode(self, text, *args, **kwargs): method __call__ (line 144) | def __call__(self, *args, **kwargs): method decode (line 147) | def decode(self, ids): method save (line 150) | def save(self, tokenizer_dir): class RustBPETokenizer (line 163) | class RustBPETokenizer: method __init__ (line 166) | def __init__(self, enc, bos_token): method train_from_iterator (line 171) | def train_from_iterator(cls, text_iterator, vocab_size): method from_directory (line 193) | def from_directory(cls, tokenizer_dir): method from_pretrained (line 200) | def from_pretrained(cls, tiktoken_name): method get_vocab_size (line 209) | def get_vocab_size(self): method get_special_tokens (line 212) | def get_special_tokens(self): method id_to_token (line 215) | def id_to_token(self, id): method encode_special (line 219) | def encode_special(self, text): method get_bos_token_id (line 222) | def get_bos_token_id(self): method encode (line 225) | def encode(self, text, prepend=None, append=None, num_threads=8): method __call__ (line 252) | def __call__(self, *args, **kwargs): method decode (line 255) | def decode(self, ids): method save (line 258) | def save(self, tokenizer_dir): method render_conversation (line 266) | def render_conversation(self, conversation, max_tokens=2048): method visualize_tokenization (line 352) | def visualize_tokenization(self, ids, mask, with_token_id=False): method render_for_completion (line 367) | def render_for_completion(self, conversation): function get_tokenizer (line 390) | def get_tokenizer(): function get_token_bytes (line 397) | def get_token_bytes(device="cpu"): FILE: scripts/base_eval.py class ModelWrapper (line 45) | class ModelWrapper: method __init__ (line 47) | def __init__(self, model, max_seq_len=None): method __call__ (line 51) | def __call__(self, input_ids, targets=None, loss_reduction='mean'): method get_device (line 63) | def get_device(self): function load_hf_model (line 67) | def load_hf_model(hf_path: str, device): function get_hf_token_bytes (line 80) | def get_hf_token_bytes(tokenizer, device="cpu"): function place_eval_bundle (line 95) | def place_eval_bundle(file_path): function evaluate_core (line 107) | def evaluate_core(model, tokenizer, device, max_per_task=-1): function main (line 178) | def main(): FILE: scripts/base_train.py function build_model_meta (line 129) | def build_model_meta(depth): function fp8_module_filter (line 178) | def fp8_module_filter(mod: nn.Module, fqn: str) -> bool: function disable_fp8 (line 196) | def disable_fp8(model): function get_scaling_params (line 262) | def get_scaling_params(m): function get_lr_multiplier (line 359) | def get_lr_multiplier(it): function get_muon_momentum (line 371) | def get_muon_momentum(it): function get_weight_decay (line 384) | def get_weight_decay(it): FILE: scripts/chat_eval.py function run_generative_eval (line 29) | def run_generative_eval(task_object, tokenizer, model, engine, num_sampl... function run_categorical_eval (line 88) | def run_categorical_eval(task_object, tokenizer, model, batch_size, max_... function run_chat_eval (line 157) | def run_chat_eval(task_name, model, tokenizer, engine, FILE: scripts/chat_rl.py function get_batch (line 86) | def get_batch(): function run_gsm8k_eval (line 150) | def run_gsm8k_eval(task, tokenizer, engine, function get_lr_multiplier (line 210) | def get_lr_multiplier(it): FILE: scripts/chat_sft.py function sft_data_generator_bos_bestfit (line 187) | def sft_data_generator_bos_bestfit(split, buffer_size=100): function get_lr_multiplier (line 314) | def get_lr_multiplier(progress): function get_muon_momentum (line 324) | def get_muon_momentum(it): function centered_mean (line 384) | def centered_mean(tasks): FILE: scripts/chat_web.py class Worker (line 87) | class Worker: class WorkerPool (line 94) | class WorkerPool: method __init__ (line 97) | def __init__(self, num_gpus: Optional[int] = None): method initialize (line 107) | async def initialize(self, source: str, model_tag: Optional[str] = Non... method acquire_worker (line 135) | async def acquire_worker(self) -> Worker: method release_worker (line 139) | async def release_worker(self, worker: Worker): class ChatMessage (line 143) | class ChatMessage(BaseModel): class ChatRequest (line 147) | class ChatRequest(BaseModel): function validate_chat_request (line 153) | def validate_chat_request(request: ChatRequest): function lifespan (line 217) | async def lifespan(app: FastAPI): function root (line 236) | async def root(): function logo (line 250) | async def logo(): function generate_stream (line 255) | async def generate_stream( function chat_completions (line 306) | async def chat_completions(request: ChatRequest): function health (line 377) | async def health(): function stats (line 388) | async def stats(): FILE: scripts/tok_eval.py function print_comparison (line 203) | def print_comparison(baseline_name, baseline_results, ours_results, all_... FILE: scripts/tok_train.py function text_iterator (line 28) | def text_iterator(): FILE: tasks/arc.py class ARC (line 9) | class ARC(Task): method __init__ (line 11) | def __init__(self, subset, split, **kwargs): method eval_type (line 18) | def eval_type(self): method num_examples (line 21) | def num_examples(self): method get_example (line 24) | def get_example(self, index): method evaluate (line 43) | def evaluate(self, conversation, assistant_response): FILE: tasks/common.py class Task (line 10) | class Task: method __init__ (line 15) | def __init__(self, start=0, stop=None, step=1): method eval_type (line 25) | def eval_type(self): method num_examples (line 29) | def num_examples(self): method get_example (line 32) | def get_example(self, index): method __len__ (line 35) | def __len__(self): method __getitem__ (line 44) | def __getitem__(self, index: int): method evaluate (line 50) | def evaluate(self, problem, completion): class TaskMixture (line 54) | class TaskMixture(Task): method __init__ (line 60) | def __init__(self, tasks, **kwargs): method num_examples (line 76) | def num_examples(self): method get_example (line 79) | def get_example(self, index): class TaskSequence (line 89) | class TaskSequence(Task): method __init__ (line 95) | def __init__(self, tasks, **kwargs): method num_examples (line 101) | def num_examples(self): method get_example (line 104) | def get_example(self, index): function render_mc (line 112) | def render_mc(question, letters, choices): FILE: tasks/customjson.py class CustomJSON (line 10) | class CustomJSON(Task): method __init__ (line 17) | def __init__(self, filepath, **kwargs): method num_examples (line 56) | def num_examples(self): method get_example (line 59) | def get_example(self, index): FILE: tasks/gsm8k.py function extract_answer (line 23) | def extract_answer(completion): class GSM8K (line 37) | class GSM8K(Task): method __init__ (line 39) | def __init__(self, subset, split, **kwargs): method eval_type (line 46) | def eval_type(self): method num_examples (line 49) | def num_examples(self): method get_example (line 52) | def get_example(self, index): method evaluate (line 87) | def evaluate(self, conversation, assistant_response): method reward (line 110) | def reward(self, conversation, assistant_response): FILE: tasks/humaneval.py function extract_imports (line 12) | def extract_imports(prompt): function extract_program (line 24) | def extract_program(completion): class HumanEval (line 47) | class HumanEval(Task): method __init__ (line 49) | def __init__(self, **kwargs): method eval_type (line 54) | def eval_type(self): method num_examples (line 57) | def num_examples(self): method get_example (line 60) | def get_example(self, index): method evaluate (line 79) | def evaluate(self, conversation, completion): FILE: tasks/mmlu.py class MMLU (line 9) | class MMLU(Task): method __init__ (line 14) | def __init__(self, subset, split, **kwargs): method eval_type (line 28) | def eval_type(self): method num_examples (line 31) | def num_examples(self): method get_example (line 34) | def get_example(self, index): method evaluate (line 55) | def evaluate(self, conversation, assistant_response): FILE: tasks/smoltalk.py class SmolTalk (line 10) | class SmolTalk(Task): method __init__ (line 13) | def __init__(self, split, **kwargs): method num_examples (line 19) | def num_examples(self): method get_example (line 22) | def get_example(self, index): FILE: tasks/spellingbee.py function extract_answer (line 43) | def extract_answer(completion): class SpellingBee (line 115) | class SpellingBee(Task): method __init__ (line 117) | def __init__(self, size=1000, split="train", **kwargs): method eval_type (line 129) | def eval_type(self): method num_examples (line 132) | def num_examples(self): method get_example (line 135) | def get_example(self, index): method evaluate (line 207) | def evaluate(self, conversation, assistant_response): method reward (line 226) | def reward(self, conversation, assistant_response): class SimpleSpelling (line 233) | class SimpleSpelling(Task): method __init__ (line 236) | def __init__(self, size=1000, split="train", **kwargs): method eval_type (line 250) | def eval_type(self): method num_examples (line 253) | def num_examples(self): method get_example (line 256) | def get_example(self, index): FILE: tests/test_attention_fallback.py function set_impl (line 23) | def set_impl(impl): function run_both_impls (line 29) | def run_both_impls(fn): function assert_close (line 39) | def assert_close(t1, t2, name, atol=1e-2, rtol=1e-2): class TestFA3VsSDPA (line 52) | class TestFA3VsSDPA: method test_basic_causal (line 58) | def test_basic_causal(self): method test_full_context (line 72) | def test_full_context(self): method test_sliding_window (line 86) | def test_sliding_window(self): method test_gqa (line 101) | def test_gqa(self): method test_larger_model (line 118) | def test_larger_model(self): method test_kvcache_prefill (line 132) | def test_kvcache_prefill(self): method test_kvcache_single_token (line 155) | def test_kvcache_single_token(self): method test_kvcache_single_token_sliding_window (line 182) | def test_kvcache_single_token_sliding_window(self): method test_backward_gradients_match (line 215) | def test_backward_gradients_match(self): class TestSDPAOnly (line 254) | class TestSDPAOnly: method test_basic_forward (line 260) | def test_basic_forward(self): method test_backward (line 274) | def test_backward(self): method test_kvcache (line 292) | def test_kvcache(self): class TestOverrideMechanism (line 340) | class TestOverrideMechanism: method test_override_fa3 (line 344) | def test_override_fa3(self): method test_override_sdpa (line 350) | def test_override_sdpa(self): method test_override_auto (line 356) | def test_override_auto(self): FILE: tests/test_engine.py class MockConfig (line 16) | class MockConfig: class MockModel (line 25) | class MockModel: method __init__ (line 31) | def __init__(self, vocab_size=262): # 256 bytes + 6 special tokens method get_device (line 36) | def get_device(self): method forward (line 39) | def forward(self, ids, kv_cache=None): class ByteTokenizer (line 50) | class ByteTokenizer: method __init__ (line 55) | def __init__(self): method encode_special (line 67) | def encode_special(self, s): method get_bos_token_id (line 70) | def get_bos_token_id(self): method encode (line 73) | def encode(self, s, prepend=None): method decode (line 79) | def decode(self, tokens): function test_kv_cache_basic (line 84) | def test_kv_cache_basic(): function test_kv_cache_prefill (line 124) | def test_kv_cache_prefill(): function test_multi_sample_first_token_diversity (line 158) | def test_multi_sample_first_token_diversity(): function test_seed_reproducibility (line 201) | def test_seed_reproducibility(): function test_temperature_zero_determinism (line 214) | def test_temperature_zero_determinism(): function test_max_tokens_respected (line 226) | def test_max_tokens_respected(): function test_num_samples_count (line 238) | def test_num_samples_count(): function test_different_seeds_introduce_variation_when_temperature_nonzero (line 249) | def test_different_seeds_introduce_variation_when_temperature_nonzero():