SYMBOL INDEX (1376 symbols across 126 files)

FILE: .github/scripts/check_double_quotes.py
  function should_skip (line 37) | def should_skip(path):
  function collect_fstring_expr_string_positions (line 42) | def collect_fstring_expr_string_positions(source):
  function check_quotes_in_source (line 76) | def check_quotes_in_source(source, path):
  function check_file (line 104) | def check_file(path):
  function check_notebook (line 115) | def check_notebook(path):
  function parse_args (line 126) | def parse_args():
  function main (line 136) | def main():

FILE: appendix-A/01_main-chapter-code/DDP-script-torchrun.py
  function ddp_setup (line 22) | def ddp_setup(rank, world_size):
  class ToyDataset (line 48) | class ToyDataset(Dataset):
    method __init__ (line 49) | def __init__(self, X, y):
    method __getitem__ (line 53) | def __getitem__(self, index):
    method __len__ (line 58) | def __len__(self):
  class NeuralNetwork (line 62) | class NeuralNetwork(torch.nn.Module):
    method __init__ (line 63) | def __init__(self, num_inputs, num_outputs):
    method forward (line 79) | def forward(self, x):
  function prepare_dataset (line 84) | def prepare_dataset():
  function main (line 128) | def main(rank, world_size, num_epochs):
  function compute_accuracy (line 181) | def compute_accuracy(model, dataloader, device):

FILE: appendix-A/01_main-chapter-code/DDP-script.py
  function ddp_setup (line 23) | def ddp_setup(rank, world_size):
  class ToyDataset (line 49) | class ToyDataset(Dataset):
    method __init__ (line 50) | def __init__(self, X, y):
    method __getitem__ (line 54) | def __getitem__(self, index):
    method __len__ (line 59) | def __len__(self):
  class NeuralNetwork (line 63) | class NeuralNetwork(torch.nn.Module):
    method __init__ (line 64) | def __init__(self, num_inputs, num_outputs):
    method forward (line 80) | def forward(self, x):
  function prepare_dataset (line 85) | def prepare_dataset():
  function main (line 129) | def main(rank, world_size, num_epochs):
  function compute_accuracy (line 182) | def compute_accuracy(model, dataloader, device):

FILE: appendix-D/01_main-chapter-code/previous_chapters.py
  class GPTDatasetV1 (line 21) | class GPTDatasetV1(Dataset):
    method __init__ (line 22) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 36) | def __len__(self):
    method __getitem__ (line 39) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 43) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 62) | class MultiHeadAttention(nn.Module):
    method __init__ (line 63) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 78) | def forward(self, x):
  class LayerNorm (line 122) | class LayerNorm(nn.Module):
    method __init__ (line 123) | def __init__(self, emb_dim):
    method forward (line 129) | def forward(self, x):
  class GELU (line 136) | class GELU(nn.Module):
    method __init__ (line 137) | def __init__(self):
    method forward (line 140) | def forward(self, x):
  class FeedForward (line 147) | class FeedForward(nn.Module):
    method __init__ (line 148) | def __init__(self, cfg):
    method forward (line 156) | def forward(self, x):
  class TransformerBlock (line 160) | class TransformerBlock(nn.Module):
    method __init__ (line 161) | def __init__(self, cfg):
    method forward (line 175) | def forward(self, x):
  class GPTModel (line 193) | class GPTModel(nn.Module):
    method __init__ (line 194) | def __init__(self, cfg):
    method forward (line 206) | def forward(self, in_idx):
  function generate_text_simple (line 218) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function calc_loss_batch (line 249) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 256) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 273) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function generate_and_print_sample (line 282) | def generate_and_print_sample(model, tokenizer, device, start_context):
  function plot_losses (line 295) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  function text_to_token_ids (line 314) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 320) | def token_ids_to_text(token_ids, tokenizer):

FILE: appendix-E/01_main-chapter-code/gpt_download.py
  function download_and_load_gpt2 (line 17) | def download_and_load_gpt2(model_size, models_dir):
  function download_file (line 49) | def download_file(url, destination, backup_url=None):
  function load_gpt2_params_from_tf_ckpt (line 131) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):

FILE: appendix-E/01_main-chapter-code/previous_chapters.py
  class GPTDatasetV1 (line 29) | class GPTDatasetV1(Dataset):
    method __init__ (line 30) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 44) | def __len__(self):
    method __getitem__ (line 47) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 51) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 69) | class MultiHeadAttention(nn.Module):
    method __init__ (line 70) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 85) | def forward(self, x):
  class LayerNorm (line 128) | class LayerNorm(nn.Module):
    method __init__ (line 129) | def __init__(self, emb_dim):
    method forward (line 135) | def forward(self, x):
  class GELU (line 142) | class GELU(nn.Module):
    method __init__ (line 143) | def __init__(self):
    method forward (line 146) | def forward(self, x):
  class FeedForward (line 153) | class FeedForward(nn.Module):
    method __init__ (line 154) | def __init__(self, cfg):
    method forward (line 162) | def forward(self, x):
  class TransformerBlock (line 166) | class TransformerBlock(nn.Module):
    method __init__ (line 167) | def __init__(self, cfg):
    method forward (line 181) | def forward(self, x):
  class GPTModel (line 199) | class GPTModel(nn.Module):
    method __init__ (line 200) | def __init__(self, cfg):
    method forward (line 212) | def forward(self, in_idx):
  function generate_text_simple (line 224) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function assign (line 253) | def assign(left, right):
  function load_weights_into_gpt (line 259) | def load_weights_into_gpt(gpt, params):
  function text_to_token_ids (line 320) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 326) | def token_ids_to_text(token_ids, tokenizer):
  function calc_loss_loader (line 331) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 350) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function download_and_unzip_spam_data (line 364) | def download_and_unzip_spam_data(url, zip_path, extracted_path, data_fil...
  function create_balanced_dataset (line 387) | def create_balanced_dataset(df):
  function random_split (line 401) | def random_split(df, train_frac, validation_frac):
  class SpamDataset (line 417) | class SpamDataset(Dataset):
    method __init__ (line 418) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=...
    method __getitem__ (line 442) | def __getitem__(self, index):
    method __len__ (line 447) | def __len__(self):
    method _longest_encoded_length (line 450) | def _longest_encoded_length(self):
  function calc_accuracy_loader (line 463) | def calc_accuracy_loader(data_loader, model, device, num_batches=None):
  function calc_loss_batch (line 484) | def calc_loss_batch(input_batch, target_batch, model, device):
  function train_classifier_simple (line 492) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ...
  function plot_values (line 530) | def plot_values(epochs_seen, examples_seen, train_values, val_values, la...

FILE: ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py
  function bytes_to_unicode (line 37) | def bytes_to_unicode():
  function get_pairs (line 59) | def get_pairs(word):
  class Encoder (line 72) | class Encoder:
    method __init__ (line 73) | def __init__(self, encoder, bpe_merges, errors="replace"):
    method bpe (line 85) | def bpe(self, token):
    method encode (line 126) | def encode(self, text):
    method decode (line 133) | def decode(self, tokens):
  function get_encoder (line 139) | def get_encoder(model_name, models_dir):
  function download_vocab (line 148) | def download_vocab():

FILE: ch02/05_bpe-from-scratch/tests.py
  function import_definitions_from_notebook (line 11) | def import_definitions_from_notebook(fullname, names):
  function imported_module (line 39) | def imported_module():
  function verdict_file (line 46) | def verdict_file(imported_module):
  function gpt2_files (line 64) | def gpt2_files(imported_module):
  function test_tokenizer_training (line 79) | def test_tokenizer_training(imported_module, verdict_file):
  function test_gpt2_tokenizer_openai_simple (line 108) | def test_gpt2_tokenizer_openai_simple(imported_module, gpt2_files):
  function test_gpt2_tokenizer_openai_edgecases (line 123) | def test_gpt2_tokenizer_openai_edgecases(imported_module, gpt2_files):
  function test_gpt2_newline_and_eot_ids (line 163) | def test_gpt2_newline_and_eot_ids(imported_module, gpt2_files):
  function test_no_eot_aliasing_and_disallowed_logic (line 185) | def test_no_eot_aliasing_and_disallowed_logic(imported_module, gpt2_files):
  function test_newline_roundtrip_and_equivalence (line 214) | def test_newline_roundtrip_and_equivalence(imported_module, gpt2_files, ...
  function test_space_newline_space_patterns (line 234) | def test_space_newline_space_patterns(imported_module, gpt2_files):
  function test_multiple_leading_spaces_roundtrip (line 250) | def test_multiple_leading_spaces_roundtrip(imported_module, gpt2_files):

FILE: ch03/02_bonus_efficient-multihead-attention/tests/test_mha_implementations.py
  function import_notebook_defs (line 10) | def import_notebook_defs():
  function copy_weights (line 16) | def copy_weights(from_mha, to_mha):
  function test_mha_einsum_matches_ch03 (line 34) | def test_mha_einsum_matches_ch03(d_in, d_out, batch, seq_len, num_heads,...

FILE: ch04/01_main-chapter-code/gpt.py
  class GPTDatasetV1 (line 15) | class GPTDatasetV1(Dataset):
    method __init__ (line 16) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 30) | def __len__(self):
    method __getitem__ (line 33) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 37) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 55) | class MultiHeadAttention(nn.Module):
    method __init__ (line 56) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 71) | def forward(self, x):
  class LayerNorm (line 114) | class LayerNorm(nn.Module):
    method __init__ (line 115) | def __init__(self, emb_dim):
    method forward (line 121) | def forward(self, x):
  class GELU (line 128) | class GELU(nn.Module):
    method __init__ (line 129) | def __init__(self):
    method forward (line 132) | def forward(self, x):
  class FeedForward (line 139) | class FeedForward(nn.Module):
    method __init__ (line 140) | def __init__(self, cfg):
    method forward (line 148) | def forward(self, x):
  class TransformerBlock (line 152) | class TransformerBlock(nn.Module):
    method __init__ (line 153) | def __init__(self, cfg):
    method forward (line 167) | def forward(self, x):
  class GPTModel (line 185) | class GPTModel(nn.Module):
    method __init__ (line 186) | def __init__(self, cfg):
    method forward (line 198) | def forward(self, in_idx):
  function generate_text_simple (line 210) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function main (line 236) | def main():

FILE: ch04/01_main-chapter-code/previous_chapters.py
  class GPTDatasetV1 (line 12) | class GPTDatasetV1(Dataset):
    method __init__ (line 13) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 27) | def __len__(self):
    method __getitem__ (line 30) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 34) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 49) | class MultiHeadAttention(nn.Module):
    method __init__ (line 50) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 65) | def forward(self, x):

FILE: ch04/01_main-chapter-code/tests.py
  function test_main (line 31) | def test_main(capsys):

FILE: ch04/03_kv-cache/gpt_ch04.py
  class MultiHeadAttention (line 14) | class MultiHeadAttention(nn.Module):
    method __init__ (line 15) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 34) | def forward(self, x):
  class LayerNorm (line 77) | class LayerNorm(nn.Module):
    method __init__ (line 78) | def __init__(self, emb_dim):
    method forward (line 84) | def forward(self, x):
  class GELU (line 91) | class GELU(nn.Module):
    method __init__ (line 92) | def __init__(self):
    method forward (line 95) | def forward(self, x):
  class FeedForward (line 102) | class FeedForward(nn.Module):
    method __init__ (line 103) | def __init__(self, cfg):
    method forward (line 111) | def forward(self, x):
  class TransformerBlock (line 115) | class TransformerBlock(nn.Module):
    method __init__ (line 116) | def __init__(self, cfg):
    method forward (line 130) | def forward(self, x):
  class GPTModel (line 148) | class GPTModel(nn.Module):
    method __init__ (line 149) | def __init__(self, cfg):
    method forward (line 161) | def forward(self, in_idx):
  function generate_text_simple (line 173) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function main (line 200) | def main():

FILE: ch04/03_kv-cache/gpt_with_kv_cache.py
  class MultiHeadAttention (line 14) | class MultiHeadAttention(nn.Module):
    method __init__ (line 15) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 41) | def forward(self, x, use_cache=False):
    method reset_cache (line 106) | def reset_cache(self):
  class LayerNorm (line 115) | class LayerNorm(nn.Module):
    method __init__ (line 116) | def __init__(self, emb_dim):
    method forward (line 122) | def forward(self, x):
  class GELU (line 129) | class GELU(nn.Module):
    method __init__ (line 130) | def __init__(self):
    method forward (line 133) | def forward(self, x):
  class FeedForward (line 140) | class FeedForward(nn.Module):
    method __init__ (line 141) | def __init__(self, cfg):
    method forward (line 149) | def forward(self, x):
  class TransformerBlock (line 153) | class TransformerBlock(nn.Module):
    method __init__ (line 154) | def __init__(self, cfg):
    method forward (line 168) | def forward(self, x, use_cache=False):
  class GPTModel (line 192) | class GPTModel(nn.Module):
    method __init__ (line 193) | def __init__(self, cfg):
    method forward (line 212) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 245) | def reset_kv_cache(self):
  function generate_text_simple (line 252) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function generate_text_simple_cached (line 280) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 308) | def main():

FILE: ch04/03_kv-cache/gpt_with_kv_cache_optimized.py
  class MultiHeadAttention (line 14) | class MultiHeadAttention(nn.Module):
    method __init__ (line 15) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 37) | def forward(self, x, use_cache=False):
    method reset_cache (line 124) | def reset_cache(self):
  class LayerNorm (line 132) | class LayerNorm(nn.Module):
    method __init__ (line 133) | def __init__(self, emb_dim):
    method forward (line 139) | def forward(self, x):
  class GELU (line 146) | class GELU(nn.Module):
    method __init__ (line 147) | def __init__(self):
    method forward (line 150) | def forward(self, x):
  class FeedForward (line 157) | class FeedForward(nn.Module):
    method __init__ (line 158) | def __init__(self, cfg):
    method forward (line 166) | def forward(self, x):
  class TransformerBlock (line 170) | class TransformerBlock(nn.Module):
    method __init__ (line 171) | def __init__(self, cfg):
    method forward (line 187) | def forward(self, x, use_cache=False):
  class GPTModel (line 211) | class GPTModel(nn.Module):
    method __init__ (line 212) | def __init__(self, cfg):
    method forward (line 232) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 271) | def reset_kv_cache(self):
  function generate_text_simple (line 278) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function generate_text_simple_cached (line 306) | def generate_text_simple_cached(model, idx, max_new_tokens, context_size...
  function main (line 343) | def main():

FILE: ch04/03_kv-cache/tests.py
  function test_gpt_model_equivalence_not_cached (line 32) | def test_gpt_model_equivalence_not_cached(ModelClass):
  function test_gpt_model_equivalence_cached (line 66) | def test_gpt_model_equivalence_cached(ModelClass):
  function test_context_overflow_bug (line 113) | def test_context_overflow_bug():
  function test_prefill_chunking_basic (line 150) | def test_prefill_chunking_basic():

FILE: ch04/04_gqa/gpt_with_kv_gqa.py
  class GroupedQueryAttention (line 20) | class GroupedQueryAttention(nn.Module):
    method __init__ (line 21) | def __init__(
    method forward (line 45) | def forward(self, x, use_cache=False):
    method reset_cache (line 121) | def reset_cache(self):
  class LayerNorm (line 129) | class LayerNorm(nn.Module):
    method __init__ (line 130) | def __init__(self, emb_dim):
    method forward (line 136) | def forward(self, x):
  class GELU (line 143) | class GELU(nn.Module):
    method __init__ (line 144) | def __init__(self):
    method forward (line 147) | def forward(self, x):
  class FeedForward (line 154) | class FeedForward(nn.Module):
    method __init__ (line 155) | def __init__(self, cfg):
    method forward (line 163) | def forward(self, x):
  class TransformerBlock (line 167) | class TransformerBlock(nn.Module):
    method __init__ (line 168) | def __init__(self, cfg):
    method forward (line 182) | def forward(self, x, use_cache=False):
  class GPTModel (line 206) | class GPTModel(nn.Module):
    method __init__ (line 207) | def __init__(self, cfg):
    method forward (line 226) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 258) | def reset_kv_cache(self):
  function generate_text_simple_cached (line 265) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 292) | def main():

FILE: ch04/04_gqa/gpt_with_kv_mha.py
  class MultiHeadAttention (line 20) | class MultiHeadAttention(nn.Module):
    method __init__ (line 21) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False):
    method forward (line 42) | def forward(self, x, use_cache=False):
    method reset_cache (line 110) | def reset_cache(self):
  class LayerNorm (line 118) | class LayerNorm(nn.Module):
    method __init__ (line 119) | def __init__(self, emb_dim):
    method forward (line 125) | def forward(self, x):
  class GELU (line 132) | class GELU(nn.Module):
    method __init__ (line 133) | def __init__(self):
    method forward (line 136) | def forward(self, x):
  class FeedForward (line 143) | class FeedForward(nn.Module):
    method __init__ (line 144) | def __init__(self, cfg):
    method forward (line 152) | def forward(self, x):
  class TransformerBlock (line 156) | class TransformerBlock(nn.Module):
    method __init__ (line 157) | def __init__(self, cfg):
    method forward (line 170) | def forward(self, x, use_cache=False):
  class GPTModel (line 194) | class GPTModel(nn.Module):
    method __init__ (line 195) | def __init__(self, cfg):
    method forward (line 214) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 246) | def reset_kv_cache(self):
  function generate_text_simple_cached (line 253) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 280) | def main():

FILE: ch04/04_gqa/memory_estimator_gqa.py
  function convert_bytes (line 21) | def convert_bytes(n):
  function calc_kv_bytes_total (line 26) | def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads,
  function main (line 33) | def main():

FILE: ch04/04_gqa/plot_memory_estimates_gqa.py
  function bytes_convert (line 14) | def bytes_convert(n):
  function savings_percent (line 19) | def savings_percent(total_mha, total_gqa):
  function plot_abs_kv_vs_context_multi_groups (line 23) | def plot_abs_kv_vs_context_multi_groups():

FILE: ch04/05_mla/gpt_with_kv_mha.py
  class MultiHeadAttention (line 20) | class MultiHeadAttention(nn.Module):
    method __init__ (line 21) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False):
    method forward (line 42) | def forward(self, x, use_cache=False):
    method reset_cache (line 110) | def reset_cache(self):
  class LayerNorm (line 118) | class LayerNorm(nn.Module):
    method __init__ (line 119) | def __init__(self, emb_dim):
    method forward (line 125) | def forward(self, x):
  class GELU (line 132) | class GELU(nn.Module):
    method __init__ (line 133) | def __init__(self):
    method forward (line 136) | def forward(self, x):
  class FeedForward (line 143) | class FeedForward(nn.Module):
    method __init__ (line 144) | def __init__(self, cfg):
    method forward (line 152) | def forward(self, x):
  class TransformerBlock (line 156) | class TransformerBlock(nn.Module):
    method __init__ (line 157) | def __init__(self, cfg):
    method forward (line 170) | def forward(self, x, use_cache=False):
  class GPTModel (line 194) | class GPTModel(nn.Module):
    method __init__ (line 195) | def __init__(self, cfg):
    method forward (line 214) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 246) | def reset_kv_cache(self):
  function generate_text_simple_cached (line 253) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 280) | def main():

FILE: ch04/05_mla/gpt_with_kv_mla.py
  class MultiHeadLatentAttention (line 24) | class MultiHeadLatentAttention(nn.Module):
    method __init__ (line 25) | def __init__(self, d_in, d_out, dropout, num_heads,
    method reset_cache (line 50) | def reset_cache(self):
    method _reshape_to_heads (line 55) | def _reshape_to_heads(x, num_heads, head_dim):
    method forward (line 60) | def forward(self, x, use_cache=False):
  class LayerNorm (line 124) | class LayerNorm(nn.Module):
    method __init__ (line 125) | def __init__(self, emb_dim):
    method forward (line 131) | def forward(self, x):
  class GELU (line 138) | class GELU(nn.Module):
    method __init__ (line 139) | def __init__(self):
    method forward (line 142) | def forward(self, x):
  class FeedForward (line 149) | class FeedForward(nn.Module):
    method __init__ (line 150) | def __init__(self, cfg):
    method forward (line 158) | def forward(self, x):
  class TransformerBlock (line 162) | class TransformerBlock(nn.Module):
    method __init__ (line 163) | def __init__(self, cfg):
    method forward (line 178) | def forward(self, x, use_cache=False):
  class GPTModel (line 202) | class GPTModel(nn.Module):
    method __init__ (line 203) | def __init__(self, cfg):
    method forward (line 222) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 254) | def reset_kv_cache(self):
  function generate_text_simple_cached (line 261) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 288) | def main():

FILE: ch04/05_mla/memory_estimator_mla.py
  function convert_bytes (line 20) | def convert_bytes(n):
  function calc_kv_bytes_total (line 25) | def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads,
  function calc_mla_bytes_total (line 33) | def calc_mla_bytes_total(batch, context_length, n_layers, latent_dim, by...
  function main (line 39) | def main():

FILE: ch04/05_mla/plot_memory_estimates_mla.py
  function convert_bytes_to_gb (line 18) | def convert_bytes_to_gb(n_bytes):
  function calc_kv_bytes_total_mha (line 22) | def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_heads,
  function calc_kv_bytes_total_mla (line 29) | def calc_kv_bytes_total_mla(batch, context_length, n_layers, latent_dim,...
  function plot_abs_kv_vs_context_multiple (line 33) | def plot_abs_kv_vs_context_multiple():

FILE: ch04/06_swa/gpt_with_kv_mha.py
  class MultiHeadAttention (line 20) | class MultiHeadAttention(nn.Module):
    method __init__ (line 21) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False):
    method forward (line 42) | def forward(self, x, use_cache=False):
    method reset_cache (line 110) | def reset_cache(self):
  class LayerNorm (line 118) | class LayerNorm(nn.Module):
    method __init__ (line 119) | def __init__(self, emb_dim):
    method forward (line 125) | def forward(self, x):
  class GELU (line 132) | class GELU(nn.Module):
    method __init__ (line 133) | def __init__(self):
    method forward (line 136) | def forward(self, x):
  class FeedForward (line 143) | class FeedForward(nn.Module):
    method __init__ (line 144) | def __init__(self, cfg):
    method forward (line 152) | def forward(self, x):
  class TransformerBlock (line 156) | class TransformerBlock(nn.Module):
    method __init__ (line 157) | def __init__(self, cfg):
    method forward (line 170) | def forward(self, x, use_cache=False):
  class GPTModel (line 194) | class GPTModel(nn.Module):
    method __init__ (line 195) | def __init__(self, cfg):
    method forward (line 214) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 246) | def reset_kv_cache(self):
  function generate_text_simple_cached (line 253) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 280) | def main():

FILE: ch04/06_swa/gpt_with_kv_swa.py
  class MultiHeadAttentionWithSWA (line 20) | class MultiHeadAttentionWithSWA(nn.Module):
    method __init__ (line 21) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False, sl...
    method forward (line 43) | def forward(self, x, use_cache=False):
    method reset_cache (line 127) | def reset_cache(self):
  class LayerNorm (line 135) | class LayerNorm(nn.Module):
    method __init__ (line 136) | def __init__(self, emb_dim):
    method forward (line 142) | def forward(self, x):
  class GELU (line 149) | class GELU(nn.Module):
    method __init__ (line 150) | def __init__(self):
    method forward (line 153) | def forward(self, x):
  class FeedForward (line 160) | class FeedForward(nn.Module):
    method __init__ (line 161) | def __init__(self, cfg):
    method forward (line 169) | def forward(self, x):
  class TransformerBlock (line 173) | class TransformerBlock(nn.Module):
    method __init__ (line 174) | def __init__(self, cfg):
    method forward (line 189) | def forward(self, x, use_cache=False):
  class GPTModel (line 213) | class GPTModel(nn.Module):
    method __init__ (line 214) | def __init__(self, cfg):
    method forward (line 247) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 279) | def reset_kv_cache(self):
  function generate_text_simple_cached (line 286) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 313) | def main():

FILE: ch04/06_swa/memory_estimator_swa.py
  function convert_bytes (line 20) | def convert_bytes(n):
  function calc_kv_bytes_per_layer (line 25) | def calc_kv_bytes_per_layer(batch, context_length, head_dim, n_kv_heads,...
  function parse_ratio (line 30) | def parse_ratio(ratio_str):
  function distribute_layers (line 41) | def distribute_layers(n_layers, a, b):
  function estimate_totals (line 50) | def estimate_totals(context_length, sliding_window_size, emb_dim, n_head...
  function main (line 92) | def main():

FILE: ch04/06_swa/plot_memory_estimates_swa.py
  function convert_bytes_to_gb (line 27) | def convert_bytes_to_gb(n_bytes):
  function parse_ratio (line 31) | def parse_ratio(ratio_str):
  function calc_kv_bytes_total_mha (line 42) | def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, by...
  function calc_kv_bytes_total_gqa (line 48) | def calc_kv_bytes_total_gqa(
  function calc_kv_bytes_total_mha_swa (line 57) | def calc_kv_bytes_total_mha_swa(
  function calc_kv_bytes_total_gqa_swa (line 75) | def calc_kv_bytes_total_gqa_swa(
  function main (line 104) | def main():

FILE: ch04/07_moe/gpt_with_kv_ffn.py
  class MultiHeadAttention (line 23) | class MultiHeadAttention(nn.Module):
    method __init__ (line 24) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False):
    method forward (line 45) | def forward(self, x, use_cache=False):
    method reset_cache (line 113) | def reset_cache(self):
  class LayerNorm (line 121) | class LayerNorm(nn.Module):
    method __init__ (line 122) | def __init__(self, emb_dim):
    method forward (line 128) | def forward(self, x):
  class GELU (line 135) | class GELU(nn.Module):
    method __init__ (line 136) | def __init__(self):
    method forward (line 139) | def forward(self, x):
  class FeedForward (line 159) | class FeedForward(nn.Module):
    method __init__ (line 160) | def __init__(self, cfg):
    method forward (line 166) | def forward(self, x):
  class TransformerBlock (line 170) | class TransformerBlock(nn.Module):
    method __init__ (line 171) | def __init__(self, cfg):
    method forward (line 185) | def forward(self, x, use_cache=False):
  class GPTModel (line 220) | class GPTModel(nn.Module):
    method __init__ (line 221) | def __init__(self, cfg):
    method forward (line 240) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 272) | def reset_kv_cache(self):
  function generate_text_simple_cached (line 279) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 343) | def main():

FILE: ch04/07_moe/gpt_with_kv_moe.py
  class MultiHeadAttention (line 23) | class MultiHeadAttention(nn.Module):
    method __init__ (line 24) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False):
    method forward (line 45) | def forward(self, x, use_cache=False):
    method reset_cache (line 113) | def reset_cache(self):
  class LayerNorm (line 121) | class LayerNorm(nn.Module):
    method __init__ (line 122) | def __init__(self, emb_dim):
    method forward (line 128) | def forward(self, x):
  class GELU (line 135) | class GELU(nn.Module):
    method __init__ (line 136) | def __init__(self):
    method forward (line 139) | def forward(self, x):
  class FeedForward (line 146) | class FeedForward(nn.Module):
    method __init__ (line 147) | def __init__(self, cfg):
    method forward (line 155) | def forward(self, x):
  class MoEFeedForward (line 159) | class MoEFeedForward(nn.Module):
    method __init__ (line 160) | def __init__(self, cfg):
    method forward (line 186) | def forward(self, x):
  class TransformerBlock (line 230) | class TransformerBlock(nn.Module):
    method __init__ (line 231) | def __init__(self, cfg):
    method forward (line 245) | def forward(self, x, use_cache=False):
  class GPTModel (line 280) | class GPTModel(nn.Module):
    method __init__ (line 281) | def __init__(self, cfg):
    method forward (line 300) | def forward(self, in_idx, use_cache=False):
    method reset_kv_cache (line 332) | def reset_kv_cache(self):
  function generate_text_simple_cached (line 339) | def generate_text_simple_cached(model, idx, max_new_tokens,
  function main (line 403) | def main():

FILE: ch04/07_moe/memory_estimator_moe.py
  function convert_bytes (line 17) | def convert_bytes(n):
  function get_num_param_matrices (line 22) | def get_num_param_matrices(ffn_type):
  function calc_ffn_params (line 31) | def calc_ffn_params(emb_dim, hidden_dim, ffn_type):
  function calc_router_params (line 35) | def calc_router_params(emb_dim, num_experts):
  function estimate_params_and_hidden (line 39) | def estimate_params_and_hidden(
  function main (line 67) | def main():

FILE: ch04/07_moe/plot_memory_estimates_moe.py
  function calc_moe_active_and_total (line 16) | def calc_moe_active_and_total(
  function plot_active_params_vs_experts (line 42) | def plot_active_params_vs_experts(
  function main (line 93) | def main():

FILE: ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py
  function calc_kv_bytes_total_mha (line 20) | def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, by...
  function calc_kv_bytes_total_deltanet_no_conv (line 27) | def calc_kv_bytes_total_deltanet_no_conv(batch, emb_dim, n_layers, bytes...
  function convert_to_gb (line 34) | def convert_to_gb(x):
  function main (line 38) | def main():

FILE: ch05/01_main-chapter-code/gpt_download.py
  function download_and_load_gpt2 (line 16) | def download_and_load_gpt2(model_size, models_dir):
  function download_file (line 48) | def download_file(url, destination, backup_url=None):
  function load_gpt2_params_from_tf_ckpt (line 126) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):

FILE: ch05/01_main-chapter-code/gpt_generate.py
  function text_to_token_ids (line 21) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 27) | def token_ids_to_text(token_ids, tokenizer):
  function download_and_load_gpt2 (line 32) | def download_and_load_gpt2(model_size, models_dir):
  function download_file (line 62) | def download_file(url, destination):
  function load_gpt2_params_from_tf_ckpt (line 91) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
  function assign (line 120) | def assign(left, right):
  function load_weights_into_gpt (line 126) | def load_weights_into_gpt(gpt, params):
  function generate (line 187) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ...
  function main (line 230) | def main(gpt_config, input_prompt, model_size, device):

FILE: ch05/01_main-chapter-code/gpt_train.py
  function text_to_token_ids (line 17) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 23) | def token_ids_to_text(token_ids, tokenizer):
  function calc_loss_batch (line 28) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 35) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 52) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function generate_and_print_sample (line 61) | def generate_and_print_sample(model, tokenizer, device, start_context):
  function train_model_simple (line 75) | def train_model_simple(model, train_loader, val_loader, optimizer, devic...
  function plot_losses (line 112) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  function main (line 131) | def main(gpt_config, settings):

FILE: ch05/01_main-chapter-code/previous_chapters.py
  class GPTDatasetV1 (line 20) | class GPTDatasetV1(Dataset):
    method __init__ (line 21) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 35) | def __len__(self):
    method __getitem__ (line 38) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 42) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 60) | class MultiHeadAttention(nn.Module):
    method __init__ (line 61) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 76) | def forward(self, x):
  class LayerNorm (line 119) | class LayerNorm(nn.Module):
    method __init__ (line 120) | def __init__(self, emb_dim):
    method forward (line 126) | def forward(self, x):
  class GELU (line 133) | class GELU(nn.Module):
    method __init__ (line 134) | def __init__(self):
    method forward (line 137) | def forward(self, x):
  class FeedForward (line 144) | class FeedForward(nn.Module):
    method __init__ (line 145) | def __init__(self, cfg):
    method forward (line 153) | def forward(self, x):
  class TransformerBlock (line 157) | class TransformerBlock(nn.Module):
    method __init__ (line 158) | def __init__(self, cfg):
    method forward (line 172) | def forward(self, x):
  class GPTModel (line 190) | class GPTModel(nn.Module):
    method __init__ (line 191) | def __init__(self, cfg):
    method forward (line 203) | def forward(self, in_idx):
  function generate_text_simple (line 215) | def generate_text_simple(model, idx, max_new_tokens, context_size):

FILE: ch05/01_main-chapter-code/tests.py
  function gpt_config (line 13) | def gpt_config():
  function other_settings (line 26) | def other_settings():
  function test_main (line 35) | def test_main(gpt_config, other_settings):
  function check_file_size (line 43) | def check_file_size(url, expected_size):
  function test_model_files (line 63) | def test_model_files():

FILE: ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
  function is_english (line 17) | def is_english(text, threshold=0.9):
  function combine_files (line 22) | def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|...

FILE: ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py
  function read_text_file (line 28) | def read_text_file(file_path):
  function create_dataloaders (line 34) | def create_dataloaders(text_data, train_ratio, batch_size, max_length, s...
  function convert_time (line 57) | def convert_time(seconds):
  function print_eta (line 63) | def print_eta(start_time, book_start_time, index, total_files):
  function train_model_simple (line 80) | def train_model_simple(model, optimizer, device, n_epochs,

FILE: ch05/03_bonus_pretraining_on_gutenberg/tests.py
  function test_pretraining (line 13) | def test_pretraining():

FILE: ch05/05_bonus_hparam_tuning/hparam_search.py
  function calc_loss_loader (line 31) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function calc_loss_batch (line 48) | def calc_loss_batch(input_batch, target_batch, model, device):
  function evaluate_model (line 57) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function train_model (line 66) | def train_model(model, train_loader, val_loader, optimizer, device,

FILE: ch05/06_user_interface/app_orig.py
  function get_model_and_tokenizer (line 24) | def get_model_and_tokenizer():
  function main (line 68) | async def main(message: chainlit.Message):

FILE: ch05/06_user_interface/app_own.py
  function get_model_and_tokenizer (line 26) | def get_model_and_tokenizer():
  function main (line 62) | async def main(message: chainlit.Message):

FILE: ch05/07_gpt_to_llama/previous_chapters.py
  function text_to_token_ids (line 16) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 22) | def token_ids_to_text(token_ids, tokenizer):
  function generate (line 27) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ...

FILE: ch05/07_gpt_to_llama/tests/test_llama32_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function test_dummy_llama3_forward (line 54) | def test_dummy_llama3_forward(dummy_cfg_base, dummy_input, import_notebo...
  function test_llama3_base_equivalence_with_transformers (line 63) | def test_llama3_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
  function litgpt_build_rope_cache (line 27) | def litgpt_build_rope_cache(
  function litgpt_apply_rope (line 79) | def litgpt_apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Ten...
  function notebook (line 96) | def notebook():
  function set_seed (line 139) | def set_seed():
  function test_rope_llama2 (line 143) | def test_rope_llama2(notebook):
  function test_rope_llama3 (line 207) | def test_rope_llama3(notebook):
  function test_rope_llama3_12 (line 277) | def test_rope_llama3_12(notebook):
  function test_silu (line 371) | def test_silu(notebook):
  function test_rmsnorm (line 378) | def test_rmsnorm(notebook):

FILE: ch05/08_memory_efficient_weight_loading/previous_chapters.py
  class MultiHeadAttention (line 18) | class MultiHeadAttention(nn.Module):
    method __init__ (line 19) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 34) | def forward(self, x):
  class LayerNorm (line 77) | class LayerNorm(nn.Module):
    method __init__ (line 78) | def __init__(self, emb_dim):
    method forward (line 84) | def forward(self, x):
  class GELU (line 91) | class GELU(nn.Module):
    method __init__ (line 92) | def __init__(self):
    method forward (line 95) | def forward(self, x):
  class FeedForward (line 102) | class FeedForward(nn.Module):
    method __init__ (line 103) | def __init__(self, cfg):
    method forward (line 111) | def forward(self, x):
  class TransformerBlock (line 115) | class TransformerBlock(nn.Module):
    method __init__ (line 116) | def __init__(self, cfg):
    method forward (line 130) | def forward(self, x):
  class GPTModel (line 148) | class GPTModel(nn.Module):
    method __init__ (line 149) | def __init__(self, cfg):
    method forward (line 161) | def forward(self, in_idx):

FILE: ch05/10_llm-training-speed/00_orig.py
  class GPTDatasetV1 (line 22) | class GPTDatasetV1(Dataset):
    method __init__ (line 23) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 37) | def __len__(self):
    method __getitem__ (line 40) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 44) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 62) | class MultiHeadAttention(nn.Module):
    method __init__ (line 63) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 78) | def forward(self, x):
  class LayerNorm (line 121) | class LayerNorm(nn.Module):
    method __init__ (line 122) | def __init__(self, emb_dim):
    method forward (line 128) | def forward(self, x):
  class GELU (line 135) | class GELU(nn.Module):
    method __init__ (line 136) | def __init__(self):
    method forward (line 139) | def forward(self, x):
  class FeedForward (line 146) | class FeedForward(nn.Module):
    method __init__ (line 147) | def __init__(self, cfg):
    method forward (line 155) | def forward(self, x):
  class TransformerBlock (line 159) | class TransformerBlock(nn.Module):
    method __init__ (line 160) | def __init__(self, cfg):
    method forward (line 174) | def forward(self, x):
  class GPTModel (line 192) | class GPTModel(nn.Module):
    method __init__ (line 193) | def __init__(self, cfg):
    method forward (line 205) | def forward(self, in_idx):
  function generate_text_simple (line 217) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function text_to_token_ids (line 247) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 253) | def token_ids_to_text(token_ids, tokenizer):
  function calc_loss_batch (line 258) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 265) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 282) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function generate_and_print_sample (line 291) | def generate_and_print_sample(model, tokenizer, device, start_context):
  function train_model_simple_with_timing (line 305) | def train_model_simple_with_timing(model, train_loader, val_loader, opti...
  function plot_losses (line 387) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  function main (line 410) | def main(gpt_config, settings):

FILE: ch05/10_llm-training-speed/01_opt_single_gpu.py
  class GPTDatasetV1 (line 22) | class GPTDatasetV1(Dataset):
    method __init__ (line 23) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 37) | def __len__(self):
    method __getitem__ (line 40) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 44) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class PyTorchMultiHeadAttention (line 64) | class PyTorchMultiHeadAttention(nn.Module):
    method __init__ (line 65) | def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False):
    method forward (line 78) | def forward(self, x):
  class FeedForward (line 111) | class FeedForward(nn.Module):
    method __init__ (line 112) | def __init__(self, cfg):
    method forward (line 120) | def forward(self, x):
  class TransformerBlock (line 124) | class TransformerBlock(nn.Module):
    method __init__ (line 125) | def __init__(self, cfg):
    method forward (line 138) | def forward(self, x):
  class GPTModel (line 156) | class GPTModel(nn.Module):
    method __init__ (line 157) | def __init__(self, cfg):
    method forward (line 169) | def forward(self, in_idx):
  function generate_text_simple (line 181) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function text_to_token_ids (line 211) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 217) | def token_ids_to_text(token_ids, tokenizer):
  function calc_loss_batch (line 222) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 229) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 246) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function generate_and_print_sample (line 255) | def generate_and_print_sample(model, tokenizer, device, start_context):
  function train_model_simple_with_timing (line 269) | def train_model_simple_with_timing(model, train_loader, val_loader, opti...
  function plot_losses (line 351) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  function main (line 374) | def main(gpt_config, settings):

FILE: ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py
  function ddp_setup (line 27) | def ddp_setup(rank, world_size):
  class GPTDatasetV1 (line 58) | class GPTDatasetV1(Dataset):
    method __init__ (line 59) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 73) | def __len__(self):
    method __getitem__ (line 76) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 82) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class PyTorchMultiHeadAttention (line 107) | class PyTorchMultiHeadAttention(nn.Module):
    method __init__ (line 108) | def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False):
    method forward (line 121) | def forward(self, x):
  class FeedForward (line 154) | class FeedForward(nn.Module):
    method __init__ (line 155) | def __init__(self, cfg):
    method forward (line 163) | def forward(self, x):
  class TransformerBlock (line 167) | class TransformerBlock(nn.Module):
    method __init__ (line 168) | def __init__(self, cfg):
    method forward (line 181) | def forward(self, x):
  class GPTModel (line 199) | class GPTModel(nn.Module):
    method __init__ (line 200) | def __init__(self, cfg):
    method forward (line 212) | def forward(self, in_idx):
  function generate_text_simple (line 224) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function text_to_token_ids (line 254) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 260) | def token_ids_to_text(token_ids, tokenizer):
  function calc_loss_batch (line 265) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 272) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 289) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function generate_and_print_sample (line 298) | def generate_and_print_sample(model, device, start_context):
  function train_model_simple_with_timing (line 314) | def train_model_simple_with_timing(model, train_loader, val_loader, opti...
  function plot_losses (line 416) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  function main (line 440) | def main(gpt_config, settings, rank, world_size):

FILE: ch05/11_qwen3/qwen3-chat-interface/qwen3-chat-interface-multiturn.py
  function get_qwen_config (line 33) | def get_qwen_config(name):
  function build_repo_and_local (line 53) | def build_repo_and_local(model_name, reasoning, local_dir_arg):
  function get_device (line 60) | def get_device(name):
  function get_model_and_tokenizer (line 76) | def get_model_and_tokenizer(qwen3_config, repo_id, local_dir, device, us...
  function build_prompt_from_history (line 99) | def build_prompt_from_history(history, add_assistant_header=True):
  function on_start (line 125) | async def on_start():
  function main (line 133) | async def main(message: chainlit.Message):

FILE: ch05/11_qwen3/qwen3-chat-interface/qwen3-chat-interface.py
  function get_qwen_config (line 32) | def get_qwen_config(name):
  function build_repo_and_local (line 52) | def build_repo_and_local(model_name, reasoning, local_dir_arg):
  function get_device (line 59) | def get_device(name):
  function get_model_and_tokenizer (line 75) | def get_model_and_tokenizer(qwen3_config, repo_id, local_dir, device, us...
  function on_start (line 105) | async def on_start():
  function main (line 113) | async def main(message: chainlit.Message):

FILE: ch05/11_qwen3/tests/test_qwen3_kvcache_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function dummy_cfg_moe (line 50) | def dummy_cfg_moe(dummy_cfg_base):
  function test_dummy_qwen3_forward (line 61) | def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, import_noteboo...
  function test_qwen3_base_equivalence_with_transformers (line 71) | def test_qwen3_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/11_qwen3/tests/test_qwen3_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function dummy_cfg_moe (line 50) | def dummy_cfg_moe(dummy_cfg_base):
  function test_dummy_qwen3_forward (line 61) | def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, import_noteboo...
  function test_qwen3_base_equivalence_with_transformers (line 71) | def test_qwen3_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/12_gemma3/tests/test_gemma3_kv_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function test_dummy_gemma3_forward (line 53) | def test_dummy_gemma3_forward(dummy_cfg_base, dummy_input, import_notebo...
  function test_gemma3_base_equivalence_with_transformers (line 62) | def test_gemma3_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/12_gemma3/tests/test_gemma3_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function test_dummy_gemma3_forward (line 53) | def test_dummy_gemma3_forward(dummy_cfg_base, dummy_input, import_notebo...
  function test_gemma3_base_equivalence_with_transformers (line 62) | def test_gemma3_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/13_olmo3/tests/olmo3_layer_debugger.py
  function tiny_debug_config (line 20) | def tiny_debug_config():
  function yarn_debug_config (line 46) | def yarn_debug_config():
  function _hf_config_from_dict (line 74) | def _hf_config_from_dict(cfg):
  function load_notebook_defs (line 114) | def load_notebook_defs(nb_name="standalone-olmo3.ipynb"):
  function build_olmo3_pair (line 119) | def build_olmo3_pair(import_notebook_defs, cfg, hf_checkpoint=None):
  function _attach_debug_hooks (line 143) | def _attach_debug_hooks(model, is_hf):
  function _layer_sort_key (line 169) | def _layer_sort_key(name):
  function layerwise_differences (line 182) | def layerwise_differences(ours, hf_model, input_ids, rtol=1e-5, atol=1e-5):
  function first_mismatch (line 244) | def first_mismatch(differences):
  function format_report (line 251) | def format_report(differences):

FILE: ch05/13_olmo3/tests/test_olmo3_kvcache_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function test_dummy_olmo3_forward (line 58) | def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, import_noteboo...
  function test_olmo3_base_equivalence_with_transformers (line 68) | def test_olmo3_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/13_olmo3/tests/test_olmo3_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function test_dummy_olmo3_forward (line 58) | def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, import_noteboo...
  function test_olmo3_base_equivalence_with_transformers (line 68) | def test_olmo3_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/15_tiny-aya/tests/test_tiny_aya_kvcache_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function test_dummy_tiny_aya_forward (line 55) | def test_dummy_tiny_aya_forward(dummy_cfg_base, dummy_input, import_note...
  function test_tiny_aya_base_equivalence_with_transformers (line 65) | def test_tiny_aya_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/15_tiny-aya/tests/test_tiny_aya_nb.py
  function import_notebook_defs (line 19) | def import_notebook_defs():
  function dummy_input (line 26) | def dummy_input():
  function dummy_cfg_base (line 32) | def dummy_cfg_base():
  function test_dummy_tiny_aya_forward (line 54) | def test_dummy_tiny_aya_forward(dummy_cfg_base, dummy_input, import_note...
  function test_tiny_aya_base_equivalence_with_transformers (line 64) | def test_tiny_aya_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch05/15_tiny-aya/tests/tiny_aya_layer_debugger.py
  function tiny_debug_config (line 19) | def tiny_debug_config():
  function _hf_config_from_dict (line 41) | def _hf_config_from_dict(cfg):
  function load_notebook_defs (line 65) | def load_notebook_defs(nb_name="standalone-tiny-aya.ipynb"):
  function build_tiny_aya_pair (line 70) | def build_tiny_aya_pair(import_notebook_defs, cfg, hf_checkpoint=None):
  function _attach_debug_hooks (line 93) | def _attach_debug_hooks(model, is_hf):
  function _layer_sort_key (line 125) | def _layer_sort_key(name):
  function layerwise_differences (line 138) | def layerwise_differences(ours, hf_model, input_ids, rtol=1e-5, atol=1e-5):
  function format_report (line 199) | def format_report(differences):

FILE: ch05/16_qwen3.5/qwen3_5_transformers.py
  class _NotebookLogger (line 25) | class _NotebookLogger:
    method __init__ (line 26) | def __init__(self):
    method warning_once (line 29) | def warning_once(self, msg):
  class Qwen3_5Config (line 40) | class Qwen3_5Config:
  class Qwen3_5DynamicCache (line 44) | class Qwen3_5DynamicCache:
  class Qwen3_5RMSNormGated (line 48) | class Qwen3_5RMSNormGated(nn.Module):
    method __init__ (line 49) | def __init__(self, hidden_size, eps=1e-6, **kwargs):
    method forward (line 54) | def forward(self, hidden_states, gate=None):
  function apply_mask_to_padding_states (line 66) | def apply_mask_to_padding_states(hidden_states, attention_mask):
  function torch_causal_conv1d_update (line 78) | def torch_causal_conv1d_update(
  function l2norm (line 96) | def l2norm(x, dim=-1, eps=1e-6):
  function torch_chunk_gated_delta_rule (line 102) | def torch_chunk_gated_delta_rule(
  function torch_recurrent_gated_delta_rule (line 182) | def torch_recurrent_gated_delta_rule(
  class Qwen3_5GatedDeltaNet (line 226) | class Qwen3_5GatedDeltaNet(nn.Module):
    method __init__ (line 227) | def __init__(self, config, layer_idx):
    method forward (line 296) | def forward(

FILE: ch05/16_qwen3.5/tests/qwen3_5_layer_debugger.py
  function _import_qwen3_5_classes (line 14) | def _import_qwen3_5_classes():
  function tiny_debug_config (line 44) | def tiny_debug_config():
  function _hf_config_from_dict (line 68) | def _hf_config_from_dict(cfg):
  function load_notebook_defs (line 105) | def load_notebook_defs(nb_name="qwen3.5.ipynb"):
  function build_qwen3_5_pair (line 112) | def build_qwen3_5_pair(import_notebook_defs, cfg, hf_checkpoint=None):
  function _attach_debug_hooks (line 140) | def _attach_debug_hooks(model, is_hf):
  function _layer_sort_key (line 174) | def _layer_sort_key(name):
  function layerwise_differences (line 187) | def layerwise_differences(ours, hf_model, input_ids, rtol=1e-5, atol=1e-5):
  function format_report (line 248) | def format_report(differences):

FILE: ch05/16_qwen3.5/tests/test_qwen3_5_nb.py
  function _import_qwen3_5_classes (line 16) | def _import_qwen3_5_classes():
  function import_notebook_defs (line 51) | def import_notebook_defs():
  function dummy_input (line 61) | def dummy_input():
  function dummy_cfg_base (line 67) | def dummy_cfg_base():
  function test_dummy_qwen3_5_forward (line 92) | def test_dummy_qwen3_5_forward(dummy_cfg_base, dummy_input, import_noteb...
  function test_qwen3_5_base_equivalence_with_transformers (line 103) | def test_qwen3_5_base_equivalence_with_transformers(import_notebook_defs):

FILE: ch06/01_main-chapter-code/gpt_class_finetune.py
  function download_and_unzip_spam_data (line 24) | def download_and_unzip_spam_data(url, zip_path, extracted_path, data_fil...
  function create_balanced_dataset (line 47) | def create_balanced_dataset(df):
  function random_split (line 60) | def random_split(df, train_frac, validation_frac):
  class SpamDataset (line 76) | class SpamDataset(Dataset):
    method __init__ (line 77) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=...
    method __getitem__ (line 101) | def __getitem__(self, index):
    method __len__ (line 109) | def __len__(self):
    method _longest_encoded_length (line 112) | def _longest_encoded_length(self):
  function calc_accuracy_loader (line 124) | def calc_accuracy_loader(data_loader, model, device, num_batches=None):
  function calc_loss_batch (line 147) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 154) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 171) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function train_classifier_simple (line 180) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ...
  function plot_values (line 218) | def plot_values(epochs_seen, examples_seen, train_values, val_values, la...

FILE: ch06/01_main-chapter-code/gpt_download.py
  function download_and_load_gpt2 (line 17) | def download_and_load_gpt2(model_size, models_dir):
  function download_file (line 49) | def download_file(url, destination, backup_url=None):
  function load_gpt2_params_from_tf_ckpt (line 131) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):

FILE: ch06/01_main-chapter-code/previous_chapters.py
  class GPTDatasetV1 (line 21) | class GPTDatasetV1(Dataset):
    method __init__ (line 22) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 36) | def __len__(self):
    method __getitem__ (line 39) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 43) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 61) | class MultiHeadAttention(nn.Module):
    method __init__ (line 62) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 77) | def forward(self, x):
  class LayerNorm (line 120) | class LayerNorm(nn.Module):
    method __init__ (line 121) | def __init__(self, emb_dim):
    method forward (line 127) | def forward(self, x):
  class GELU (line 134) | class GELU(nn.Module):
    method __init__ (line 135) | def __init__(self):
    method forward (line 138) | def forward(self, x):
  class FeedForward (line 145) | class FeedForward(nn.Module):
    method __init__ (line 146) | def __init__(self, cfg):
    method forward (line 154) | def forward(self, x):
  class TransformerBlock (line 158) | class TransformerBlock(nn.Module):
    method __init__ (line 159) | def __init__(self, cfg):
    method forward (line 173) | def forward(self, x):
  class GPTModel (line 191) | class GPTModel(nn.Module):
    method __init__ (line 192) | def __init__(self, cfg):
    method forward (line 204) | def forward(self, in_idx):
  function generate_text_simple (line 216) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function assign (line 245) | def assign(left, right):
  function load_weights_into_gpt (line 251) | def load_weights_into_gpt(gpt, params):
  function text_to_token_ids (line 312) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 318) | def token_ids_to_text(token_ids, tokenizer):

FILE: ch06/01_main-chapter-code/tests.py
  function test_gpt_class_finetune (line 12) | def test_gpt_class_finetune():

FILE: ch06/02_bonus_additional-experiments/additional_experiments.py
  class LoRALayer (line 32) | class LoRALayer(torch.nn.Module):
    method __init__ (line 33) | def __init__(self, in_dim, out_dim, rank, alpha):
    method forward (line 40) | def forward(self, x):
  class LinearWithLoRA (line 45) | class LinearWithLoRA(torch.nn.Module):
    method __init__ (line 46) | def __init__(self, linear, rank, alpha):
    method forward (line 53) | def forward(self, x):
  class LinearWithLoRAMerged (line 58) | class LinearWithLoRAMerged(torch.nn.Module):
    method __init__ (line 59) | def __init__(self, linear, rank, alpha):
    method forward (line 66) | def forward(self, x):
  class SpamDataset (line 72) | class SpamDataset(Dataset):
    method __init__ (line 73) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=...
    method __getitem__ (line 90) | def __getitem__(self, index):
    method __len__ (line 95) | def __len__(self):
    method _longest_encoded_length (line 98) | def _longest_encoded_length(self, tokenizer):
  function download_and_unzip (line 110) | def download_and_unzip(url, zip_path, extract_to, new_file_path):
  function random_split (line 133) | def random_split(df, train_frac, val_frac):
  function create_dataset_csvs (line 149) | def create_dataset_csvs(new_file_path):
  function instantiate_model (line 166) | def instantiate_model(choose_model, load_weights):
  function calc_loss_batch (line 197) | def calc_loss_batch(input_batch, target_batch, model, device,
  function calc_loss_loader (line 231) | def calc_loss_loader(data_loader, model, device,
  function calc_accuracy_loader (line 257) | def calc_accuracy_loader(data_loader, model, device, num_batches=None,
  function evaluate_model (line 310) | def evaluate_model(model, train_loader, val_loader, device,
  function train_classifier_simple (line 329) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ...
  function replace_linear_with_lora (line 398) | def replace_linear_with_lora(model, rank, alpha, alternative=False):

FILE: ch06/02_bonus_additional-experiments/gpt_download.py
  function download_and_load_gpt2 (line 17) | def download_and_load_gpt2(model_size, models_dir):
  function download_file (line 49) | def download_file(url, destination, backup_url=None):
  function load_gpt2_params_from_tf_ckpt (line 131) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):

FILE: ch06/02_bonus_additional-experiments/previous_chapters.py
  class GPTDatasetV1 (line 21) | class GPTDatasetV1(Dataset):
    method __init__ (line 22) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 36) | def __len__(self):
    method __getitem__ (line 39) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 43) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 61) | class MultiHeadAttention(nn.Module):
    method __init__ (line 62) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 80) | def forward(self, x):
  class LayerNorm (line 124) | class LayerNorm(nn.Module):
    method __init__ (line 125) | def __init__(self, emb_dim):
    method forward (line 131) | def forward(self, x):
  class GELU (line 138) | class GELU(nn.Module):
    method __init__ (line 139) | def __init__(self):
    method forward (line 142) | def forward(self, x):
  class FeedForward (line 149) | class FeedForward(nn.Module):
    method __init__ (line 150) | def __init__(self, cfg):
    method forward (line 158) | def forward(self, x):
  class TransformerBlock (line 162) | class TransformerBlock(nn.Module):
    method __init__ (line 163) | def __init__(self, cfg, disable_causal_mask=False):
    method forward (line 179) | def forward(self, x):
  class GPTModel (line 197) | class GPTModel(nn.Module):
    method __init__ (line 198) | def __init__(self, cfg, disable_causal_mask=False):
    method forward (line 210) | def forward(self, in_idx):
  function generate_text_simple (line 222) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function assign (line 251) | def assign(left, right):
  function load_weights_into_gpt (line 257) | def load_weights_into_gpt(gpt, params):
  function generate (line 318) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ...

FILE: ch06/03_bonus_imdb-classification/download_prepare_dataset.py
  function reporthook (line 14) | def reporthook(count, block_size, total_size):
  function download_and_extract_dataset (line 31) | def download_and_extract_dataset(dataset_url, target_file, directory):
  function load_dataset_to_dataframe (line 51) | def load_dataset_to_dataframe(basepath="aclImdb", labels={"pos": 1, "neg...
  function partition_and_save (line 66) | def partition_and_save(df, sizes=(35000, 5000, 10000)):

FILE: ch06/03_bonus_imdb-classification/gpt_download.py
  function download_and_load_gpt2 (line 17) | def download_and_load_gpt2(model_size, models_dir):
  function download_file (line 49) | def download_file(url, destination, backup_url=None):
  function load_gpt2_params_from_tf_ckpt (line 131) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):

FILE: ch06/03_bonus_imdb-classification/previous_chapters.py
  class GPTDatasetV1 (line 21) | class GPTDatasetV1(Dataset):
    method __init__ (line 22) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 37) | def __len__(self):
    method __getitem__ (line 40) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 44) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 62) | class MultiHeadAttention(nn.Module):
    method __init__ (line 63) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 78) | def forward(self, x):
  class LayerNorm (line 121) | class LayerNorm(nn.Module):
    method __init__ (line 122) | def __init__(self, emb_dim):
    method forward (line 128) | def forward(self, x):
  class GELU (line 135) | class GELU(nn.Module):
    method __init__ (line 136) | def __init__(self):
    method forward (line 139) | def forward(self, x):
  class FeedForward (line 146) | class FeedForward(nn.Module):
    method __init__ (line 147) | def __init__(self, cfg):
    method forward (line 155) | def forward(self, x):
  class TransformerBlock (line 159) | class TransformerBlock(nn.Module):
    method __init__ (line 160) | def __init__(self, cfg):
    method forward (line 174) | def forward(self, x):
  class GPTModel (line 192) | class GPTModel(nn.Module):
    method __init__ (line 193) | def __init__(self, cfg):
    method forward (line 205) | def forward(self, in_idx):
  function generate_text_simple (line 217) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function assign (line 246) | def assign(left, right):
  function load_weights_into_gpt (line 252) | def load_weights_into_gpt(gpt, params):
  function text_to_token_ids (line 313) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 319) | def token_ids_to_text(token_ids, tokenizer):

FILE: ch06/03_bonus_imdb-classification/train_bert_hf.py
  class IMDbDataset (line 18) | class IMDbDataset(Dataset):
    method __init__ (line 19) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=...
    method _create_attention_mask (line 43) | def _create_attention_mask(self, encoded_text):
    method __getitem__ (line 46) | def __getitem__(self, index):
    method __len__ (line 61) | def __len__(self):
    method _longest_encoded_length (line 64) | def _longest_encoded_length(self, tokenizer):
  function calc_loss_batch (line 73) | def calc_loss_batch(input_batch, attention_mask_batch, target_batch, mod...
  function calc_loss_loader (line 83) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function calc_accuracy_loader (line 101) | def calc_accuracy_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 123) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function train_classifier_simple (line 132) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ...

FILE: ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
  class SpamDataset (line 21) | class SpamDataset(Dataset):
    method __init__ (line 22) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=...
    method __getitem__ (line 39) | def __getitem__(self, index):
    method __len__ (line 44) | def __len__(self):
    method _longest_encoded_length (line 47) | def _longest_encoded_length(self, tokenizer):
  function download_and_unzip (line 59) | def download_and_unzip(url, zip_path, extract_to, new_file_path):
  function random_split (line 82) | def random_split(df, train_frac, val_frac):
  function create_dataset_csvs (line 98) | def create_dataset_csvs(new_file_path):
  class SPAMDataset (line 115) | class SPAMDataset(Dataset):
    method __init__ (line 116) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=...
    method _create_attention_mask (line 140) | def _create_attention_mask(self, encoded_text):
    method __getitem__ (line 143) | def __getitem__(self, index):
    method __len__ (line 158) | def __len__(self):
    method _longest_encoded_length (line 161) | def _longest_encoded_length(self, tokenizer):
  function calc_loss_batch (line 170) | def calc_loss_batch(input_batch, attention_mask_batch, target_batch, mod...
  function calc_loss_loader (line 180) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function calc_accuracy_loader (line 198) | def calc_accuracy_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 220) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function train_classifier_simple (line 229) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ...

FILE: ch06/03_bonus_imdb-classification/train_gpt.py
  class IMDbDataset (line 20) | class IMDbDataset(Dataset):
    method __init__ (line 21) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=...
    method __getitem__ (line 36) | def __getitem__(self, index):
    method __len__ (line 41) | def __len__(self):
    method _longest_encoded_length (line 44) | def _longest_encoded_length(self, tokenizer):
  function instantiate_model (line 53) | def instantiate_model(choose_model, load_weights):
  function calc_loss_batch (line 84) | def calc_loss_batch(input_batch, target_batch, model, device,
  function calc_loss_loader (line 100) | def calc_loss_loader(data_loader, model, device,
  function calc_accuracy_loader (line 125) | def calc_accuracy_loader(data_loader, model, device,
  function evaluate_model (line 156) | def evaluate_model(model, train_loader, val_loader, device, eval_iter,
  function train_classifier_simple (line 172) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ...

FILE: ch06/03_bonus_imdb-classification/train_sklearn_logreg.py
  function load_dataframes (line 14) | def load_dataframes():
  function eval_model (line 22) | def eval_model(model, X_train, y_train, X_val, y_val, X_test, y_test):

FILE: ch06/04_user_interface/app.py
  function get_model_and_tokenizer (line 22) | def get_model_and_tokenizer():
  function main (line 69) | async def main(message: chainlit.Message):

FILE: ch07/01_main-chapter-code/exercise_experiments.py
  class InstructionDataset (line 37) | class InstructionDataset(Dataset):
    method __init__ (line 38) | def __init__(self, data, tokenizer):
    method __getitem__ (line 51) | def __getitem__(self, index):
    method __len__ (line 54) | def __len__(self):
  class InstructionDatasetWithMasking (line 58) | class InstructionDatasetWithMasking(Dataset):
    method __init__ (line 59) | def __init__(self, data, tokenizer):
    method __getitem__ (line 79) | def __getitem__(self, index):
    method __len__ (line 83) | def __len__(self):
  class InstructionDatasetPhi (line 87) | class InstructionDatasetPhi(Dataset):
    method __init__ (line 88) | def __init__(self, data, tokenizer):
    method __getitem__ (line 105) | def __getitem__(self, index):
    method __len__ (line 108) | def __len__(self):
  class LinearWithLoRA (line 112) | class LinearWithLoRA(torch.nn.Module):
    method __init__ (line 113) | def __init__(self, linear, rank, alpha):
    method forward (line 120) | def forward(self, x):
  class LoRALayer (line 124) | class LoRALayer(torch.nn.Module):
    method __init__ (line 125) | def __init__(self, in_dim, out_dim, rank, alpha):
    method forward (line 132) | def forward(self, x):
  function replace_linear_with_lora (line 137) | def replace_linear_with_lora(model, rank, alpha):
  function custom_collate_fn (line 147) | def custom_collate_fn(
  function custom_collate_with_masking_fn (line 190) | def custom_collate_with_masking_fn(
  function download_and_load_file (line 236) | def download_and_load_file(file_path, url):
  function format_input_phi (line 253) | def format_input_phi(entry):
  function format_input (line 263) | def format_input(entry):
  function plot_losses (line 275) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, plot...
  function main (line 297) | def main(mask_instructions=False, alpaca52k=False, phi3_prompt=False, lo...

FILE: ch07/01_main-chapter-code/gpt_download.py
  function download_and_load_gpt2 (line 16) | def download_and_load_gpt2(model_size, models_dir):
  function download_file (line 48) | def download_file(url, destination, backup_url=None):
  function load_gpt2_params_from_tf_ckpt (line 95) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):

FILE: ch07/01_main-chapter-code/gpt_instruction_finetuning.py
  class InstructionDataset (line 35) | class InstructionDataset(Dataset):
    method __init__ (line 36) | def __init__(self, data, tokenizer):
    method __getitem__ (line 49) | def __getitem__(self, index):
    method __len__ (line 52) | def __len__(self):
  function custom_collate_fn (line 56) | def custom_collate_fn(
  function download_and_load_file (line 99) | def download_and_load_file(file_path, url):
  function format_input (line 113) | def format_input(entry):
  function plot_losses (line 125) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  function main (line 147) | def main(test_mode=False):

FILE: ch07/01_main-chapter-code/ollama_evaluate.py
  function query_model (line 14) | def query_model(prompt, model="llama3", url="http://localhost:11434/api/...
  function check_if_running (line 42) | def check_if_running(process_name):
  function format_input (line 51) | def format_input(entry):
  function main (line 63) | def main(file_path):
  function generate_model_scores (line 79) | def generate_model_scores(json_data, json_key, model="llama3"):

FILE: ch07/01_main-chapter-code/previous_chapters.py
  class GPTDatasetV1 (line 25) | class GPTDatasetV1(Dataset):
    method __init__ (line 26) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 41) | def __len__(self):
    method __getitem__ (line 44) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 48) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 66) | class MultiHeadAttention(nn.Module):
    method __init__ (line 67) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 82) | def forward(self, x):
  class LayerNorm (line 125) | class LayerNorm(nn.Module):
    method __init__ (line 126) | def __init__(self, emb_dim):
    method forward (line 132) | def forward(self, x):
  class GELU (line 139) | class GELU(nn.Module):
    method __init__ (line 140) | def __init__(self):
    method forward (line 143) | def forward(self, x):
  class FeedForward (line 150) | class FeedForward(nn.Module):
    method __init__ (line 151) | def __init__(self, cfg):
    method forward (line 159) | def forward(self, x):
  class TransformerBlock (line 163) | class TransformerBlock(nn.Module):
    method __init__ (line 164) | def __init__(self, cfg):
    method forward (line 178) | def forward(self, x):
  class GPTModel (line 196) | class GPTModel(nn.Module):
    method __init__ (line 197) | def __init__(self, cfg):
    method forward (line 209) | def forward(self, in_idx):
  function generate_text_simple (line 221) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function generate (line 250) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ...
  function train_model_simple (line 293) | def train_model_simple(model, train_loader, val_loader, optimizer, devic...
  function evaluate_model (line 329) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function generate_and_print_sample (line 338) | def generate_and_print_sample(model, tokenizer, device, start_context):
  function assign (line 352) | def assign(left, right):
  function load_weights_into_gpt (line 358) | def load_weights_into_gpt(gpt, params):
  function text_to_token_ids (line 419) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 425) | def token_ids_to_text(token_ids, tokenizer):
  function calc_loss_batch (line 430) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 437) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function plot_losses (line 456) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):

FILE: ch07/01_main-chapter-code/tests.py
  function test_gpt_class_finetune (line 12) | def test_gpt_class_finetune():

FILE: ch07/02_dataset-utilities/find-near-duplicates.py
  function preprocess_text (line 33) | def preprocess_text(text):
  function find_near_duplicates (line 41) | def find_near_duplicates(json_data, threshold=0.75, key="instruction"):
  function find_print_and_remove_near_duplicates (line 76) | def find_print_and_remove_near_duplicates(json_data, remove_duplicates=F...

FILE: ch07/04_preference-tuning-with-dpo/previous_chapters.py
  class GPTDatasetV1 (line 25) | class GPTDatasetV1(Dataset):
    method __init__ (line 26) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 41) | def __len__(self):
    method __getitem__ (line 44) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 48) | def create_dataloader_v1(txt, batch_size=4, max_length=256,
  class MultiHeadAttention (line 66) | class MultiHeadAttention(nn.Module):
    method __init__ (line 67) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 82) | def forward(self, x):
  class LayerNorm (line 125) | class LayerNorm(nn.Module):
    method __init__ (line 126) | def __init__(self, emb_dim):
    method forward (line 132) | def forward(self, x):
  class GELU (line 139) | class GELU(nn.Module):
    method __init__ (line 140) | def __init__(self):
    method forward (line 143) | def forward(self, x):
  class FeedForward (line 150) | class FeedForward(nn.Module):
    method __init__ (line 151) | def __init__(self, cfg):
    method forward (line 159) | def forward(self, x):
  class TransformerBlock (line 163) | class TransformerBlock(nn.Module):
    method __init__ (line 164) | def __init__(self, cfg):
    method forward (line 178) | def forward(self, x):
  class GPTModel (line 196) | class GPTModel(nn.Module):
    method __init__ (line 197) | def __init__(self, cfg):
    method forward (line 209) | def forward(self, in_idx):
  function generate_text_simple (line 221) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  function generate (line 250) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ...
  function train_model_simple (line 294) | def train_model_simple(model, train_loader, val_loader, optimizer, devic...
  function evaluate_model (line 330) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function generate_and_print_sample (line 339) | def generate_and_print_sample(model, tokenizer, device, start_context):
  function assign (line 353) | def assign(left, right):
  function load_weights_into_gpt (line 359) | def load_weights_into_gpt(gpt, params):
  function text_to_token_ids (line 420) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 426) | def token_ids_to_text(token_ids, tokenizer):
  function calc_loss_batch (line 431) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 438) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function plot_losses (line 457) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, labe...

FILE: ch07/06_user_interface/app.py
  function get_model_and_tokenizer (line 26) | def get_model_and_tokenizer():
  function extract_response (line 60) | def extract_response(response_text, input_text):
  function main (line 69) | async def main(message: chainlit.Message):

FILE: conftest.py
  function _get_env_number (line 8) | def _get_env_number(name, default, cast):
  function pytest_configure (line 19) | def pytest_configure(config):

FILE: pkg/llms_from_scratch/appendix_a.py
  class NeuralNetwork (line 10) | class NeuralNetwork(torch.nn.Module):
    method __init__ (line 11) | def __init__(self, num_inputs, num_outputs):
    method forward (line 28) | def forward(self, x):
  class ToyDataset (line 33) | class ToyDataset(Dataset):
    method __init__ (line 34) | def __init__(self, X, y):
    method __getitem__ (line 38) | def __getitem__(self, index):
    method __len__ (line 43) | def __len__(self):

FILE: pkg/llms_from_scratch/appendix_d.py
  function find_highest_gradient (line 12) | def find_highest_gradient(model):
  function train_model (line 23) | def train_model(model, train_loader, val_loader, optimizer, device,

FILE: pkg/llms_from_scratch/appendix_e.py
  class LoRALayer (line 10) | class LoRALayer(torch.nn.Module):
    method __init__ (line 11) | def __init__(self, in_dim, out_dim, rank, alpha):
    method forward (line 19) | def forward(self, x):
  class LinearWithLoRA (line 25) | class LinearWithLoRA(torch.nn.Module):
    method __init__ (line 26) | def __init__(self, linear, rank, alpha):
    method forward (line 33) | def forward(self, x):
  function replace_linear_with_lora (line 37) | def replace_linear_with_lora(model, rank, alpha):

FILE: pkg/llms_from_scratch/ch02.py
  class GPTDatasetV1 (line 11) | class GPTDatasetV1(Dataset):
    method __init__ (line 12) | def __init__(self, txt, tokenizer, max_length, stride):
    method __len__ (line 27) | def __len__(self):
    method __getitem__ (line 30) | def __getitem__(self, idx):
  function create_dataloader_v1 (line 34) | def create_dataloader_v1(txt, batch_size=4, max_length=256,

FILE: pkg/llms_from_scratch/ch03.py
  class SelfAttention_v1 (line 10) | class SelfAttention_v1(nn.Module):
    method __init__ (line 12) | def __init__(self, d_in, d_out):
    method forward (line 18) | def forward(self, x):
  class SelfAttention_v2 (line 32) | class SelfAttention_v2(nn.Module):
    method __init__ (line 34) | def __init__(self, d_in, d_out, qkv_bias=False):
    method forward (line 40) | def forward(self, x):
  class CausalAttention (line 52) | class CausalAttention(nn.Module):
    method __init__ (line 54) | def __init__(self, d_in, d_out, context_length,
    method forward (line 64) | def forward(self, x):
  class MultiHeadAttentionWrapper (line 86) | class MultiHeadAttentionWrapper(nn.Module):
    method __init__ (line 87) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 94) | def forward(self, x):
  class MultiHeadAttention (line 98) | class MultiHeadAttention(nn.Module):
    method __init__ (line 99) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 114) | def forward(self, x):
  class PyTorchMultiHeadAttention (line 159) | class PyTorchMultiHeadAttention(nn.Module):
    method __init__ (line 160) | def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False):
    method forward (line 173) | def forward(self, x):

FILE: pkg/llms_from_scratch/ch04.py
  class LayerNorm (line 11) | class LayerNorm(nn.Module):
    method __init__ (line 12) | def __init__(self, emb_dim):
    method forward (line 18) | def forward(self, x):
  class GELU (line 25) | class GELU(nn.Module):
    method __init__ (line 26) | def __init__(self):
    method forward (line 29) | def forward(self, x):
  class FeedForward (line 36) | class FeedForward(nn.Module):
    method __init__ (line 37) | def __init__(self, cfg):
    method forward (line 45) | def forward(self, x):
  class TransformerBlock (line 49) | class TransformerBlock(nn.Module):
    method __init__ (line 50) | def __init__(self, cfg):
    method forward (line 64) | def forward(self, x):
  class GPTModel (line 82) | class GPTModel(nn.Module):
    method __init__ (line 83) | def __init__(self, cfg):
    method forward (line 95) | def forward(self, in_idx):
  function generate_text_simple (line 107) | def generate_text_simple(model, idx, max_new_tokens, context_size):
  class FeedForwardFast (line 137) | class FeedForwardFast(nn.Module):
    method __init__ (line 138) | def __init__(self, cfg):
    method forward (line 146) | def forward(self, x):
  class TransformerBlockFast (line 150) | class TransformerBlockFast(nn.Module):
    method __init__ (line 151) | def __init__(self, cfg):
    method forward (line 164) | def forward(self, x):
  class GPTModelFast (line 182) | class GPTModelFast(nn.Module):
    method __init__ (line 196) | def __init__(self, cfg):
    method forward (line 208) | def forward(self, in_idx):

FILE: pkg/llms_from_scratch/ch05.py
  function generate (line 19) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ...
  function train_model_simple (line 62) | def train_model_simple(model, train_loader, val_loader, optimizer, devic...
  function evaluate_model (line 98) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function generate_and_print_sample (line 107) | def generate_and_print_sample(model, tokenizer, device, start_context):
  function assign (line 121) | def assign(left, right):
  function load_weights_into_gpt (line 127) | def load_weights_into_gpt(gpt, params):
  function text_to_token_ids (line 188) | def text_to_token_ids(text, tokenizer):
  function token_ids_to_text (line 194) | def token_ids_to_text(token_ids, tokenizer):
  function calc_loss_batch (line 199) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 206) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function plot_losses (line 225) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  function download_and_load_gpt2 (line 246) | def download_and_load_gpt2(model_size, models_dir):
  function download_file (line 280) | def download_file(url, destination, backup_url=None):
  function load_gpt2_params_from_tf_ckpt (line 327) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):

FILE: pkg/llms_from_scratch/ch06.py
  function download_and_unzip_spam_data (line 18) | def download_and_unzip_spam_data(url, zip_path, extracted_path, data_fil...
  function create_balanced_dataset (line 41) | def create_balanced_dataset(df):
  function random_split (line 55) | def random_split(df, train_frac, validation_frac):
  class SpamDataset (line 71) | class SpamDataset(Dataset):
    method __init__ (line 72) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=...
    method __getitem__ (line 96) | def __getitem__(self, index):
    method __len__ (line 104) | def __len__(self):
    method _longest_encoded_length (line 107) | def _longest_encoded_length(self):
  function calc_accuracy_loader (line 119) | def calc_accuracy_loader(data_loader, model, device, num_batches=None):
  function calc_loss_batch (line 142) | def calc_loss_batch(input_batch, target_batch, model, device):
  function calc_loss_loader (line 149) | def calc_loss_loader(data_loader, model, device, num_batches=None):
  function evaluate_model (line 168) | def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  function train_classifier_simple (line 177) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ...
  function plot_values (line 215) | def plot_values(epochs_seen, examples_seen, train_values, val_values, la...
  function classify_review (line 235) | def classify_review(text, model, tokenizer, device, max_length=None, pad...

FILE: pkg/llms_from_scratch/ch07.py
  function download_and_load_file (line 16) | def download_and_load_file(file_path, url):
  function format_input (line 57) | def format_input(entry):
  class InstructionDataset (line 69) | class InstructionDataset(Dataset):
    method __init__ (line 70) | def __init__(self, data, tokenizer):
    method __getitem__ (line 83) | def __getitem__(self, index):
    method __len__ (line 86) | def __len__(self):
  function custom_collate_draft_1 (line 90) | def custom_collate_draft_1(
  function custom_collate_draft_2 (line 123) | def custom_collate_draft_2(
  function custom_collate_fn (line 154) | def custom_collate_fn(
  function check_if_running (line 200) | def check_if_running(process_name):
  function query_model (line 209) | def query_model(
  function generate_model_scores (line 241) | def generate_model_scores(json_data, json_key, model="llama3"):

FILE: pkg/llms_from_scratch/generate.py
  function trim_input_tensor (line 9) | def trim_input_tensor(input_ids_tensor, context_len, max_new_tokens):

FILE: pkg/llms_from_scratch/kv_cache/generate.py
  function generate_text_simple (line 11) | def generate_text_simple(model, idx, max_new_tokens, context_size=None, ...
  function generate_text_simple_stream (line 34) | def generate_text_simple_stream(model, token_ids, max_new_tokens, eos_to...

FILE: pkg/llms_from_scratch/kv_cache/gpt2.py
  class MultiHeadAttention (line 15) | class MultiHeadAttention(nn.Module):
    method __init__ (line 16) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk...
    method forward (line 30) | def forward(self, x, use_cache=False, start_pos=0, cache=None):
  class LayerNorm (line 82) | class LayerNorm(nn.Module):
    method __init__ (line 83) | def __init__(self, emb_dim):
    method forward (line 89) | def forward(self, x):
  class GELU (line 96) | class GELU(nn.Module):
    method __init__ (line 97) | def __init__(self):
    method forward (line 100) | def forward(self, x):
  class FeedForward (line 107) | class FeedForward(nn.Module):
    method __init__ (line 108) | def __init__(self, cfg):
    method forward (line 116) | def forward(self, x):
  class TransformerBlock (line 120) | class TransformerBlock(nn.Module):
    method __init__ (line 121) | def __init__(self, cfg):
    method forward (line 135) | def forward(self, x, use_cache=False, start_pos=0, cache=None):
  class GPTModel (line 153) | class GPTModel(nn.Module):
    method __init__ (line 154) | def __init__(self, cfg):
    method forward (line 167) | def forward(self, in_idx, use_cache=False, cache=None):

FILE: pkg/llms_from_scratch/kv_cache/llama3.py
  class Llama3Model (line 54) | class Llama3Model(nn.Module):
    method __init__ (line 55) | def __init__(self, cfg):
    method forward (line 80) | def forward(self, in_idx, cache=None):
    method reset_kv_cache (line 112) | def reset_kv_cache(self):
  class TransformerBlock (line 116) | class TransformerBlock(nn.Module):
    method __init__ (line 117) | def __init__(self, cfg):
    method forward (line 130) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None):
  class FeedForward (line 146) | class FeedForward(nn.Module):
    method __init__ (line 147) | def __init__(self, cfg):
    method forward (line 153) | def forward(self, x):
  class GroupedQueryAttention (line 160) | class GroupedQueryAttention(nn.Module):
    method __init__ (line 161) | def __init__(
    method forward (line 180) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None):
  function compute_rope_params (line 238) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096...
  function apply_rope (line 283) | def apply_rope(x, cos, sin, offset=0):
  class Llama3Tokenizer (line 309) | class Llama3Tokenizer:
    method __init__ (line 311) | def __init__(self, model_path):
    method encode (line 342) | def encode(self, text, bos=False, eos=False, **kwargs):
    method decode (line 349) | def decode(self, ids):
  class ChatFormat (line 353) | class ChatFormat:
    method __init__ (line 355) | def __init__(self, tokenizer: Llama3Tokenizer, *,
    method _header (line 360) | def _header(self, role):
    method encode (line 369) | def encode(self, user_message, system_message=None, allowed_special=No...
    method decode (line 389) | def decode(self, ids):
  function clean_text (line 393) | def clean_text(text, header_end="assistant<|end_header_id|>\n\n"):
  class GroupedQueryAttentionFast (line 409) | class GroupedQueryAttentionFast(nn.Module):
    method __init__ (line 415) | def __init__(self, d_in, d_out, num_heads, num_kv_groups, dtype=None):
    method forward (line 431) | def forward(self, x, cos, sin):
  class TransformerBlockFast (line 458) | class TransformerBlockFast(nn.Module):
    method __init__ (line 463) | def __init__(self, cfg):
    method forward (line 476) | def forward(self, x, cos, sin):
  class Llama3ModelFast (line 492) | class Llama3ModelFast(nn.Module):
    method __init__ (line 498) | def __init__(self, cfg):
    method forward (line 521) | def forward(self, in_idx):

FILE: pkg/llms_from_scratch/kv_cache/qwen3.py
  class Qwen3Model (line 19) | class Qwen3Model(nn.Module):
    method __init__ (line 20) | def __init__(self, cfg):
    method forward (line 47) | def forward(self, in_idx, cache=None):
    method reset_kv_cache (line 80) | def reset_kv_cache(self):
  class TransformerBlock (line 84) | class TransformerBlock(nn.Module):
    method __init__ (line 85) | def __init__(self, cfg):
    method forward (line 102) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None):
  class FeedForward (line 118) | class FeedForward(nn.Module):
    method __init__ (line 119) | def __init__(self, cfg):
    method forward (line 125) | def forward(self, x):
  class MoEFeedForward (line 132) | class MoEFeedForward(nn.Module):
    method __init__ (line 133) | def __init__(self, cfg):
    method forward (line 147) | def forward(self, x):
  class GroupedQueryAttention (line 185) | class GroupedQueryAttention(nn.Module):
    method __init__ (line 186) | def __init__(
    method forward (line 215) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None):
  function compute_rope_params (line 261) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096...
  function apply_rope (line 283) | def apply_rope(x, cos, sin, offset=0):
  class RMSNorm (line 304) | class RMSNorm(nn.Module):
    method __init__ (line 305) | def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True):
    method forward (line 312) | def forward(self, x):

FILE: pkg/llms_from_scratch/kv_cache/utils.py
  class KVCache (line 6) | class KVCache:
    method __init__ (line 7) | def __init__(self, n_layers):
    method get (line 10) | def get(self, layer_idx):
    method update (line 13) | def update(self, layer_idx, value):
    method get_all (line 16) | def get_all(self):
    method reset (line 19) | def reset(self):

FILE: pkg/llms_from_scratch/kv_cache_batched/generate.py
  function generate_text_simple (line 11) | def generate_text_simple(model, idx, max_new_tokens, context_size=None, ...

FILE: pkg/llms_from_scratch/kv_cache_batched/qwen3.py
  class Qwen3Model (line 19) | class Qwen3Model(nn.Module):
    method __init__ (line 20) | def __init__(self, cfg):
    method forward (line 47) | def forward(self, in_idx, cache=None, start_pos=None):
    method reset_kv_cache (line 80) | def reset_kv_cache(self, batch_size, device=None):
  class TransformerBlock (line 85) | class TransformerBlock(nn.Module):
    method __init__ (line 86) | def __init__(self, cfg):
    method forward (line 100) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None):
  class FeedForward (line 116) | class FeedForward(nn.Module):
    method __init__ (line 117) | def __init__(self, cfg):
    method forward (line 123) | def forward(self, x):
  class GroupedQueryAttention (line 130) | class GroupedQueryAttention(nn.Module):
    method __init__ (line 131) | def __init__(self, d_in, num_heads, num_kv_groups, head_dim=None, qk_n...
    method forward (line 158) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None):
  function compute_rope_params (line 214) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096...
  function apply_rope (line 236) | def apply_rope(x, cos, sin, offset):
  class RMSNorm (line 266) | class RMSNorm(nn.Module):
    method __init__ (line 267) | def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True):
    method forward (line 274) | def forward(self, x):

FILE: pkg/llms_from_scratch/kv_cache_batched/utils.py
  class KVCache (line 6) | class KVCache:
    method __init__ (line 7) | def __init__(self, n_layers, batch_size):
    method get (line 12) | def get(self, layer_idx, batch_idx):
    method update (line 15) | def update(self, layer_idx, batch_idx, value):
    method get_layer (line 18) | def get_layer(self, layer_idx):
    method reset (line 21) | def reset(self):

FILE: pkg/llms_from_scratch/llama3.py
  class Llama3Model (line 53) | class Llama3Model(nn.Module):
    method __init__ (line 54) | def __init__(self, cfg):
    method forward (line 78) | def forward(self, in_idx):
  class TransformerBlock (line 92) | class TransformerBlock(nn.Module):
    method __init__ (line 93) | def __init__(self, cfg):
    method forward (line 106) | def forward(self, x, mask, cos, sin):
  class FeedForward (line 122) | class FeedForward(nn.Module):
    method __init__ (line 123) | def __init__(self, cfg):
    method forward (line 129) | def forward(self, x):
  class GroupedQueryAttention (line 136) | class GroupedQueryAttention(nn.Module):
    method __init__ (line 137) | def __init__(
    method forward (line 156) | def forward(self, x, mask, cos, sin):
  function compute_rope_params (line 260) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096...
  function apply_rope (line 305) | def apply_rope(x, cos, sin):
  class Llama3Tokenizer (line 331) | class Llama3Tokenizer:
    method __init__ (line 333) | def __init__(self, model_path):
    method encode (line 364) | def encode(self, text, bos=False, eos=False, **kwargs):
    method decode (line 371) | def decode(self, ids):
  class ChatFormat (line 375) | class ChatFormat:
    method __init__ (line 377) | def __init__(self, tokenizer: Llama3Tokenizer, *,
    method _header (line 382) | def _header(self, role):
    method encode (line 391) | def encode(self, user_message, system_message=None, allowed_special=No...
    method decode (line 411) | def decode(self, ids):
  function clean_text (line 415) | def clean_text(text, header_end="assistant<|end_header_id|>\n\n"):
  class GroupedQueryAttentionFast (line 431) | class GroupedQueryAttentionFast(nn.Module):
    method __init__ (line 437) | def __init__(self, d_in, d_out, num_heads, num_kv_groups, dtype=None):
    method forward (line 453) | def forward(self, x, cos, sin):
  class TransformerBlockFast (line 480) | class TransformerBlockFast(nn.Module):
    method __init__ (line 485) | def __init__(self, cfg):
    method forward (line 498) | def forward(self, x, cos, sin):
  class Llama3ModelFast (line 514) | class Llama3ModelFast(nn.Module):
    method __init__ (line 520) | def __init__(self, cfg):
    method forward (line 543) | def forward(self, in_idx):
  function assign (line 554) | def assign(left, right, tensor_name="unknown"):
  function load_weights_into_llama (line 567) | def load_weights_into_llama(model, param_config, params):

FILE: pkg/llms_from_scratch/qwen3.py
  class Qwen3Model (line 123) | class Qwen3Model(nn.Module):
    method __init__ (line 124) | def __init__(self, cfg):
    method forward (line 150) | def forward(self, in_idx):
  class TransformerBlock (line 165) | class TransformerBlock(nn.Module):
    method __init__ (line 166) | def __init__(self, cfg):
    method forward (line 183) | def forward(self, x, mask, cos, sin):
  class FeedForward (line 199) | class FeedForward(nn.Module):
    method __init__ (line 200) | def __init__(self, cfg):
    method forward (line 206) | def forward(self, x):
  class MoEFeedForward (line 213) | class MoEFeedForward(nn.Module):
    method __init__ (line 214) | def __init__(self, cfg):
    method forward (line 228) | def forward(self, x):
  class GroupedQueryAttention (line 266) | class GroupedQueryAttention(nn.Module):
    method __init__ (line 267) | def __init__(
    method forward (line 296) | def forward(self, x, mask, cos, sin):
  function compute_rope_params (line 384) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096...
  function apply_rope (line 406) | def apply_rope(x, cos, sin):
  class RMSNorm (line 427) | class RMSNorm(nn.Module):
    method __init__ (line 428) | def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True):
    method forward (line 435) | def forward(self, x):
  function load_weights_into_qwen (line 451) | def load_weights_into_qwen(model, param_config, params):
  class Qwen3Tokenizer (line 575) | class Qwen3Tokenizer:
    method __init__ (line 588) | def __init__(self, tokenizer_file_path="tokenizer.json", repo_id=None,
    method encode (line 620) | def encode(self, text, chat_wrapped=None):
    method decode (line 639) | def decode(self, ids):
    method _wrap_chat (line 642) | def _wrap_chat(self, user_msg):
  function download_from_huggingface (line 653) | def download_from_huggingface(repo_id, filename, local_dir, revision="ma...
  function download_from_huggingface_from_snapshots (line 673) | def download_from_huggingface_from_snapshots(repo_id, local_dir):

FILE: pkg/llms_from_scratch/tests/test_appendix_a.py
  function test_dataset (line 13) | def test_dataset():

FILE: pkg/llms_from_scratch/tests/test_appendix_d.py
  function test_train (line 18) | def test_train(tmp_path):

FILE: pkg/llms_from_scratch/tests/test_appendix_e.py
  function test_train_classifier_lora (line 23) | def test_train_classifier_lora(tmp_path):

FILE: pkg/llms_from_scratch/tests/test_ch02.py
  function test_dataloader (line 16) | def test_dataloader(tmp_path, file_name):

FILE: pkg/llms_from_scratch/tests/test_ch03.py
  function test_mha (line 11) | def test_mha():

FILE: pkg/llms_from_scratch/tests/test_ch04.py
  function test_gpt_model_variants (line 29) | def test_gpt_model_variants(ModelClass, generate_fn):

FILE: pkg/llms_from_scratch/tests/test_ch05.py
  function test_train_simple (line 38) | def test_train_simple(tmp_path, ModelClass):

FILE: pkg/llms_from_scratch/tests/test_ch06.py
  function test_train_classifier (line 22) | def test_train_classifier(tmp_path):

FILE: pkg/llms_from_scratch/tests/test_ch07.py
  function test_instruction_finetune (line 19) | def test_instruction_finetune(tmp_path):

FILE: pkg/llms_from_scratch/tests/test_generate.py
  function test_dataloader (line 16) | def test_dataloader(tmp_path, file_name):

FILE: pkg/llms_from_scratch/tests/test_llama3.py
  class LitGPTRMSNorm (line 26) | class LitGPTRMSNorm(torch.nn.Module):
    method __init__ (line 36) | def __init__(self, size: int, dim: int = -1, eps: float = 1e-6, add_un...
    method forward (line 43) | def forward(self, x: torch.Tensor) -> torch.Tensor:
    method reset_parameters (line 52) | def reset_parameters(self) -> None:
  function test_rope (line 60) | def test_rope():
  function test_grouped_query_attention_equivalence (line 157) | def test_grouped_query_attention_equivalence():
  function llama3_weights_path (line 194) | def llama3_weights_path(tmp_path_factory):
  function test_model_variants (line 212) | def test_model_variants(ModelClass, generate_fn, llama3_weights_path):
  function test_rmsnorm_equivalence (line 249) | def test_rmsnorm_equivalence():
  function test_llama3_base_equivalence_with_transformers (line 273) | def test_llama3_base_equivalence_with_transformers():

FILE: pkg/llms_from_scratch/tests/test_qwen3.py
  class Qwen3RMSNorm (line 37) | class Qwen3RMSNorm(nn.Module):
    method __init__ (line 40) | def __init__(self, hidden_size, eps=1e-6):
    method forward (line 48) | def forward(self, hidden_states):
    method extra_repr (line 56) | def extra_repr(self):
  function _hf_ids (line 63) | def _hf_ids(obj):
  function dummy_input (line 94) | def dummy_input():
  function dummy_cfg_base (line 100) | def dummy_cfg_base():
  function dummy_cfg_moe (line 118) | def dummy_cfg_moe(dummy_cfg_base):
  function test_dummy_qwen3_forward (line 129) | def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input):
  function test_dummy_qwen3_moe_forward (line 138) | def test_dummy_qwen3_moe_forward(dummy_cfg_moe, dummy_input):
  function test_moe_forward_matches_reference (line 149) | def test_moe_forward_matches_reference(dummy_cfg_moe):
  function test_qwen3_kvcache_equivalence (line 180) | def test_qwen3_kvcache_equivalence(cfg_name, request):
  function test_rope (line 214) | def test_rope(context_len):
  function qwen3_weights_path (line 288) | def qwen3_weights_path(tmp_path_factory):
  function test_model_variants (line 302) | def test_model_variants(ModelClass, qwen3_weights_path, generate_fn):
  function test_model_KV_noKV (line 340) | def test_model_KV_noKV(qwen3_weights_path):
  function test_model_batched_KV (line 381) | def test_model_batched_KV(qwen3_weights_path):
  function test_rmsnorm_equivalence (line 444) | def test_rmsnorm_equivalence():
  function test_all_special_tokens_roundtrip (line 471) | def test_all_special_tokens_roundtrip(repo_id, tok_file):
  function test_chat_wrap_and_equivalence (line 523) | def test_chat_wrap_and_equivalence(add_gen, add_think):
  function test_multiturn_equivalence (line 573) | def test_multiturn_equivalence(repo_id, tok_file, add_gen, add_think):
  function test_tokenizer_equivalence (line 614) | def test_tokenizer_equivalence():
  function test_multiturn_prefix_stability (line 688) | def test_multiturn_prefix_stability(repo_id, tok_file, add_gen, add_think):
  function test_qwen3_base_equivalence_with_transformers (line 764) | def test_qwen3_base_equivalence_with_transformers():

FILE: pkg/llms_from_scratch/utils.py
  function _extract_imports (line 17) | def _extract_imports(src: str):
  function _extract_defs_and_classes_from_code (line 39) | def _extract_defs_and_classes_from_code(src):
  function import_definitions_from_notebook (line 110) | def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None,...
  function download_file (line 153) | def download_file(url, out_dir="."):

FILE: setup/02_installing-python-libraries/python_environment_check.py
  function get_packages (line 20) | def get_packages(pkgs):
  function get_requirements_dict (line 66) | def get_requirements_dict():
  function check_packages (line 102) | def check_packages(reqs):
  function main (line 122) | def main():

FILE: setup/02_installing-python-libraries/tests.py
  function test_main (line 11) | def test_main(capsys):