SYMBOL INDEX (1376 symbols across 126 files) FILE: .github/scripts/check_double_quotes.py function should_skip (line 37) | def should_skip(path): function collect_fstring_expr_string_positions (line 42) | def collect_fstring_expr_string_positions(source): function check_quotes_in_source (line 76) | def check_quotes_in_source(source, path): function check_file (line 104) | def check_file(path): function check_notebook (line 115) | def check_notebook(path): function parse_args (line 126) | def parse_args(): function main (line 136) | def main(): FILE: appendix-A/01_main-chapter-code/DDP-script-torchrun.py function ddp_setup (line 22) | def ddp_setup(rank, world_size): class ToyDataset (line 48) | class ToyDataset(Dataset): method __init__ (line 49) | def __init__(self, X, y): method __getitem__ (line 53) | def __getitem__(self, index): method __len__ (line 58) | def __len__(self): class NeuralNetwork (line 62) | class NeuralNetwork(torch.nn.Module): method __init__ (line 63) | def __init__(self, num_inputs, num_outputs): method forward (line 79) | def forward(self, x): function prepare_dataset (line 84) | def prepare_dataset(): function main (line 128) | def main(rank, world_size, num_epochs): function compute_accuracy (line 181) | def compute_accuracy(model, dataloader, device): FILE: appendix-A/01_main-chapter-code/DDP-script.py function ddp_setup (line 23) | def ddp_setup(rank, world_size): class ToyDataset (line 49) | class ToyDataset(Dataset): method __init__ (line 50) | def __init__(self, X, y): method __getitem__ (line 54) | def __getitem__(self, index): method __len__ (line 59) | def __len__(self): class NeuralNetwork (line 63) | class NeuralNetwork(torch.nn.Module): method __init__ (line 64) | def __init__(self, num_inputs, num_outputs): method forward (line 80) | def forward(self, x): function prepare_dataset (line 85) | def prepare_dataset(): function main (line 129) | def main(rank, world_size, num_epochs): function compute_accuracy (line 182) | def compute_accuracy(model, dataloader, device): FILE: appendix-D/01_main-chapter-code/previous_chapters.py class GPTDatasetV1 (line 21) | class GPTDatasetV1(Dataset): method __init__ (line 22) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 36) | def __len__(self): method __getitem__ (line 39) | def __getitem__(self, idx): function create_dataloader_v1 (line 43) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 62) | class MultiHeadAttention(nn.Module): method __init__ (line 63) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 78) | def forward(self, x): class LayerNorm (line 122) | class LayerNorm(nn.Module): method __init__ (line 123) | def __init__(self, emb_dim): method forward (line 129) | def forward(self, x): class GELU (line 136) | class GELU(nn.Module): method __init__ (line 137) | def __init__(self): method forward (line 140) | def forward(self, x): class FeedForward (line 147) | class FeedForward(nn.Module): method __init__ (line 148) | def __init__(self, cfg): method forward (line 156) | def forward(self, x): class TransformerBlock (line 160) | class TransformerBlock(nn.Module): method __init__ (line 161) | def __init__(self, cfg): method forward (line 175) | def forward(self, x): class GPTModel (line 193) | class GPTModel(nn.Module): method __init__ (line 194) | def __init__(self, cfg): method forward (line 206) | def forward(self, in_idx): function generate_text_simple (line 218) | def generate_text_simple(model, idx, max_new_tokens, context_size): function calc_loss_batch (line 249) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 256) | def calc_loss_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 273) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function generate_and_print_sample (line 282) | def generate_and_print_sample(model, tokenizer, device, start_context): function plot_losses (line 295) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): function text_to_token_ids (line 314) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 320) | def token_ids_to_text(token_ids, tokenizer): FILE: appendix-E/01_main-chapter-code/gpt_download.py function download_and_load_gpt2 (line 17) | def download_and_load_gpt2(model_size, models_dir): function download_file (line 49) | def download_file(url, destination, backup_url=None): function load_gpt2_params_from_tf_ckpt (line 131) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): FILE: appendix-E/01_main-chapter-code/previous_chapters.py class GPTDatasetV1 (line 29) | class GPTDatasetV1(Dataset): method __init__ (line 30) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 44) | def __len__(self): method __getitem__ (line 47) | def __getitem__(self, idx): function create_dataloader_v1 (line 51) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 69) | class MultiHeadAttention(nn.Module): method __init__ (line 70) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 85) | def forward(self, x): class LayerNorm (line 128) | class LayerNorm(nn.Module): method __init__ (line 129) | def __init__(self, emb_dim): method forward (line 135) | def forward(self, x): class GELU (line 142) | class GELU(nn.Module): method __init__ (line 143) | def __init__(self): method forward (line 146) | def forward(self, x): class FeedForward (line 153) | class FeedForward(nn.Module): method __init__ (line 154) | def __init__(self, cfg): method forward (line 162) | def forward(self, x): class TransformerBlock (line 166) | class TransformerBlock(nn.Module): method __init__ (line 167) | def __init__(self, cfg): method forward (line 181) | def forward(self, x): class GPTModel (line 199) | class GPTModel(nn.Module): method __init__ (line 200) | def __init__(self, cfg): method forward (line 212) | def forward(self, in_idx): function generate_text_simple (line 224) | def generate_text_simple(model, idx, max_new_tokens, context_size): function assign (line 253) | def assign(left, right): function load_weights_into_gpt (line 259) | def load_weights_into_gpt(gpt, params): function text_to_token_ids (line 320) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 326) | def token_ids_to_text(token_ids, tokenizer): function calc_loss_loader (line 331) | def calc_loss_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 350) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function download_and_unzip_spam_data (line 364) | def download_and_unzip_spam_data(url, zip_path, extracted_path, data_fil... function create_balanced_dataset (line 387) | def create_balanced_dataset(df): function random_split (line 401) | def random_split(df, train_frac, validation_frac): class SpamDataset (line 417) | class SpamDataset(Dataset): method __init__ (line 418) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=... method __getitem__ (line 442) | def __getitem__(self, index): method __len__ (line 447) | def __len__(self): method _longest_encoded_length (line 450) | def _longest_encoded_length(self): function calc_accuracy_loader (line 463) | def calc_accuracy_loader(data_loader, model, device, num_batches=None): function calc_loss_batch (line 484) | def calc_loss_batch(input_batch, target_batch, model, device): function train_classifier_simple (line 492) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ... function plot_values (line 530) | def plot_values(epochs_seen, examples_seen, train_values, val_values, la... FILE: ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py function bytes_to_unicode (line 37) | def bytes_to_unicode(): function get_pairs (line 59) | def get_pairs(word): class Encoder (line 72) | class Encoder: method __init__ (line 73) | def __init__(self, encoder, bpe_merges, errors="replace"): method bpe (line 85) | def bpe(self, token): method encode (line 126) | def encode(self, text): method decode (line 133) | def decode(self, tokens): function get_encoder (line 139) | def get_encoder(model_name, models_dir): function download_vocab (line 148) | def download_vocab(): FILE: ch02/05_bpe-from-scratch/tests.py function import_definitions_from_notebook (line 11) | def import_definitions_from_notebook(fullname, names): function imported_module (line 39) | def imported_module(): function verdict_file (line 46) | def verdict_file(imported_module): function gpt2_files (line 64) | def gpt2_files(imported_module): function test_tokenizer_training (line 79) | def test_tokenizer_training(imported_module, verdict_file): function test_gpt2_tokenizer_openai_simple (line 108) | def test_gpt2_tokenizer_openai_simple(imported_module, gpt2_files): function test_gpt2_tokenizer_openai_edgecases (line 123) | def test_gpt2_tokenizer_openai_edgecases(imported_module, gpt2_files): function test_gpt2_newline_and_eot_ids (line 163) | def test_gpt2_newline_and_eot_ids(imported_module, gpt2_files): function test_no_eot_aliasing_and_disallowed_logic (line 185) | def test_no_eot_aliasing_and_disallowed_logic(imported_module, gpt2_files): function test_newline_roundtrip_and_equivalence (line 214) | def test_newline_roundtrip_and_equivalence(imported_module, gpt2_files, ... function test_space_newline_space_patterns (line 234) | def test_space_newline_space_patterns(imported_module, gpt2_files): function test_multiple_leading_spaces_roundtrip (line 250) | def test_multiple_leading_spaces_roundtrip(imported_module, gpt2_files): FILE: ch03/02_bonus_efficient-multihead-attention/tests/test_mha_implementations.py function import_notebook_defs (line 10) | def import_notebook_defs(): function copy_weights (line 16) | def copy_weights(from_mha, to_mha): function test_mha_einsum_matches_ch03 (line 34) | def test_mha_einsum_matches_ch03(d_in, d_out, batch, seq_len, num_heads,... FILE: ch04/01_main-chapter-code/gpt.py class GPTDatasetV1 (line 15) | class GPTDatasetV1(Dataset): method __init__ (line 16) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 30) | def __len__(self): method __getitem__ (line 33) | def __getitem__(self, idx): function create_dataloader_v1 (line 37) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 55) | class MultiHeadAttention(nn.Module): method __init__ (line 56) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 71) | def forward(self, x): class LayerNorm (line 114) | class LayerNorm(nn.Module): method __init__ (line 115) | def __init__(self, emb_dim): method forward (line 121) | def forward(self, x): class GELU (line 128) | class GELU(nn.Module): method __init__ (line 129) | def __init__(self): method forward (line 132) | def forward(self, x): class FeedForward (line 139) | class FeedForward(nn.Module): method __init__ (line 140) | def __init__(self, cfg): method forward (line 148) | def forward(self, x): class TransformerBlock (line 152) | class TransformerBlock(nn.Module): method __init__ (line 153) | def __init__(self, cfg): method forward (line 167) | def forward(self, x): class GPTModel (line 185) | class GPTModel(nn.Module): method __init__ (line 186) | def __init__(self, cfg): method forward (line 198) | def forward(self, in_idx): function generate_text_simple (line 210) | def generate_text_simple(model, idx, max_new_tokens, context_size): function main (line 236) | def main(): FILE: ch04/01_main-chapter-code/previous_chapters.py class GPTDatasetV1 (line 12) | class GPTDatasetV1(Dataset): method __init__ (line 13) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 27) | def __len__(self): method __getitem__ (line 30) | def __getitem__(self, idx): function create_dataloader_v1 (line 34) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 49) | class MultiHeadAttention(nn.Module): method __init__ (line 50) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 65) | def forward(self, x): FILE: ch04/01_main-chapter-code/tests.py function test_main (line 31) | def test_main(capsys): FILE: ch04/03_kv-cache/gpt_ch04.py class MultiHeadAttention (line 14) | class MultiHeadAttention(nn.Module): method __init__ (line 15) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 34) | def forward(self, x): class LayerNorm (line 77) | class LayerNorm(nn.Module): method __init__ (line 78) | def __init__(self, emb_dim): method forward (line 84) | def forward(self, x): class GELU (line 91) | class GELU(nn.Module): method __init__ (line 92) | def __init__(self): method forward (line 95) | def forward(self, x): class FeedForward (line 102) | class FeedForward(nn.Module): method __init__ (line 103) | def __init__(self, cfg): method forward (line 111) | def forward(self, x): class TransformerBlock (line 115) | class TransformerBlock(nn.Module): method __init__ (line 116) | def __init__(self, cfg): method forward (line 130) | def forward(self, x): class GPTModel (line 148) | class GPTModel(nn.Module): method __init__ (line 149) | def __init__(self, cfg): method forward (line 161) | def forward(self, in_idx): function generate_text_simple (line 173) | def generate_text_simple(model, idx, max_new_tokens, context_size): function main (line 200) | def main(): FILE: ch04/03_kv-cache/gpt_with_kv_cache.py class MultiHeadAttention (line 14) | class MultiHeadAttention(nn.Module): method __init__ (line 15) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 41) | def forward(self, x, use_cache=False): method reset_cache (line 106) | def reset_cache(self): class LayerNorm (line 115) | class LayerNorm(nn.Module): method __init__ (line 116) | def __init__(self, emb_dim): method forward (line 122) | def forward(self, x): class GELU (line 129) | class GELU(nn.Module): method __init__ (line 130) | def __init__(self): method forward (line 133) | def forward(self, x): class FeedForward (line 140) | class FeedForward(nn.Module): method __init__ (line 141) | def __init__(self, cfg): method forward (line 149) | def forward(self, x): class TransformerBlock (line 153) | class TransformerBlock(nn.Module): method __init__ (line 154) | def __init__(self, cfg): method forward (line 168) | def forward(self, x, use_cache=False): class GPTModel (line 192) | class GPTModel(nn.Module): method __init__ (line 193) | def __init__(self, cfg): method forward (line 212) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 245) | def reset_kv_cache(self): function generate_text_simple (line 252) | def generate_text_simple(model, idx, max_new_tokens, context_size): function generate_text_simple_cached (line 280) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 308) | def main(): FILE: ch04/03_kv-cache/gpt_with_kv_cache_optimized.py class MultiHeadAttention (line 14) | class MultiHeadAttention(nn.Module): method __init__ (line 15) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 37) | def forward(self, x, use_cache=False): method reset_cache (line 124) | def reset_cache(self): class LayerNorm (line 132) | class LayerNorm(nn.Module): method __init__ (line 133) | def __init__(self, emb_dim): method forward (line 139) | def forward(self, x): class GELU (line 146) | class GELU(nn.Module): method __init__ (line 147) | def __init__(self): method forward (line 150) | def forward(self, x): class FeedForward (line 157) | class FeedForward(nn.Module): method __init__ (line 158) | def __init__(self, cfg): method forward (line 166) | def forward(self, x): class TransformerBlock (line 170) | class TransformerBlock(nn.Module): method __init__ (line 171) | def __init__(self, cfg): method forward (line 187) | def forward(self, x, use_cache=False): class GPTModel (line 211) | class GPTModel(nn.Module): method __init__ (line 212) | def __init__(self, cfg): method forward (line 232) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 271) | def reset_kv_cache(self): function generate_text_simple (line 278) | def generate_text_simple(model, idx, max_new_tokens, context_size): function generate_text_simple_cached (line 306) | def generate_text_simple_cached(model, idx, max_new_tokens, context_size... function main (line 343) | def main(): FILE: ch04/03_kv-cache/tests.py function test_gpt_model_equivalence_not_cached (line 32) | def test_gpt_model_equivalence_not_cached(ModelClass): function test_gpt_model_equivalence_cached (line 66) | def test_gpt_model_equivalence_cached(ModelClass): function test_context_overflow_bug (line 113) | def test_context_overflow_bug(): function test_prefill_chunking_basic (line 150) | def test_prefill_chunking_basic(): FILE: ch04/04_gqa/gpt_with_kv_gqa.py class GroupedQueryAttention (line 20) | class GroupedQueryAttention(nn.Module): method __init__ (line 21) | def __init__( method forward (line 45) | def forward(self, x, use_cache=False): method reset_cache (line 121) | def reset_cache(self): class LayerNorm (line 129) | class LayerNorm(nn.Module): method __init__ (line 130) | def __init__(self, emb_dim): method forward (line 136) | def forward(self, x): class GELU (line 143) | class GELU(nn.Module): method __init__ (line 144) | def __init__(self): method forward (line 147) | def forward(self, x): class FeedForward (line 154) | class FeedForward(nn.Module): method __init__ (line 155) | def __init__(self, cfg): method forward (line 163) | def forward(self, x): class TransformerBlock (line 167) | class TransformerBlock(nn.Module): method __init__ (line 168) | def __init__(self, cfg): method forward (line 182) | def forward(self, x, use_cache=False): class GPTModel (line 206) | class GPTModel(nn.Module): method __init__ (line 207) | def __init__(self, cfg): method forward (line 226) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 258) | def reset_kv_cache(self): function generate_text_simple_cached (line 265) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 292) | def main(): FILE: ch04/04_gqa/gpt_with_kv_mha.py class MultiHeadAttention (line 20) | class MultiHeadAttention(nn.Module): method __init__ (line 21) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False): method forward (line 42) | def forward(self, x, use_cache=False): method reset_cache (line 110) | def reset_cache(self): class LayerNorm (line 118) | class LayerNorm(nn.Module): method __init__ (line 119) | def __init__(self, emb_dim): method forward (line 125) | def forward(self, x): class GELU (line 132) | class GELU(nn.Module): method __init__ (line 133) | def __init__(self): method forward (line 136) | def forward(self, x): class FeedForward (line 143) | class FeedForward(nn.Module): method __init__ (line 144) | def __init__(self, cfg): method forward (line 152) | def forward(self, x): class TransformerBlock (line 156) | class TransformerBlock(nn.Module): method __init__ (line 157) | def __init__(self, cfg): method forward (line 170) | def forward(self, x, use_cache=False): class GPTModel (line 194) | class GPTModel(nn.Module): method __init__ (line 195) | def __init__(self, cfg): method forward (line 214) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 246) | def reset_kv_cache(self): function generate_text_simple_cached (line 253) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 280) | def main(): FILE: ch04/04_gqa/memory_estimator_gqa.py function convert_bytes (line 21) | def convert_bytes(n): function calc_kv_bytes_total (line 26) | def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads, function main (line 33) | def main(): FILE: ch04/04_gqa/plot_memory_estimates_gqa.py function bytes_convert (line 14) | def bytes_convert(n): function savings_percent (line 19) | def savings_percent(total_mha, total_gqa): function plot_abs_kv_vs_context_multi_groups (line 23) | def plot_abs_kv_vs_context_multi_groups(): FILE: ch04/05_mla/gpt_with_kv_mha.py class MultiHeadAttention (line 20) | class MultiHeadAttention(nn.Module): method __init__ (line 21) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False): method forward (line 42) | def forward(self, x, use_cache=False): method reset_cache (line 110) | def reset_cache(self): class LayerNorm (line 118) | class LayerNorm(nn.Module): method __init__ (line 119) | def __init__(self, emb_dim): method forward (line 125) | def forward(self, x): class GELU (line 132) | class GELU(nn.Module): method __init__ (line 133) | def __init__(self): method forward (line 136) | def forward(self, x): class FeedForward (line 143) | class FeedForward(nn.Module): method __init__ (line 144) | def __init__(self, cfg): method forward (line 152) | def forward(self, x): class TransformerBlock (line 156) | class TransformerBlock(nn.Module): method __init__ (line 157) | def __init__(self, cfg): method forward (line 170) | def forward(self, x, use_cache=False): class GPTModel (line 194) | class GPTModel(nn.Module): method __init__ (line 195) | def __init__(self, cfg): method forward (line 214) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 246) | def reset_kv_cache(self): function generate_text_simple_cached (line 253) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 280) | def main(): FILE: ch04/05_mla/gpt_with_kv_mla.py class MultiHeadLatentAttention (line 24) | class MultiHeadLatentAttention(nn.Module): method __init__ (line 25) | def __init__(self, d_in, d_out, dropout, num_heads, method reset_cache (line 50) | def reset_cache(self): method _reshape_to_heads (line 55) | def _reshape_to_heads(x, num_heads, head_dim): method forward (line 60) | def forward(self, x, use_cache=False): class LayerNorm (line 124) | class LayerNorm(nn.Module): method __init__ (line 125) | def __init__(self, emb_dim): method forward (line 131) | def forward(self, x): class GELU (line 138) | class GELU(nn.Module): method __init__ (line 139) | def __init__(self): method forward (line 142) | def forward(self, x): class FeedForward (line 149) | class FeedForward(nn.Module): method __init__ (line 150) | def __init__(self, cfg): method forward (line 158) | def forward(self, x): class TransformerBlock (line 162) | class TransformerBlock(nn.Module): method __init__ (line 163) | def __init__(self, cfg): method forward (line 178) | def forward(self, x, use_cache=False): class GPTModel (line 202) | class GPTModel(nn.Module): method __init__ (line 203) | def __init__(self, cfg): method forward (line 222) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 254) | def reset_kv_cache(self): function generate_text_simple_cached (line 261) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 288) | def main(): FILE: ch04/05_mla/memory_estimator_mla.py function convert_bytes (line 20) | def convert_bytes(n): function calc_kv_bytes_total (line 25) | def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads, function calc_mla_bytes_total (line 33) | def calc_mla_bytes_total(batch, context_length, n_layers, latent_dim, by... function main (line 39) | def main(): FILE: ch04/05_mla/plot_memory_estimates_mla.py function convert_bytes_to_gb (line 18) | def convert_bytes_to_gb(n_bytes): function calc_kv_bytes_total_mha (line 22) | def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_heads, function calc_kv_bytes_total_mla (line 29) | def calc_kv_bytes_total_mla(batch, context_length, n_layers, latent_dim,... function plot_abs_kv_vs_context_multiple (line 33) | def plot_abs_kv_vs_context_multiple(): FILE: ch04/06_swa/gpt_with_kv_mha.py class MultiHeadAttention (line 20) | class MultiHeadAttention(nn.Module): method __init__ (line 21) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False): method forward (line 42) | def forward(self, x, use_cache=False): method reset_cache (line 110) | def reset_cache(self): class LayerNorm (line 118) | class LayerNorm(nn.Module): method __init__ (line 119) | def __init__(self, emb_dim): method forward (line 125) | def forward(self, x): class GELU (line 132) | class GELU(nn.Module): method __init__ (line 133) | def __init__(self): method forward (line 136) | def forward(self, x): class FeedForward (line 143) | class FeedForward(nn.Module): method __init__ (line 144) | def __init__(self, cfg): method forward (line 152) | def forward(self, x): class TransformerBlock (line 156) | class TransformerBlock(nn.Module): method __init__ (line 157) | def __init__(self, cfg): method forward (line 170) | def forward(self, x, use_cache=False): class GPTModel (line 194) | class GPTModel(nn.Module): method __init__ (line 195) | def __init__(self, cfg): method forward (line 214) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 246) | def reset_kv_cache(self): function generate_text_simple_cached (line 253) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 280) | def main(): FILE: ch04/06_swa/gpt_with_kv_swa.py class MultiHeadAttentionWithSWA (line 20) | class MultiHeadAttentionWithSWA(nn.Module): method __init__ (line 21) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False, sl... method forward (line 43) | def forward(self, x, use_cache=False): method reset_cache (line 127) | def reset_cache(self): class LayerNorm (line 135) | class LayerNorm(nn.Module): method __init__ (line 136) | def __init__(self, emb_dim): method forward (line 142) | def forward(self, x): class GELU (line 149) | class GELU(nn.Module): method __init__ (line 150) | def __init__(self): method forward (line 153) | def forward(self, x): class FeedForward (line 160) | class FeedForward(nn.Module): method __init__ (line 161) | def __init__(self, cfg): method forward (line 169) | def forward(self, x): class TransformerBlock (line 173) | class TransformerBlock(nn.Module): method __init__ (line 174) | def __init__(self, cfg): method forward (line 189) | def forward(self, x, use_cache=False): class GPTModel (line 213) | class GPTModel(nn.Module): method __init__ (line 214) | def __init__(self, cfg): method forward (line 247) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 279) | def reset_kv_cache(self): function generate_text_simple_cached (line 286) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 313) | def main(): FILE: ch04/06_swa/memory_estimator_swa.py function convert_bytes (line 20) | def convert_bytes(n): function calc_kv_bytes_per_layer (line 25) | def calc_kv_bytes_per_layer(batch, context_length, head_dim, n_kv_heads,... function parse_ratio (line 30) | def parse_ratio(ratio_str): function distribute_layers (line 41) | def distribute_layers(n_layers, a, b): function estimate_totals (line 50) | def estimate_totals(context_length, sliding_window_size, emb_dim, n_head... function main (line 92) | def main(): FILE: ch04/06_swa/plot_memory_estimates_swa.py function convert_bytes_to_gb (line 27) | def convert_bytes_to_gb(n_bytes): function parse_ratio (line 31) | def parse_ratio(ratio_str): function calc_kv_bytes_total_mha (line 42) | def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, by... function calc_kv_bytes_total_gqa (line 48) | def calc_kv_bytes_total_gqa( function calc_kv_bytes_total_mha_swa (line 57) | def calc_kv_bytes_total_mha_swa( function calc_kv_bytes_total_gqa_swa (line 75) | def calc_kv_bytes_total_gqa_swa( function main (line 104) | def main(): FILE: ch04/07_moe/gpt_with_kv_ffn.py class MultiHeadAttention (line 23) | class MultiHeadAttention(nn.Module): method __init__ (line 24) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False): method forward (line 45) | def forward(self, x, use_cache=False): method reset_cache (line 113) | def reset_cache(self): class LayerNorm (line 121) | class LayerNorm(nn.Module): method __init__ (line 122) | def __init__(self, emb_dim): method forward (line 128) | def forward(self, x): class GELU (line 135) | class GELU(nn.Module): method __init__ (line 136) | def __init__(self): method forward (line 139) | def forward(self, x): class FeedForward (line 159) | class FeedForward(nn.Module): method __init__ (line 160) | def __init__(self, cfg): method forward (line 166) | def forward(self, x): class TransformerBlock (line 170) | class TransformerBlock(nn.Module): method __init__ (line 171) | def __init__(self, cfg): method forward (line 185) | def forward(self, x, use_cache=False): class GPTModel (line 220) | class GPTModel(nn.Module): method __init__ (line 221) | def __init__(self, cfg): method forward (line 240) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 272) | def reset_kv_cache(self): function generate_text_simple_cached (line 279) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 343) | def main(): FILE: ch04/07_moe/gpt_with_kv_moe.py class MultiHeadAttention (line 23) | class MultiHeadAttention(nn.Module): method __init__ (line 24) | def __init__(self, d_in, d_out, dropout, num_heads, qkv_bias=False): method forward (line 45) | def forward(self, x, use_cache=False): method reset_cache (line 113) | def reset_cache(self): class LayerNorm (line 121) | class LayerNorm(nn.Module): method __init__ (line 122) | def __init__(self, emb_dim): method forward (line 128) | def forward(self, x): class GELU (line 135) | class GELU(nn.Module): method __init__ (line 136) | def __init__(self): method forward (line 139) | def forward(self, x): class FeedForward (line 146) | class FeedForward(nn.Module): method __init__ (line 147) | def __init__(self, cfg): method forward (line 155) | def forward(self, x): class MoEFeedForward (line 159) | class MoEFeedForward(nn.Module): method __init__ (line 160) | def __init__(self, cfg): method forward (line 186) | def forward(self, x): class TransformerBlock (line 230) | class TransformerBlock(nn.Module): method __init__ (line 231) | def __init__(self, cfg): method forward (line 245) | def forward(self, x, use_cache=False): class GPTModel (line 280) | class GPTModel(nn.Module): method __init__ (line 281) | def __init__(self, cfg): method forward (line 300) | def forward(self, in_idx, use_cache=False): method reset_kv_cache (line 332) | def reset_kv_cache(self): function generate_text_simple_cached (line 339) | def generate_text_simple_cached(model, idx, max_new_tokens, function main (line 403) | def main(): FILE: ch04/07_moe/memory_estimator_moe.py function convert_bytes (line 17) | def convert_bytes(n): function get_num_param_matrices (line 22) | def get_num_param_matrices(ffn_type): function calc_ffn_params (line 31) | def calc_ffn_params(emb_dim, hidden_dim, ffn_type): function calc_router_params (line 35) | def calc_router_params(emb_dim, num_experts): function estimate_params_and_hidden (line 39) | def estimate_params_and_hidden( function main (line 67) | def main(): FILE: ch04/07_moe/plot_memory_estimates_moe.py function calc_moe_active_and_total (line 16) | def calc_moe_active_and_total( function plot_active_params_vs_experts (line 42) | def plot_active_params_vs_experts( function main (line 93) | def main(): FILE: ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py function calc_kv_bytes_total_mha (line 20) | def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, by... function calc_kv_bytes_total_deltanet_no_conv (line 27) | def calc_kv_bytes_total_deltanet_no_conv(batch, emb_dim, n_layers, bytes... function convert_to_gb (line 34) | def convert_to_gb(x): function main (line 38) | def main(): FILE: ch05/01_main-chapter-code/gpt_download.py function download_and_load_gpt2 (line 16) | def download_and_load_gpt2(model_size, models_dir): function download_file (line 48) | def download_file(url, destination, backup_url=None): function load_gpt2_params_from_tf_ckpt (line 126) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): FILE: ch05/01_main-chapter-code/gpt_generate.py function text_to_token_ids (line 21) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 27) | def token_ids_to_text(token_ids, tokenizer): function download_and_load_gpt2 (line 32) | def download_and_load_gpt2(model_size, models_dir): function download_file (line 62) | def download_file(url, destination): function load_gpt2_params_from_tf_ckpt (line 91) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): function assign (line 120) | def assign(left, right): function load_weights_into_gpt (line 126) | def load_weights_into_gpt(gpt, params): function generate (line 187) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ... function main (line 230) | def main(gpt_config, input_prompt, model_size, device): FILE: ch05/01_main-chapter-code/gpt_train.py function text_to_token_ids (line 17) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 23) | def token_ids_to_text(token_ids, tokenizer): function calc_loss_batch (line 28) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 35) | def calc_loss_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 52) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function generate_and_print_sample (line 61) | def generate_and_print_sample(model, tokenizer, device, start_context): function train_model_simple (line 75) | def train_model_simple(model, train_loader, val_loader, optimizer, devic... function plot_losses (line 112) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): function main (line 131) | def main(gpt_config, settings): FILE: ch05/01_main-chapter-code/previous_chapters.py class GPTDatasetV1 (line 20) | class GPTDatasetV1(Dataset): method __init__ (line 21) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 35) | def __len__(self): method __getitem__ (line 38) | def __getitem__(self, idx): function create_dataloader_v1 (line 42) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 60) | class MultiHeadAttention(nn.Module): method __init__ (line 61) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 76) | def forward(self, x): class LayerNorm (line 119) | class LayerNorm(nn.Module): method __init__ (line 120) | def __init__(self, emb_dim): method forward (line 126) | def forward(self, x): class GELU (line 133) | class GELU(nn.Module): method __init__ (line 134) | def __init__(self): method forward (line 137) | def forward(self, x): class FeedForward (line 144) | class FeedForward(nn.Module): method __init__ (line 145) | def __init__(self, cfg): method forward (line 153) | def forward(self, x): class TransformerBlock (line 157) | class TransformerBlock(nn.Module): method __init__ (line 158) | def __init__(self, cfg): method forward (line 172) | def forward(self, x): class GPTModel (line 190) | class GPTModel(nn.Module): method __init__ (line 191) | def __init__(self, cfg): method forward (line 203) | def forward(self, in_idx): function generate_text_simple (line 215) | def generate_text_simple(model, idx, max_new_tokens, context_size): FILE: ch05/01_main-chapter-code/tests.py function gpt_config (line 13) | def gpt_config(): function other_settings (line 26) | def other_settings(): function test_main (line 35) | def test_main(gpt_config, other_settings): function check_file_size (line 43) | def check_file_size(url, expected_size): function test_model_files (line 63) | def test_model_files(): FILE: ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py function is_english (line 17) | def is_english(text, threshold=0.9): function combine_files (line 22) | def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|... FILE: ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py function read_text_file (line 28) | def read_text_file(file_path): function create_dataloaders (line 34) | def create_dataloaders(text_data, train_ratio, batch_size, max_length, s... function convert_time (line 57) | def convert_time(seconds): function print_eta (line 63) | def print_eta(start_time, book_start_time, index, total_files): function train_model_simple (line 80) | def train_model_simple(model, optimizer, device, n_epochs, FILE: ch05/03_bonus_pretraining_on_gutenberg/tests.py function test_pretraining (line 13) | def test_pretraining(): FILE: ch05/05_bonus_hparam_tuning/hparam_search.py function calc_loss_loader (line 31) | def calc_loss_loader(data_loader, model, device, num_batches=None): function calc_loss_batch (line 48) | def calc_loss_batch(input_batch, target_batch, model, device): function evaluate_model (line 57) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function train_model (line 66) | def train_model(model, train_loader, val_loader, optimizer, device, FILE: ch05/06_user_interface/app_orig.py function get_model_and_tokenizer (line 24) | def get_model_and_tokenizer(): function main (line 68) | async def main(message: chainlit.Message): FILE: ch05/06_user_interface/app_own.py function get_model_and_tokenizer (line 26) | def get_model_and_tokenizer(): function main (line 62) | async def main(message: chainlit.Message): FILE: ch05/07_gpt_to_llama/previous_chapters.py function text_to_token_ids (line 16) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 22) | def token_ids_to_text(token_ids, tokenizer): function generate (line 27) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ... FILE: ch05/07_gpt_to_llama/tests/test_llama32_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function test_dummy_llama3_forward (line 54) | def test_dummy_llama3_forward(dummy_cfg_base, dummy_input, import_notebo... function test_llama3_base_equivalence_with_transformers (line 63) | def test_llama3_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py function litgpt_build_rope_cache (line 27) | def litgpt_build_rope_cache( function litgpt_apply_rope (line 79) | def litgpt_apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Ten... function notebook (line 96) | def notebook(): function set_seed (line 139) | def set_seed(): function test_rope_llama2 (line 143) | def test_rope_llama2(notebook): function test_rope_llama3 (line 207) | def test_rope_llama3(notebook): function test_rope_llama3_12 (line 277) | def test_rope_llama3_12(notebook): function test_silu (line 371) | def test_silu(notebook): function test_rmsnorm (line 378) | def test_rmsnorm(notebook): FILE: ch05/08_memory_efficient_weight_loading/previous_chapters.py class MultiHeadAttention (line 18) | class MultiHeadAttention(nn.Module): method __init__ (line 19) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 34) | def forward(self, x): class LayerNorm (line 77) | class LayerNorm(nn.Module): method __init__ (line 78) | def __init__(self, emb_dim): method forward (line 84) | def forward(self, x): class GELU (line 91) | class GELU(nn.Module): method __init__ (line 92) | def __init__(self): method forward (line 95) | def forward(self, x): class FeedForward (line 102) | class FeedForward(nn.Module): method __init__ (line 103) | def __init__(self, cfg): method forward (line 111) | def forward(self, x): class TransformerBlock (line 115) | class TransformerBlock(nn.Module): method __init__ (line 116) | def __init__(self, cfg): method forward (line 130) | def forward(self, x): class GPTModel (line 148) | class GPTModel(nn.Module): method __init__ (line 149) | def __init__(self, cfg): method forward (line 161) | def forward(self, in_idx): FILE: ch05/10_llm-training-speed/00_orig.py class GPTDatasetV1 (line 22) | class GPTDatasetV1(Dataset): method __init__ (line 23) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 37) | def __len__(self): method __getitem__ (line 40) | def __getitem__(self, idx): function create_dataloader_v1 (line 44) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 62) | class MultiHeadAttention(nn.Module): method __init__ (line 63) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 78) | def forward(self, x): class LayerNorm (line 121) | class LayerNorm(nn.Module): method __init__ (line 122) | def __init__(self, emb_dim): method forward (line 128) | def forward(self, x): class GELU (line 135) | class GELU(nn.Module): method __init__ (line 136) | def __init__(self): method forward (line 139) | def forward(self, x): class FeedForward (line 146) | class FeedForward(nn.Module): method __init__ (line 147) | def __init__(self, cfg): method forward (line 155) | def forward(self, x): class TransformerBlock (line 159) | class TransformerBlock(nn.Module): method __init__ (line 160) | def __init__(self, cfg): method forward (line 174) | def forward(self, x): class GPTModel (line 192) | class GPTModel(nn.Module): method __init__ (line 193) | def __init__(self, cfg): method forward (line 205) | def forward(self, in_idx): function generate_text_simple (line 217) | def generate_text_simple(model, idx, max_new_tokens, context_size): function text_to_token_ids (line 247) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 253) | def token_ids_to_text(token_ids, tokenizer): function calc_loss_batch (line 258) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 265) | def calc_loss_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 282) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function generate_and_print_sample (line 291) | def generate_and_print_sample(model, tokenizer, device, start_context): function train_model_simple_with_timing (line 305) | def train_model_simple_with_timing(model, train_loader, val_loader, opti... function plot_losses (line 387) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): function main (line 410) | def main(gpt_config, settings): FILE: ch05/10_llm-training-speed/01_opt_single_gpu.py class GPTDatasetV1 (line 22) | class GPTDatasetV1(Dataset): method __init__ (line 23) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 37) | def __len__(self): method __getitem__ (line 40) | def __getitem__(self, idx): function create_dataloader_v1 (line 44) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class PyTorchMultiHeadAttention (line 64) | class PyTorchMultiHeadAttention(nn.Module): method __init__ (line 65) | def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False): method forward (line 78) | def forward(self, x): class FeedForward (line 111) | class FeedForward(nn.Module): method __init__ (line 112) | def __init__(self, cfg): method forward (line 120) | def forward(self, x): class TransformerBlock (line 124) | class TransformerBlock(nn.Module): method __init__ (line 125) | def __init__(self, cfg): method forward (line 138) | def forward(self, x): class GPTModel (line 156) | class GPTModel(nn.Module): method __init__ (line 157) | def __init__(self, cfg): method forward (line 169) | def forward(self, in_idx): function generate_text_simple (line 181) | def generate_text_simple(model, idx, max_new_tokens, context_size): function text_to_token_ids (line 211) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 217) | def token_ids_to_text(token_ids, tokenizer): function calc_loss_batch (line 222) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 229) | def calc_loss_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 246) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function generate_and_print_sample (line 255) | def generate_and_print_sample(model, tokenizer, device, start_context): function train_model_simple_with_timing (line 269) | def train_model_simple_with_timing(model, train_loader, val_loader, opti... function plot_losses (line 351) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): function main (line 374) | def main(gpt_config, settings): FILE: ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py function ddp_setup (line 27) | def ddp_setup(rank, world_size): class GPTDatasetV1 (line 58) | class GPTDatasetV1(Dataset): method __init__ (line 59) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 73) | def __len__(self): method __getitem__ (line 76) | def __getitem__(self, idx): function create_dataloader_v1 (line 82) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class PyTorchMultiHeadAttention (line 107) | class PyTorchMultiHeadAttention(nn.Module): method __init__ (line 108) | def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False): method forward (line 121) | def forward(self, x): class FeedForward (line 154) | class FeedForward(nn.Module): method __init__ (line 155) | def __init__(self, cfg): method forward (line 163) | def forward(self, x): class TransformerBlock (line 167) | class TransformerBlock(nn.Module): method __init__ (line 168) | def __init__(self, cfg): method forward (line 181) | def forward(self, x): class GPTModel (line 199) | class GPTModel(nn.Module): method __init__ (line 200) | def __init__(self, cfg): method forward (line 212) | def forward(self, in_idx): function generate_text_simple (line 224) | def generate_text_simple(model, idx, max_new_tokens, context_size): function text_to_token_ids (line 254) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 260) | def token_ids_to_text(token_ids, tokenizer): function calc_loss_batch (line 265) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 272) | def calc_loss_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 289) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function generate_and_print_sample (line 298) | def generate_and_print_sample(model, device, start_context): function train_model_simple_with_timing (line 314) | def train_model_simple_with_timing(model, train_loader, val_loader, opti... function plot_losses (line 416) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): function main (line 440) | def main(gpt_config, settings, rank, world_size): FILE: ch05/11_qwen3/qwen3-chat-interface/qwen3-chat-interface-multiturn.py function get_qwen_config (line 33) | def get_qwen_config(name): function build_repo_and_local (line 53) | def build_repo_and_local(model_name, reasoning, local_dir_arg): function get_device (line 60) | def get_device(name): function get_model_and_tokenizer (line 76) | def get_model_and_tokenizer(qwen3_config, repo_id, local_dir, device, us... function build_prompt_from_history (line 99) | def build_prompt_from_history(history, add_assistant_header=True): function on_start (line 125) | async def on_start(): function main (line 133) | async def main(message: chainlit.Message): FILE: ch05/11_qwen3/qwen3-chat-interface/qwen3-chat-interface.py function get_qwen_config (line 32) | def get_qwen_config(name): function build_repo_and_local (line 52) | def build_repo_and_local(model_name, reasoning, local_dir_arg): function get_device (line 59) | def get_device(name): function get_model_and_tokenizer (line 75) | def get_model_and_tokenizer(qwen3_config, repo_id, local_dir, device, us... function on_start (line 105) | async def on_start(): function main (line 113) | async def main(message: chainlit.Message): FILE: ch05/11_qwen3/tests/test_qwen3_kvcache_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function dummy_cfg_moe (line 50) | def dummy_cfg_moe(dummy_cfg_base): function test_dummy_qwen3_forward (line 61) | def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, import_noteboo... function test_qwen3_base_equivalence_with_transformers (line 71) | def test_qwen3_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/11_qwen3/tests/test_qwen3_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function dummy_cfg_moe (line 50) | def dummy_cfg_moe(dummy_cfg_base): function test_dummy_qwen3_forward (line 61) | def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, import_noteboo... function test_qwen3_base_equivalence_with_transformers (line 71) | def test_qwen3_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/12_gemma3/tests/test_gemma3_kv_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function test_dummy_gemma3_forward (line 53) | def test_dummy_gemma3_forward(dummy_cfg_base, dummy_input, import_notebo... function test_gemma3_base_equivalence_with_transformers (line 62) | def test_gemma3_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/12_gemma3/tests/test_gemma3_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function test_dummy_gemma3_forward (line 53) | def test_dummy_gemma3_forward(dummy_cfg_base, dummy_input, import_notebo... function test_gemma3_base_equivalence_with_transformers (line 62) | def test_gemma3_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/13_olmo3/tests/olmo3_layer_debugger.py function tiny_debug_config (line 20) | def tiny_debug_config(): function yarn_debug_config (line 46) | def yarn_debug_config(): function _hf_config_from_dict (line 74) | def _hf_config_from_dict(cfg): function load_notebook_defs (line 114) | def load_notebook_defs(nb_name="standalone-olmo3.ipynb"): function build_olmo3_pair (line 119) | def build_olmo3_pair(import_notebook_defs, cfg, hf_checkpoint=None): function _attach_debug_hooks (line 143) | def _attach_debug_hooks(model, is_hf): function _layer_sort_key (line 169) | def _layer_sort_key(name): function layerwise_differences (line 182) | def layerwise_differences(ours, hf_model, input_ids, rtol=1e-5, atol=1e-5): function first_mismatch (line 244) | def first_mismatch(differences): function format_report (line 251) | def format_report(differences): FILE: ch05/13_olmo3/tests/test_olmo3_kvcache_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function test_dummy_olmo3_forward (line 58) | def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, import_noteboo... function test_olmo3_base_equivalence_with_transformers (line 68) | def test_olmo3_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/13_olmo3/tests/test_olmo3_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function test_dummy_olmo3_forward (line 58) | def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, import_noteboo... function test_olmo3_base_equivalence_with_transformers (line 68) | def test_olmo3_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/15_tiny-aya/tests/test_tiny_aya_kvcache_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function test_dummy_tiny_aya_forward (line 55) | def test_dummy_tiny_aya_forward(dummy_cfg_base, dummy_input, import_note... function test_tiny_aya_base_equivalence_with_transformers (line 65) | def test_tiny_aya_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/15_tiny-aya/tests/test_tiny_aya_nb.py function import_notebook_defs (line 19) | def import_notebook_defs(): function dummy_input (line 26) | def dummy_input(): function dummy_cfg_base (line 32) | def dummy_cfg_base(): function test_dummy_tiny_aya_forward (line 54) | def test_dummy_tiny_aya_forward(dummy_cfg_base, dummy_input, import_note... function test_tiny_aya_base_equivalence_with_transformers (line 64) | def test_tiny_aya_base_equivalence_with_transformers(import_notebook_defs): FILE: ch05/15_tiny-aya/tests/tiny_aya_layer_debugger.py function tiny_debug_config (line 19) | def tiny_debug_config(): function _hf_config_from_dict (line 41) | def _hf_config_from_dict(cfg): function load_notebook_defs (line 65) | def load_notebook_defs(nb_name="standalone-tiny-aya.ipynb"): function build_tiny_aya_pair (line 70) | def build_tiny_aya_pair(import_notebook_defs, cfg, hf_checkpoint=None): function _attach_debug_hooks (line 93) | def _attach_debug_hooks(model, is_hf): function _layer_sort_key (line 125) | def _layer_sort_key(name): function layerwise_differences (line 138) | def layerwise_differences(ours, hf_model, input_ids, rtol=1e-5, atol=1e-5): function format_report (line 199) | def format_report(differences): FILE: ch05/16_qwen3.5/qwen3_5_transformers.py class _NotebookLogger (line 25) | class _NotebookLogger: method __init__ (line 26) | def __init__(self): method warning_once (line 29) | def warning_once(self, msg): class Qwen3_5Config (line 40) | class Qwen3_5Config: class Qwen3_5DynamicCache (line 44) | class Qwen3_5DynamicCache: class Qwen3_5RMSNormGated (line 48) | class Qwen3_5RMSNormGated(nn.Module): method __init__ (line 49) | def __init__(self, hidden_size, eps=1e-6, **kwargs): method forward (line 54) | def forward(self, hidden_states, gate=None): function apply_mask_to_padding_states (line 66) | def apply_mask_to_padding_states(hidden_states, attention_mask): function torch_causal_conv1d_update (line 78) | def torch_causal_conv1d_update( function l2norm (line 96) | def l2norm(x, dim=-1, eps=1e-6): function torch_chunk_gated_delta_rule (line 102) | def torch_chunk_gated_delta_rule( function torch_recurrent_gated_delta_rule (line 182) | def torch_recurrent_gated_delta_rule( class Qwen3_5GatedDeltaNet (line 226) | class Qwen3_5GatedDeltaNet(nn.Module): method __init__ (line 227) | def __init__(self, config, layer_idx): method forward (line 296) | def forward( FILE: ch05/16_qwen3.5/tests/qwen3_5_layer_debugger.py function _import_qwen3_5_classes (line 14) | def _import_qwen3_5_classes(): function tiny_debug_config (line 44) | def tiny_debug_config(): function _hf_config_from_dict (line 68) | def _hf_config_from_dict(cfg): function load_notebook_defs (line 105) | def load_notebook_defs(nb_name="qwen3.5.ipynb"): function build_qwen3_5_pair (line 112) | def build_qwen3_5_pair(import_notebook_defs, cfg, hf_checkpoint=None): function _attach_debug_hooks (line 140) | def _attach_debug_hooks(model, is_hf): function _layer_sort_key (line 174) | def _layer_sort_key(name): function layerwise_differences (line 187) | def layerwise_differences(ours, hf_model, input_ids, rtol=1e-5, atol=1e-5): function format_report (line 248) | def format_report(differences): FILE: ch05/16_qwen3.5/tests/test_qwen3_5_nb.py function _import_qwen3_5_classes (line 16) | def _import_qwen3_5_classes(): function import_notebook_defs (line 51) | def import_notebook_defs(): function dummy_input (line 61) | def dummy_input(): function dummy_cfg_base (line 67) | def dummy_cfg_base(): function test_dummy_qwen3_5_forward (line 92) | def test_dummy_qwen3_5_forward(dummy_cfg_base, dummy_input, import_noteb... function test_qwen3_5_base_equivalence_with_transformers (line 103) | def test_qwen3_5_base_equivalence_with_transformers(import_notebook_defs): FILE: ch06/01_main-chapter-code/gpt_class_finetune.py function download_and_unzip_spam_data (line 24) | def download_and_unzip_spam_data(url, zip_path, extracted_path, data_fil... function create_balanced_dataset (line 47) | def create_balanced_dataset(df): function random_split (line 60) | def random_split(df, train_frac, validation_frac): class SpamDataset (line 76) | class SpamDataset(Dataset): method __init__ (line 77) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=... method __getitem__ (line 101) | def __getitem__(self, index): method __len__ (line 109) | def __len__(self): method _longest_encoded_length (line 112) | def _longest_encoded_length(self): function calc_accuracy_loader (line 124) | def calc_accuracy_loader(data_loader, model, device, num_batches=None): function calc_loss_batch (line 147) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 154) | def calc_loss_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 171) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function train_classifier_simple (line 180) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ... function plot_values (line 218) | def plot_values(epochs_seen, examples_seen, train_values, val_values, la... FILE: ch06/01_main-chapter-code/gpt_download.py function download_and_load_gpt2 (line 17) | def download_and_load_gpt2(model_size, models_dir): function download_file (line 49) | def download_file(url, destination, backup_url=None): function load_gpt2_params_from_tf_ckpt (line 131) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): FILE: ch06/01_main-chapter-code/previous_chapters.py class GPTDatasetV1 (line 21) | class GPTDatasetV1(Dataset): method __init__ (line 22) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 36) | def __len__(self): method __getitem__ (line 39) | def __getitem__(self, idx): function create_dataloader_v1 (line 43) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 61) | class MultiHeadAttention(nn.Module): method __init__ (line 62) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 77) | def forward(self, x): class LayerNorm (line 120) | class LayerNorm(nn.Module): method __init__ (line 121) | def __init__(self, emb_dim): method forward (line 127) | def forward(self, x): class GELU (line 134) | class GELU(nn.Module): method __init__ (line 135) | def __init__(self): method forward (line 138) | def forward(self, x): class FeedForward (line 145) | class FeedForward(nn.Module): method __init__ (line 146) | def __init__(self, cfg): method forward (line 154) | def forward(self, x): class TransformerBlock (line 158) | class TransformerBlock(nn.Module): method __init__ (line 159) | def __init__(self, cfg): method forward (line 173) | def forward(self, x): class GPTModel (line 191) | class GPTModel(nn.Module): method __init__ (line 192) | def __init__(self, cfg): method forward (line 204) | def forward(self, in_idx): function generate_text_simple (line 216) | def generate_text_simple(model, idx, max_new_tokens, context_size): function assign (line 245) | def assign(left, right): function load_weights_into_gpt (line 251) | def load_weights_into_gpt(gpt, params): function text_to_token_ids (line 312) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 318) | def token_ids_to_text(token_ids, tokenizer): FILE: ch06/01_main-chapter-code/tests.py function test_gpt_class_finetune (line 12) | def test_gpt_class_finetune(): FILE: ch06/02_bonus_additional-experiments/additional_experiments.py class LoRALayer (line 32) | class LoRALayer(torch.nn.Module): method __init__ (line 33) | def __init__(self, in_dim, out_dim, rank, alpha): method forward (line 40) | def forward(self, x): class LinearWithLoRA (line 45) | class LinearWithLoRA(torch.nn.Module): method __init__ (line 46) | def __init__(self, linear, rank, alpha): method forward (line 53) | def forward(self, x): class LinearWithLoRAMerged (line 58) | class LinearWithLoRAMerged(torch.nn.Module): method __init__ (line 59) | def __init__(self, linear, rank, alpha): method forward (line 66) | def forward(self, x): class SpamDataset (line 72) | class SpamDataset(Dataset): method __init__ (line 73) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=... method __getitem__ (line 90) | def __getitem__(self, index): method __len__ (line 95) | def __len__(self): method _longest_encoded_length (line 98) | def _longest_encoded_length(self, tokenizer): function download_and_unzip (line 110) | def download_and_unzip(url, zip_path, extract_to, new_file_path): function random_split (line 133) | def random_split(df, train_frac, val_frac): function create_dataset_csvs (line 149) | def create_dataset_csvs(new_file_path): function instantiate_model (line 166) | def instantiate_model(choose_model, load_weights): function calc_loss_batch (line 197) | def calc_loss_batch(input_batch, target_batch, model, device, function calc_loss_loader (line 231) | def calc_loss_loader(data_loader, model, device, function calc_accuracy_loader (line 257) | def calc_accuracy_loader(data_loader, model, device, num_batches=None, function evaluate_model (line 310) | def evaluate_model(model, train_loader, val_loader, device, function train_classifier_simple (line 329) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ... function replace_linear_with_lora (line 398) | def replace_linear_with_lora(model, rank, alpha, alternative=False): FILE: ch06/02_bonus_additional-experiments/gpt_download.py function download_and_load_gpt2 (line 17) | def download_and_load_gpt2(model_size, models_dir): function download_file (line 49) | def download_file(url, destination, backup_url=None): function load_gpt2_params_from_tf_ckpt (line 131) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): FILE: ch06/02_bonus_additional-experiments/previous_chapters.py class GPTDatasetV1 (line 21) | class GPTDatasetV1(Dataset): method __init__ (line 22) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 36) | def __len__(self): method __getitem__ (line 39) | def __getitem__(self, idx): function create_dataloader_v1 (line 43) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 61) | class MultiHeadAttention(nn.Module): method __init__ (line 62) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 80) | def forward(self, x): class LayerNorm (line 124) | class LayerNorm(nn.Module): method __init__ (line 125) | def __init__(self, emb_dim): method forward (line 131) | def forward(self, x): class GELU (line 138) | class GELU(nn.Module): method __init__ (line 139) | def __init__(self): method forward (line 142) | def forward(self, x): class FeedForward (line 149) | class FeedForward(nn.Module): method __init__ (line 150) | def __init__(self, cfg): method forward (line 158) | def forward(self, x): class TransformerBlock (line 162) | class TransformerBlock(nn.Module): method __init__ (line 163) | def __init__(self, cfg, disable_causal_mask=False): method forward (line 179) | def forward(self, x): class GPTModel (line 197) | class GPTModel(nn.Module): method __init__ (line 198) | def __init__(self, cfg, disable_causal_mask=False): method forward (line 210) | def forward(self, in_idx): function generate_text_simple (line 222) | def generate_text_simple(model, idx, max_new_tokens, context_size): function assign (line 251) | def assign(left, right): function load_weights_into_gpt (line 257) | def load_weights_into_gpt(gpt, params): function generate (line 318) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ... FILE: ch06/03_bonus_imdb-classification/download_prepare_dataset.py function reporthook (line 14) | def reporthook(count, block_size, total_size): function download_and_extract_dataset (line 31) | def download_and_extract_dataset(dataset_url, target_file, directory): function load_dataset_to_dataframe (line 51) | def load_dataset_to_dataframe(basepath="aclImdb", labels={"pos": 1, "neg... function partition_and_save (line 66) | def partition_and_save(df, sizes=(35000, 5000, 10000)): FILE: ch06/03_bonus_imdb-classification/gpt_download.py function download_and_load_gpt2 (line 17) | def download_and_load_gpt2(model_size, models_dir): function download_file (line 49) | def download_file(url, destination, backup_url=None): function load_gpt2_params_from_tf_ckpt (line 131) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): FILE: ch06/03_bonus_imdb-classification/previous_chapters.py class GPTDatasetV1 (line 21) | class GPTDatasetV1(Dataset): method __init__ (line 22) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 37) | def __len__(self): method __getitem__ (line 40) | def __getitem__(self, idx): function create_dataloader_v1 (line 44) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 62) | class MultiHeadAttention(nn.Module): method __init__ (line 63) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 78) | def forward(self, x): class LayerNorm (line 121) | class LayerNorm(nn.Module): method __init__ (line 122) | def __init__(self, emb_dim): method forward (line 128) | def forward(self, x): class GELU (line 135) | class GELU(nn.Module): method __init__ (line 136) | def __init__(self): method forward (line 139) | def forward(self, x): class FeedForward (line 146) | class FeedForward(nn.Module): method __init__ (line 147) | def __init__(self, cfg): method forward (line 155) | def forward(self, x): class TransformerBlock (line 159) | class TransformerBlock(nn.Module): method __init__ (line 160) | def __init__(self, cfg): method forward (line 174) | def forward(self, x): class GPTModel (line 192) | class GPTModel(nn.Module): method __init__ (line 193) | def __init__(self, cfg): method forward (line 205) | def forward(self, in_idx): function generate_text_simple (line 217) | def generate_text_simple(model, idx, max_new_tokens, context_size): function assign (line 246) | def assign(left, right): function load_weights_into_gpt (line 252) | def load_weights_into_gpt(gpt, params): function text_to_token_ids (line 313) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 319) | def token_ids_to_text(token_ids, tokenizer): FILE: ch06/03_bonus_imdb-classification/train_bert_hf.py class IMDbDataset (line 18) | class IMDbDataset(Dataset): method __init__ (line 19) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=... method _create_attention_mask (line 43) | def _create_attention_mask(self, encoded_text): method __getitem__ (line 46) | def __getitem__(self, index): method __len__ (line 61) | def __len__(self): method _longest_encoded_length (line 64) | def _longest_encoded_length(self, tokenizer): function calc_loss_batch (line 73) | def calc_loss_batch(input_batch, attention_mask_batch, target_batch, mod... function calc_loss_loader (line 83) | def calc_loss_loader(data_loader, model, device, num_batches=None): function calc_accuracy_loader (line 101) | def calc_accuracy_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 123) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function train_classifier_simple (line 132) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ... FILE: ch06/03_bonus_imdb-classification/train_bert_hf_spam.py class SpamDataset (line 21) | class SpamDataset(Dataset): method __init__ (line 22) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=... method __getitem__ (line 39) | def __getitem__(self, index): method __len__ (line 44) | def __len__(self): method _longest_encoded_length (line 47) | def _longest_encoded_length(self, tokenizer): function download_and_unzip (line 59) | def download_and_unzip(url, zip_path, extract_to, new_file_path): function random_split (line 82) | def random_split(df, train_frac, val_frac): function create_dataset_csvs (line 98) | def create_dataset_csvs(new_file_path): class SPAMDataset (line 115) | class SPAMDataset(Dataset): method __init__ (line 116) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=... method _create_attention_mask (line 140) | def _create_attention_mask(self, encoded_text): method __getitem__ (line 143) | def __getitem__(self, index): method __len__ (line 158) | def __len__(self): method _longest_encoded_length (line 161) | def _longest_encoded_length(self, tokenizer): function calc_loss_batch (line 170) | def calc_loss_batch(input_batch, attention_mask_batch, target_batch, mod... function calc_loss_loader (line 180) | def calc_loss_loader(data_loader, model, device, num_batches=None): function calc_accuracy_loader (line 198) | def calc_accuracy_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 220) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function train_classifier_simple (line 229) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ... FILE: ch06/03_bonus_imdb-classification/train_gpt.py class IMDbDataset (line 20) | class IMDbDataset(Dataset): method __init__ (line 21) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=... method __getitem__ (line 36) | def __getitem__(self, index): method __len__ (line 41) | def __len__(self): method _longest_encoded_length (line 44) | def _longest_encoded_length(self, tokenizer): function instantiate_model (line 53) | def instantiate_model(choose_model, load_weights): function calc_loss_batch (line 84) | def calc_loss_batch(input_batch, target_batch, model, device, function calc_loss_loader (line 100) | def calc_loss_loader(data_loader, model, device, function calc_accuracy_loader (line 125) | def calc_accuracy_loader(data_loader, model, device, function evaluate_model (line 156) | def evaluate_model(model, train_loader, val_loader, device, eval_iter, function train_classifier_simple (line 172) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ... FILE: ch06/03_bonus_imdb-classification/train_sklearn_logreg.py function load_dataframes (line 14) | def load_dataframes(): function eval_model (line 22) | def eval_model(model, X_train, y_train, X_val, y_val, X_test, y_test): FILE: ch06/04_user_interface/app.py function get_model_and_tokenizer (line 22) | def get_model_and_tokenizer(): function main (line 69) | async def main(message: chainlit.Message): FILE: ch07/01_main-chapter-code/exercise_experiments.py class InstructionDataset (line 37) | class InstructionDataset(Dataset): method __init__ (line 38) | def __init__(self, data, tokenizer): method __getitem__ (line 51) | def __getitem__(self, index): method __len__ (line 54) | def __len__(self): class InstructionDatasetWithMasking (line 58) | class InstructionDatasetWithMasking(Dataset): method __init__ (line 59) | def __init__(self, data, tokenizer): method __getitem__ (line 79) | def __getitem__(self, index): method __len__ (line 83) | def __len__(self): class InstructionDatasetPhi (line 87) | class InstructionDatasetPhi(Dataset): method __init__ (line 88) | def __init__(self, data, tokenizer): method __getitem__ (line 105) | def __getitem__(self, index): method __len__ (line 108) | def __len__(self): class LinearWithLoRA (line 112) | class LinearWithLoRA(torch.nn.Module): method __init__ (line 113) | def __init__(self, linear, rank, alpha): method forward (line 120) | def forward(self, x): class LoRALayer (line 124) | class LoRALayer(torch.nn.Module): method __init__ (line 125) | def __init__(self, in_dim, out_dim, rank, alpha): method forward (line 132) | def forward(self, x): function replace_linear_with_lora (line 137) | def replace_linear_with_lora(model, rank, alpha): function custom_collate_fn (line 147) | def custom_collate_fn( function custom_collate_with_masking_fn (line 190) | def custom_collate_with_masking_fn( function download_and_load_file (line 236) | def download_and_load_file(file_path, url): function format_input_phi (line 253) | def format_input_phi(entry): function format_input (line 263) | def format_input(entry): function plot_losses (line 275) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, plot... function main (line 297) | def main(mask_instructions=False, alpaca52k=False, phi3_prompt=False, lo... FILE: ch07/01_main-chapter-code/gpt_download.py function download_and_load_gpt2 (line 16) | def download_and_load_gpt2(model_size, models_dir): function download_file (line 48) | def download_file(url, destination, backup_url=None): function load_gpt2_params_from_tf_ckpt (line 95) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): FILE: ch07/01_main-chapter-code/gpt_instruction_finetuning.py class InstructionDataset (line 35) | class InstructionDataset(Dataset): method __init__ (line 36) | def __init__(self, data, tokenizer): method __getitem__ (line 49) | def __getitem__(self, index): method __len__ (line 52) | def __len__(self): function custom_collate_fn (line 56) | def custom_collate_fn( function download_and_load_file (line 99) | def download_and_load_file(file_path, url): function format_input (line 113) | def format_input(entry): function plot_losses (line 125) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): function main (line 147) | def main(test_mode=False): FILE: ch07/01_main-chapter-code/ollama_evaluate.py function query_model (line 14) | def query_model(prompt, model="llama3", url="http://localhost:11434/api/... function check_if_running (line 42) | def check_if_running(process_name): function format_input (line 51) | def format_input(entry): function main (line 63) | def main(file_path): function generate_model_scores (line 79) | def generate_model_scores(json_data, json_key, model="llama3"): FILE: ch07/01_main-chapter-code/previous_chapters.py class GPTDatasetV1 (line 25) | class GPTDatasetV1(Dataset): method __init__ (line 26) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 41) | def __len__(self): method __getitem__ (line 44) | def __getitem__(self, idx): function create_dataloader_v1 (line 48) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 66) | class MultiHeadAttention(nn.Module): method __init__ (line 67) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 82) | def forward(self, x): class LayerNorm (line 125) | class LayerNorm(nn.Module): method __init__ (line 126) | def __init__(self, emb_dim): method forward (line 132) | def forward(self, x): class GELU (line 139) | class GELU(nn.Module): method __init__ (line 140) | def __init__(self): method forward (line 143) | def forward(self, x): class FeedForward (line 150) | class FeedForward(nn.Module): method __init__ (line 151) | def __init__(self, cfg): method forward (line 159) | def forward(self, x): class TransformerBlock (line 163) | class TransformerBlock(nn.Module): method __init__ (line 164) | def __init__(self, cfg): method forward (line 178) | def forward(self, x): class GPTModel (line 196) | class GPTModel(nn.Module): method __init__ (line 197) | def __init__(self, cfg): method forward (line 209) | def forward(self, in_idx): function generate_text_simple (line 221) | def generate_text_simple(model, idx, max_new_tokens, context_size): function generate (line 250) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ... function train_model_simple (line 293) | def train_model_simple(model, train_loader, val_loader, optimizer, devic... function evaluate_model (line 329) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function generate_and_print_sample (line 338) | def generate_and_print_sample(model, tokenizer, device, start_context): function assign (line 352) | def assign(left, right): function load_weights_into_gpt (line 358) | def load_weights_into_gpt(gpt, params): function text_to_token_ids (line 419) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 425) | def token_ids_to_text(token_ids, tokenizer): function calc_loss_batch (line 430) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 437) | def calc_loss_loader(data_loader, model, device, num_batches=None): function plot_losses (line 456) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): FILE: ch07/01_main-chapter-code/tests.py function test_gpt_class_finetune (line 12) | def test_gpt_class_finetune(): FILE: ch07/02_dataset-utilities/find-near-duplicates.py function preprocess_text (line 33) | def preprocess_text(text): function find_near_duplicates (line 41) | def find_near_duplicates(json_data, threshold=0.75, key="instruction"): function find_print_and_remove_near_duplicates (line 76) | def find_print_and_remove_near_duplicates(json_data, remove_duplicates=F... FILE: ch07/04_preference-tuning-with-dpo/previous_chapters.py class GPTDatasetV1 (line 25) | class GPTDatasetV1(Dataset): method __init__ (line 26) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 41) | def __len__(self): method __getitem__ (line 44) | def __getitem__(self, idx): function create_dataloader_v1 (line 48) | def create_dataloader_v1(txt, batch_size=4, max_length=256, class MultiHeadAttention (line 66) | class MultiHeadAttention(nn.Module): method __init__ (line 67) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 82) | def forward(self, x): class LayerNorm (line 125) | class LayerNorm(nn.Module): method __init__ (line 126) | def __init__(self, emb_dim): method forward (line 132) | def forward(self, x): class GELU (line 139) | class GELU(nn.Module): method __init__ (line 140) | def __init__(self): method forward (line 143) | def forward(self, x): class FeedForward (line 150) | class FeedForward(nn.Module): method __init__ (line 151) | def __init__(self, cfg): method forward (line 159) | def forward(self, x): class TransformerBlock (line 163) | class TransformerBlock(nn.Module): method __init__ (line 164) | def __init__(self, cfg): method forward (line 178) | def forward(self, x): class GPTModel (line 196) | class GPTModel(nn.Module): method __init__ (line 197) | def __init__(self, cfg): method forward (line 209) | def forward(self, in_idx): function generate_text_simple (line 221) | def generate_text_simple(model, idx, max_new_tokens, context_size): function generate (line 250) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ... function train_model_simple (line 294) | def train_model_simple(model, train_loader, val_loader, optimizer, devic... function evaluate_model (line 330) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function generate_and_print_sample (line 339) | def generate_and_print_sample(model, tokenizer, device, start_context): function assign (line 353) | def assign(left, right): function load_weights_into_gpt (line 359) | def load_weights_into_gpt(gpt, params): function text_to_token_ids (line 420) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 426) | def token_ids_to_text(token_ids, tokenizer): function calc_loss_batch (line 431) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 438) | def calc_loss_loader(data_loader, model, device, num_batches=None): function plot_losses (line 457) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, labe... FILE: ch07/06_user_interface/app.py function get_model_and_tokenizer (line 26) | def get_model_and_tokenizer(): function extract_response (line 60) | def extract_response(response_text, input_text): function main (line 69) | async def main(message: chainlit.Message): FILE: conftest.py function _get_env_number (line 8) | def _get_env_number(name, default, cast): function pytest_configure (line 19) | def pytest_configure(config): FILE: pkg/llms_from_scratch/appendix_a.py class NeuralNetwork (line 10) | class NeuralNetwork(torch.nn.Module): method __init__ (line 11) | def __init__(self, num_inputs, num_outputs): method forward (line 28) | def forward(self, x): class ToyDataset (line 33) | class ToyDataset(Dataset): method __init__ (line 34) | def __init__(self, X, y): method __getitem__ (line 38) | def __getitem__(self, index): method __len__ (line 43) | def __len__(self): FILE: pkg/llms_from_scratch/appendix_d.py function find_highest_gradient (line 12) | def find_highest_gradient(model): function train_model (line 23) | def train_model(model, train_loader, val_loader, optimizer, device, FILE: pkg/llms_from_scratch/appendix_e.py class LoRALayer (line 10) | class LoRALayer(torch.nn.Module): method __init__ (line 11) | def __init__(self, in_dim, out_dim, rank, alpha): method forward (line 19) | def forward(self, x): class LinearWithLoRA (line 25) | class LinearWithLoRA(torch.nn.Module): method __init__ (line 26) | def __init__(self, linear, rank, alpha): method forward (line 33) | def forward(self, x): function replace_linear_with_lora (line 37) | def replace_linear_with_lora(model, rank, alpha): FILE: pkg/llms_from_scratch/ch02.py class GPTDatasetV1 (line 11) | class GPTDatasetV1(Dataset): method __init__ (line 12) | def __init__(self, txt, tokenizer, max_length, stride): method __len__ (line 27) | def __len__(self): method __getitem__ (line 30) | def __getitem__(self, idx): function create_dataloader_v1 (line 34) | def create_dataloader_v1(txt, batch_size=4, max_length=256, FILE: pkg/llms_from_scratch/ch03.py class SelfAttention_v1 (line 10) | class SelfAttention_v1(nn.Module): method __init__ (line 12) | def __init__(self, d_in, d_out): method forward (line 18) | def forward(self, x): class SelfAttention_v2 (line 32) | class SelfAttention_v2(nn.Module): method __init__ (line 34) | def __init__(self, d_in, d_out, qkv_bias=False): method forward (line 40) | def forward(self, x): class CausalAttention (line 52) | class CausalAttention(nn.Module): method __init__ (line 54) | def __init__(self, d_in, d_out, context_length, method forward (line 64) | def forward(self, x): class MultiHeadAttentionWrapper (line 86) | class MultiHeadAttentionWrapper(nn.Module): method __init__ (line 87) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 94) | def forward(self, x): class MultiHeadAttention (line 98) | class MultiHeadAttention(nn.Module): method __init__ (line 99) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 114) | def forward(self, x): class PyTorchMultiHeadAttention (line 159) | class PyTorchMultiHeadAttention(nn.Module): method __init__ (line 160) | def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False): method forward (line 173) | def forward(self, x): FILE: pkg/llms_from_scratch/ch04.py class LayerNorm (line 11) | class LayerNorm(nn.Module): method __init__ (line 12) | def __init__(self, emb_dim): method forward (line 18) | def forward(self, x): class GELU (line 25) | class GELU(nn.Module): method __init__ (line 26) | def __init__(self): method forward (line 29) | def forward(self, x): class FeedForward (line 36) | class FeedForward(nn.Module): method __init__ (line 37) | def __init__(self, cfg): method forward (line 45) | def forward(self, x): class TransformerBlock (line 49) | class TransformerBlock(nn.Module): method __init__ (line 50) | def __init__(self, cfg): method forward (line 64) | def forward(self, x): class GPTModel (line 82) | class GPTModel(nn.Module): method __init__ (line 83) | def __init__(self, cfg): method forward (line 95) | def forward(self, in_idx): function generate_text_simple (line 107) | def generate_text_simple(model, idx, max_new_tokens, context_size): class FeedForwardFast (line 137) | class FeedForwardFast(nn.Module): method __init__ (line 138) | def __init__(self, cfg): method forward (line 146) | def forward(self, x): class TransformerBlockFast (line 150) | class TransformerBlockFast(nn.Module): method __init__ (line 151) | def __init__(self, cfg): method forward (line 164) | def forward(self, x): class GPTModelFast (line 182) | class GPTModelFast(nn.Module): method __init__ (line 196) | def __init__(self, cfg): method forward (line 208) | def forward(self, in_idx): FILE: pkg/llms_from_scratch/ch05.py function generate (line 19) | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, ... function train_model_simple (line 62) | def train_model_simple(model, train_loader, val_loader, optimizer, devic... function evaluate_model (line 98) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function generate_and_print_sample (line 107) | def generate_and_print_sample(model, tokenizer, device, start_context): function assign (line 121) | def assign(left, right): function load_weights_into_gpt (line 127) | def load_weights_into_gpt(gpt, params): function text_to_token_ids (line 188) | def text_to_token_ids(text, tokenizer): function token_ids_to_text (line 194) | def token_ids_to_text(token_ids, tokenizer): function calc_loss_batch (line 199) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 206) | def calc_loss_loader(data_loader, model, device, num_batches=None): function plot_losses (line 225) | def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): function download_and_load_gpt2 (line 246) | def download_and_load_gpt2(model_size, models_dir): function download_file (line 280) | def download_file(url, destination, backup_url=None): function load_gpt2_params_from_tf_ckpt (line 327) | def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): FILE: pkg/llms_from_scratch/ch06.py function download_and_unzip_spam_data (line 18) | def download_and_unzip_spam_data(url, zip_path, extracted_path, data_fil... function create_balanced_dataset (line 41) | def create_balanced_dataset(df): function random_split (line 55) | def random_split(df, train_frac, validation_frac): class SpamDataset (line 71) | class SpamDataset(Dataset): method __init__ (line 72) | def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=... method __getitem__ (line 96) | def __getitem__(self, index): method __len__ (line 104) | def __len__(self): method _longest_encoded_length (line 107) | def _longest_encoded_length(self): function calc_accuracy_loader (line 119) | def calc_accuracy_loader(data_loader, model, device, num_batches=None): function calc_loss_batch (line 142) | def calc_loss_batch(input_batch, target_batch, model, device): function calc_loss_loader (line 149) | def calc_loss_loader(data_loader, model, device, num_batches=None): function evaluate_model (line 168) | def evaluate_model(model, train_loader, val_loader, device, eval_iter): function train_classifier_simple (line 177) | def train_classifier_simple(model, train_loader, val_loader, optimizer, ... function plot_values (line 215) | def plot_values(epochs_seen, examples_seen, train_values, val_values, la... function classify_review (line 235) | def classify_review(text, model, tokenizer, device, max_length=None, pad... FILE: pkg/llms_from_scratch/ch07.py function download_and_load_file (line 16) | def download_and_load_file(file_path, url): function format_input (line 57) | def format_input(entry): class InstructionDataset (line 69) | class InstructionDataset(Dataset): method __init__ (line 70) | def __init__(self, data, tokenizer): method __getitem__ (line 83) | def __getitem__(self, index): method __len__ (line 86) | def __len__(self): function custom_collate_draft_1 (line 90) | def custom_collate_draft_1( function custom_collate_draft_2 (line 123) | def custom_collate_draft_2( function custom_collate_fn (line 154) | def custom_collate_fn( function check_if_running (line 200) | def check_if_running(process_name): function query_model (line 209) | def query_model( function generate_model_scores (line 241) | def generate_model_scores(json_data, json_key, model="llama3"): FILE: pkg/llms_from_scratch/generate.py function trim_input_tensor (line 9) | def trim_input_tensor(input_ids_tensor, context_len, max_new_tokens): FILE: pkg/llms_from_scratch/kv_cache/generate.py function generate_text_simple (line 11) | def generate_text_simple(model, idx, max_new_tokens, context_size=None, ... function generate_text_simple_stream (line 34) | def generate_text_simple_stream(model, token_ids, max_new_tokens, eos_to... FILE: pkg/llms_from_scratch/kv_cache/gpt2.py class MultiHeadAttention (line 15) | class MultiHeadAttention(nn.Module): method __init__ (line 16) | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qk... method forward (line 30) | def forward(self, x, use_cache=False, start_pos=0, cache=None): class LayerNorm (line 82) | class LayerNorm(nn.Module): method __init__ (line 83) | def __init__(self, emb_dim): method forward (line 89) | def forward(self, x): class GELU (line 96) | class GELU(nn.Module): method __init__ (line 97) | def __init__(self): method forward (line 100) | def forward(self, x): class FeedForward (line 107) | class FeedForward(nn.Module): method __init__ (line 108) | def __init__(self, cfg): method forward (line 116) | def forward(self, x): class TransformerBlock (line 120) | class TransformerBlock(nn.Module): method __init__ (line 121) | def __init__(self, cfg): method forward (line 135) | def forward(self, x, use_cache=False, start_pos=0, cache=None): class GPTModel (line 153) | class GPTModel(nn.Module): method __init__ (line 154) | def __init__(self, cfg): method forward (line 167) | def forward(self, in_idx, use_cache=False, cache=None): FILE: pkg/llms_from_scratch/kv_cache/llama3.py class Llama3Model (line 54) | class Llama3Model(nn.Module): method __init__ (line 55) | def __init__(self, cfg): method forward (line 80) | def forward(self, in_idx, cache=None): method reset_kv_cache (line 112) | def reset_kv_cache(self): class TransformerBlock (line 116) | class TransformerBlock(nn.Module): method __init__ (line 117) | def __init__(self, cfg): method forward (line 130) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None): class FeedForward (line 146) | class FeedForward(nn.Module): method __init__ (line 147) | def __init__(self, cfg): method forward (line 153) | def forward(self, x): class GroupedQueryAttention (line 160) | class GroupedQueryAttention(nn.Module): method __init__ (line 161) | def __init__( method forward (line 180) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None): function compute_rope_params (line 238) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096... function apply_rope (line 283) | def apply_rope(x, cos, sin, offset=0): class Llama3Tokenizer (line 309) | class Llama3Tokenizer: method __init__ (line 311) | def __init__(self, model_path): method encode (line 342) | def encode(self, text, bos=False, eos=False, **kwargs): method decode (line 349) | def decode(self, ids): class ChatFormat (line 353) | class ChatFormat: method __init__ (line 355) | def __init__(self, tokenizer: Llama3Tokenizer, *, method _header (line 360) | def _header(self, role): method encode (line 369) | def encode(self, user_message, system_message=None, allowed_special=No... method decode (line 389) | def decode(self, ids): function clean_text (line 393) | def clean_text(text, header_end="assistant<|end_header_id|>\n\n"): class GroupedQueryAttentionFast (line 409) | class GroupedQueryAttentionFast(nn.Module): method __init__ (line 415) | def __init__(self, d_in, d_out, num_heads, num_kv_groups, dtype=None): method forward (line 431) | def forward(self, x, cos, sin): class TransformerBlockFast (line 458) | class TransformerBlockFast(nn.Module): method __init__ (line 463) | def __init__(self, cfg): method forward (line 476) | def forward(self, x, cos, sin): class Llama3ModelFast (line 492) | class Llama3ModelFast(nn.Module): method __init__ (line 498) | def __init__(self, cfg): method forward (line 521) | def forward(self, in_idx): FILE: pkg/llms_from_scratch/kv_cache/qwen3.py class Qwen3Model (line 19) | class Qwen3Model(nn.Module): method __init__ (line 20) | def __init__(self, cfg): method forward (line 47) | def forward(self, in_idx, cache=None): method reset_kv_cache (line 80) | def reset_kv_cache(self): class TransformerBlock (line 84) | class TransformerBlock(nn.Module): method __init__ (line 85) | def __init__(self, cfg): method forward (line 102) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None): class FeedForward (line 118) | class FeedForward(nn.Module): method __init__ (line 119) | def __init__(self, cfg): method forward (line 125) | def forward(self, x): class MoEFeedForward (line 132) | class MoEFeedForward(nn.Module): method __init__ (line 133) | def __init__(self, cfg): method forward (line 147) | def forward(self, x): class GroupedQueryAttention (line 185) | class GroupedQueryAttention(nn.Module): method __init__ (line 186) | def __init__( method forward (line 215) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None): function compute_rope_params (line 261) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096... function apply_rope (line 283) | def apply_rope(x, cos, sin, offset=0): class RMSNorm (line 304) | class RMSNorm(nn.Module): method __init__ (line 305) | def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True): method forward (line 312) | def forward(self, x): FILE: pkg/llms_from_scratch/kv_cache/utils.py class KVCache (line 6) | class KVCache: method __init__ (line 7) | def __init__(self, n_layers): method get (line 10) | def get(self, layer_idx): method update (line 13) | def update(self, layer_idx, value): method get_all (line 16) | def get_all(self): method reset (line 19) | def reset(self): FILE: pkg/llms_from_scratch/kv_cache_batched/generate.py function generate_text_simple (line 11) | def generate_text_simple(model, idx, max_new_tokens, context_size=None, ... FILE: pkg/llms_from_scratch/kv_cache_batched/qwen3.py class Qwen3Model (line 19) | class Qwen3Model(nn.Module): method __init__ (line 20) | def __init__(self, cfg): method forward (line 47) | def forward(self, in_idx, cache=None, start_pos=None): method reset_kv_cache (line 80) | def reset_kv_cache(self, batch_size, device=None): class TransformerBlock (line 85) | class TransformerBlock(nn.Module): method __init__ (line 86) | def __init__(self, cfg): method forward (line 100) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None): class FeedForward (line 116) | class FeedForward(nn.Module): method __init__ (line 117) | def __init__(self, cfg): method forward (line 123) | def forward(self, x): class GroupedQueryAttention (line 130) | class GroupedQueryAttention(nn.Module): method __init__ (line 131) | def __init__(self, d_in, num_heads, num_kv_groups, head_dim=None, qk_n... method forward (line 158) | def forward(self, x, mask, cos, sin, start_pos=0, cache=None): function compute_rope_params (line 214) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096... function apply_rope (line 236) | def apply_rope(x, cos, sin, offset): class RMSNorm (line 266) | class RMSNorm(nn.Module): method __init__ (line 267) | def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True): method forward (line 274) | def forward(self, x): FILE: pkg/llms_from_scratch/kv_cache_batched/utils.py class KVCache (line 6) | class KVCache: method __init__ (line 7) | def __init__(self, n_layers, batch_size): method get (line 12) | def get(self, layer_idx, batch_idx): method update (line 15) | def update(self, layer_idx, batch_idx, value): method get_layer (line 18) | def get_layer(self, layer_idx): method reset (line 21) | def reset(self): FILE: pkg/llms_from_scratch/llama3.py class Llama3Model (line 53) | class Llama3Model(nn.Module): method __init__ (line 54) | def __init__(self, cfg): method forward (line 78) | def forward(self, in_idx): class TransformerBlock (line 92) | class TransformerBlock(nn.Module): method __init__ (line 93) | def __init__(self, cfg): method forward (line 106) | def forward(self, x, mask, cos, sin): class FeedForward (line 122) | class FeedForward(nn.Module): method __init__ (line 123) | def __init__(self, cfg): method forward (line 129) | def forward(self, x): class GroupedQueryAttention (line 136) | class GroupedQueryAttention(nn.Module): method __init__ (line 137) | def __init__( method forward (line 156) | def forward(self, x, mask, cos, sin): function compute_rope_params (line 260) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096... function apply_rope (line 305) | def apply_rope(x, cos, sin): class Llama3Tokenizer (line 331) | class Llama3Tokenizer: method __init__ (line 333) | def __init__(self, model_path): method encode (line 364) | def encode(self, text, bos=False, eos=False, **kwargs): method decode (line 371) | def decode(self, ids): class ChatFormat (line 375) | class ChatFormat: method __init__ (line 377) | def __init__(self, tokenizer: Llama3Tokenizer, *, method _header (line 382) | def _header(self, role): method encode (line 391) | def encode(self, user_message, system_message=None, allowed_special=No... method decode (line 411) | def decode(self, ids): function clean_text (line 415) | def clean_text(text, header_end="assistant<|end_header_id|>\n\n"): class GroupedQueryAttentionFast (line 431) | class GroupedQueryAttentionFast(nn.Module): method __init__ (line 437) | def __init__(self, d_in, d_out, num_heads, num_kv_groups, dtype=None): method forward (line 453) | def forward(self, x, cos, sin): class TransformerBlockFast (line 480) | class TransformerBlockFast(nn.Module): method __init__ (line 485) | def __init__(self, cfg): method forward (line 498) | def forward(self, x, cos, sin): class Llama3ModelFast (line 514) | class Llama3ModelFast(nn.Module): method __init__ (line 520) | def __init__(self, cfg): method forward (line 543) | def forward(self, in_idx): function assign (line 554) | def assign(left, right, tensor_name="unknown"): function load_weights_into_llama (line 567) | def load_weights_into_llama(model, param_config, params): FILE: pkg/llms_from_scratch/qwen3.py class Qwen3Model (line 123) | class Qwen3Model(nn.Module): method __init__ (line 124) | def __init__(self, cfg): method forward (line 150) | def forward(self, in_idx): class TransformerBlock (line 165) | class TransformerBlock(nn.Module): method __init__ (line 166) | def __init__(self, cfg): method forward (line 183) | def forward(self, x, mask, cos, sin): class FeedForward (line 199) | class FeedForward(nn.Module): method __init__ (line 200) | def __init__(self, cfg): method forward (line 206) | def forward(self, x): class MoEFeedForward (line 213) | class MoEFeedForward(nn.Module): method __init__ (line 214) | def __init__(self, cfg): method forward (line 228) | def forward(self, x): class GroupedQueryAttention (line 266) | class GroupedQueryAttention(nn.Module): method __init__ (line 267) | def __init__( method forward (line 296) | def forward(self, x, mask, cos, sin): function compute_rope_params (line 384) | def compute_rope_params(head_dim, theta_base=10_000, context_length=4096... function apply_rope (line 406) | def apply_rope(x, cos, sin): class RMSNorm (line 427) | class RMSNorm(nn.Module): method __init__ (line 428) | def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True): method forward (line 435) | def forward(self, x): function load_weights_into_qwen (line 451) | def load_weights_into_qwen(model, param_config, params): class Qwen3Tokenizer (line 575) | class Qwen3Tokenizer: method __init__ (line 588) | def __init__(self, tokenizer_file_path="tokenizer.json", repo_id=None, method encode (line 620) | def encode(self, text, chat_wrapped=None): method decode (line 639) | def decode(self, ids): method _wrap_chat (line 642) | def _wrap_chat(self, user_msg): function download_from_huggingface (line 653) | def download_from_huggingface(repo_id, filename, local_dir, revision="ma... function download_from_huggingface_from_snapshots (line 673) | def download_from_huggingface_from_snapshots(repo_id, local_dir): FILE: pkg/llms_from_scratch/tests/test_appendix_a.py function test_dataset (line 13) | def test_dataset(): FILE: pkg/llms_from_scratch/tests/test_appendix_d.py function test_train (line 18) | def test_train(tmp_path): FILE: pkg/llms_from_scratch/tests/test_appendix_e.py function test_train_classifier_lora (line 23) | def test_train_classifier_lora(tmp_path): FILE: pkg/llms_from_scratch/tests/test_ch02.py function test_dataloader (line 16) | def test_dataloader(tmp_path, file_name): FILE: pkg/llms_from_scratch/tests/test_ch03.py function test_mha (line 11) | def test_mha(): FILE: pkg/llms_from_scratch/tests/test_ch04.py function test_gpt_model_variants (line 29) | def test_gpt_model_variants(ModelClass, generate_fn): FILE: pkg/llms_from_scratch/tests/test_ch05.py function test_train_simple (line 38) | def test_train_simple(tmp_path, ModelClass): FILE: pkg/llms_from_scratch/tests/test_ch06.py function test_train_classifier (line 22) | def test_train_classifier(tmp_path): FILE: pkg/llms_from_scratch/tests/test_ch07.py function test_instruction_finetune (line 19) | def test_instruction_finetune(tmp_path): FILE: pkg/llms_from_scratch/tests/test_generate.py function test_dataloader (line 16) | def test_dataloader(tmp_path, file_name): FILE: pkg/llms_from_scratch/tests/test_llama3.py class LitGPTRMSNorm (line 26) | class LitGPTRMSNorm(torch.nn.Module): method __init__ (line 36) | def __init__(self, size: int, dim: int = -1, eps: float = 1e-6, add_un... method forward (line 43) | def forward(self, x: torch.Tensor) -> torch.Tensor: method reset_parameters (line 52) | def reset_parameters(self) -> None: function test_rope (line 60) | def test_rope(): function test_grouped_query_attention_equivalence (line 157) | def test_grouped_query_attention_equivalence(): function llama3_weights_path (line 194) | def llama3_weights_path(tmp_path_factory): function test_model_variants (line 212) | def test_model_variants(ModelClass, generate_fn, llama3_weights_path): function test_rmsnorm_equivalence (line 249) | def test_rmsnorm_equivalence(): function test_llama3_base_equivalence_with_transformers (line 273) | def test_llama3_base_equivalence_with_transformers(): FILE: pkg/llms_from_scratch/tests/test_qwen3.py class Qwen3RMSNorm (line 37) | class Qwen3RMSNorm(nn.Module): method __init__ (line 40) | def __init__(self, hidden_size, eps=1e-6): method forward (line 48) | def forward(self, hidden_states): method extra_repr (line 56) | def extra_repr(self): function _hf_ids (line 63) | def _hf_ids(obj): function dummy_input (line 94) | def dummy_input(): function dummy_cfg_base (line 100) | def dummy_cfg_base(): function dummy_cfg_moe (line 118) | def dummy_cfg_moe(dummy_cfg_base): function test_dummy_qwen3_forward (line 129) | def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input): function test_dummy_qwen3_moe_forward (line 138) | def test_dummy_qwen3_moe_forward(dummy_cfg_moe, dummy_input): function test_moe_forward_matches_reference (line 149) | def test_moe_forward_matches_reference(dummy_cfg_moe): function test_qwen3_kvcache_equivalence (line 180) | def test_qwen3_kvcache_equivalence(cfg_name, request): function test_rope (line 214) | def test_rope(context_len): function qwen3_weights_path (line 288) | def qwen3_weights_path(tmp_path_factory): function test_model_variants (line 302) | def test_model_variants(ModelClass, qwen3_weights_path, generate_fn): function test_model_KV_noKV (line 340) | def test_model_KV_noKV(qwen3_weights_path): function test_model_batched_KV (line 381) | def test_model_batched_KV(qwen3_weights_path): function test_rmsnorm_equivalence (line 444) | def test_rmsnorm_equivalence(): function test_all_special_tokens_roundtrip (line 471) | def test_all_special_tokens_roundtrip(repo_id, tok_file): function test_chat_wrap_and_equivalence (line 523) | def test_chat_wrap_and_equivalence(add_gen, add_think): function test_multiturn_equivalence (line 573) | def test_multiturn_equivalence(repo_id, tok_file, add_gen, add_think): function test_tokenizer_equivalence (line 614) | def test_tokenizer_equivalence(): function test_multiturn_prefix_stability (line 688) | def test_multiturn_prefix_stability(repo_id, tok_file, add_gen, add_think): function test_qwen3_base_equivalence_with_transformers (line 764) | def test_qwen3_base_equivalence_with_transformers(): FILE: pkg/llms_from_scratch/utils.py function _extract_imports (line 17) | def _extract_imports(src: str): function _extract_defs_and_classes_from_code (line 39) | def _extract_defs_and_classes_from_code(src): function import_definitions_from_notebook (line 110) | def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None,... function download_file (line 153) | def download_file(url, out_dir="."): FILE: setup/02_installing-python-libraries/python_environment_check.py function get_packages (line 20) | def get_packages(pkgs): function get_requirements_dict (line 66) | def get_requirements_dict(): function check_packages (line 102) | def check_packages(reqs): function main (line 122) | def main(): FILE: setup/02_installing-python-libraries/tests.py function test_main (line 11) | def test_main(capsys):