SYMBOL INDEX (287 symbols across 32 files) FILE: dev/cpu/matmul_forward.c function matmul_forward_cpu (line 21) | void matmul_forward_cpu(float* out, function matmul_forward_ngc92 (line 43) | void matmul_forward_ngc92(float* out, function matmul_forward (line 92) | void matmul_forward(int kernel_num, function main (line 114) | int main(int argc, char **argv) { function validate_results_cpu (line 196) | void validate_results_cpu(const float* kernel_result, const float* cpu_r... FILE: dev/cuda/benchmark_on_modal.py function execute_command (line 83) | def execute_command(command: str): function run_benchmark (line 103) | def run_benchmark(compile_command: str, run_command: str): function inference_main (line 119) | def inference_main(compile_command: str, run_command: str): FILE: dev/cuda/common.h function __device__ (line 16) | __device__ float warpReduceSum(float val) { function blockReduce (line 30) | float blockReduce(float val, bool final_sync, float out_of_bounds) { function blockReduce (line 52) | float blockReduce(float val) { function cuda_check (line 60) | void cuda_check(cudaError_t error, const char *file, int line) { function cublasCheck (line 70) | void cublasCheck(cublasStatus_t status, const char *file, int line) function __device__ (line 113) | __device__ explicit Packed128(int4 bits) { function __device__ (line 118) | __device__ static Packed128 constant(ElementType value) { function __device__ (line 126) | __device__ static Packed128 zeros() { function __device__ (line 130) | __device__ static Packed128 ones() { function __device__ (line 134) | __device__ ElementType& operator[](int index) { function __device__ (line 137) | __device__ const ElementType& operator[](int index) const { function __device__ (line 140) | __device__ int4 get_bits() const { type __nv_bfloat16 (line 186) | typedef __nv_bfloat16 floatX; type __nv_bfloat16 (line 187) | typedef __nv_bfloat16 floatN; type half (line 194) | typedef half floatX; type half (line 195) | typedef half floatN; type floatX (line 199) | typedef float floatX; type floatN (line 200) | typedef float floatN; type Packed128 (line 203) | typedef Packed128 x128; function __device__ (line 211) | __device__ floatX __ldcs(const floatX* address) { FILE: dev/data/data_common.py function download_file (line 10) | def download_file(url: str, fname: str, chunk_size=1024): function write_datafile (line 39) | def write_datafile(filename, toks, model_desc="gpt-2"): function write_evalfile (line 62) | def write_evalfile(filename, datas): FILE: dev/data/fineweb.py function tokenize_llama (line 67) | def tokenize_llama(doc): function tokenize_gpt2 (line 79) | def tokenize_gpt2(doc): FILE: dev/data/hellaswag.py function download (line 52) | def download(split): function render_example (line 63) | def render_example(example): function iterate_examples (line 102) | def iterate_examples(split): function evaluate (line 111) | def evaluate(model_type, device): FILE: dev/data/mmlu.py function download (line 30) | def download(): function iterate_examples (line 42) | def iterate_examples(): function render_example (line 61) | def render_example(example): function evaluate (line 90) | def evaluate(model_type, device): FILE: dev/data/tinyshakespeare.py function download (line 35) | def download(): function tokenize (line 47) | def tokenize(model_desc): FILE: dev/data/tinystories.py function download (line 43) | def download(): function process_shard (line 73) | def process_shard(shard_index, shard_filename, model_desc): function tokenize (line 98) | def tokenize(model_desc): FILE: dev/eval/export_hf.py function tensor_bf16 (line 24) | def tensor_bf16(data_int16, transpose=False): function tensor_fp32 (line 29) | def tensor_fp32(data_float32, transpose=False): function convert (line 37) | def convert(filepath, output, push_to_hub=False, out_dtype="bfloat16"): function spin (line 146) | def spin(output): FILE: dev/loss_checker_ci.py function read_numbers_from_file (line 7) | def read_numbers_from_file(file_path, col_start, col_end): function compare_numbers (line 32) | def compare_numbers(read_values, fixed_values, percent_accuracy): function main (line 44) | def main(): FILE: dev/test/test_dataloader.c function check_range (line 18) | void check_range(const int *tokens, const int start, const int end, cons... function check_equals (line 35) | void check_equals(const int *tokens, const int n, const int expected, co... function test_simple (line 51) | void test_simple(void) { function test_multiprocess_simple (line 87) | void test_multiprocess_simple(void) { function test_shuffled (line 127) | void test_shuffled(void) { function test_multiprocess_shuffled (line 194) | void test_multiprocess_shuffled(void) { function main (line 269) | int main(void) { FILE: dev/test/test_outlier_detector.c function main (line 11) | int main(void) { FILE: dev/unistd.h function clock_gettime (line 20) | static inline int clock_gettime(int ignore_variable, struct timespec* tv) type glob_t (line 35) | typedef struct glob_t { function replace_forward_slashes (line 40) | static inline void replace_forward_slashes(char* str) { function globfree (line 49) | static inline void globfree(glob_t *pglob) { function glob (line 56) | static inline int glob(const char* pattern, int ignored_flags, int (*ign... type dirent (line 114) | typedef struct dirent { type DIR (line 118) | typedef struct DIR { function DIR (line 124) | static inline DIR *opendir(const char *name) { type dirent (line 144) | struct dirent type dirent (line 145) | struct dirent function closedir (line 160) | static inline int closedir(DIR *directory) { FILE: doc/layernorm/layernorm.c function layernorm_forward (line 9) | void layernorm_forward(float* out, float* mean, float* rstd, function layernorm_backward (line 46) | void layernorm_backward(float* dinp, float* dweight, float* dbias, function check_tensor (line 90) | int check_tensor(float *a, float *b, int n, char* label) { function main (line 105) | int main() { FILE: doc/layernorm/layernorm.py class LayerNorm (line 5) | class LayerNorm: method forward (line 8) | def forward(x, w, b): method backward (line 21) | def backward(dout, cache): function write (line 56) | def write(tensor, handle): FILE: llmc/cublas_common.h function cublasCheck (line 37) | void cublasCheck(cublasStatus_t status, const char *file, int line) FILE: llmc/cuda_common.h function cudaCheck_ (line 52) | inline void cudaCheck_(cudaError_t error, const char *file, int line) { function cudaFreeCheck (line 62) | void cudaFreeCheck(T** ptr, const char *file, int line) { type PrecisionMode (line 75) | enum PrecisionMode { type floatX (line 83) | typedef float floatX; type half (line 87) | typedef half floatX; type __nv_bfloat16 (line 90) | typedef __nv_bfloat16 floatX; function __device__ (line 102) | __device__ floatX __ldcs(const floatX* address) { function device_to_file (line 130) | inline void device_to_file(FILE* dest, void* src, size_t num_bytes, size... function file_to_device (line 169) | inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size... FILE: llmc/cudnn_att.cpp function cuDNNCheck (line 26) | static void cuDNNCheck(cudnnStatus_t error, const char *file, int line) { function checkCudnnFE (line 34) | static void checkCudnnFE(const fe::error_object& e, const char *file, in... type UIDs (line 42) | enum UIDs { function lookup_cache_or_build_graph_fwd (line 60) | auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_in... function attention_forward_cudnn (line 222) | void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) function attention_backward_cudnn (line 256) | void attention_backward_cudnn(floatX* dqkvr, ... function create_cudnn (line 290) | void create_cudnn() { function destroy_cudnn (line 294) | void destroy_cudnn() { FILE: llmc/dataloader.h type DataLoader (line 29) | typedef struct { function dataloader_load_shard_ (line 61) | int64_t dataloader_load_shard_(DataLoader *loader, int shard_index) { function prepare_intra_shard_indices_ (line 99) | void prepare_intra_shard_indices_(DataLoader *loader) { function dataloader_reset (line 110) | void dataloader_reset(DataLoader *loader) { function dataloader_advance_ (line 125) | void dataloader_advance_(DataLoader *loader) { function dataloader_init (line 142) | void dataloader_init(DataLoader *loader, function dataloader_load_batch (line 203) | void dataloader_load_batch(DataLoader* loader) { function dataloader_next_batch (line 222) | void dataloader_next_batch(DataLoader *loader) { function dataloader_resume (line 232) | void dataloader_resume(DataLoader *loader, size_t current_shard_idx, siz... function dataloader_free (line 239) | void dataloader_free(DataLoader *loader) { type EvalLoader (line 274) | typedef struct { function evalloader_reset (line 298) | void evalloader_reset(EvalLoader *loader) { function evalloader_init (line 340) | void evalloader_init(EvalLoader *loader, function evalloader_next_example_ (line 380) | void evalloader_next_example_(EvalLoader *loader, int example_batch_inde... function evalloader_next_batch (line 449) | void evalloader_next_batch(EvalLoader *loader) { function evalloader_stat_losses (line 468) | int evalloader_stat_losses(EvalLoader *loader, float* losses) { function evalloader_free (line 511) | void evalloader_free(EvalLoader *loader) { FILE: llmc/logger.h type Logger (line 14) | typedef struct { function logger_init (line 19) | void logger_init(Logger *logger, const char *log_dir, int process_rank, ... function logger_log_eval (line 34) | void logger_log_eval(Logger *logger, int step, float val) { function logger_log_val (line 42) | void logger_log_val(Logger *logger, int step, float val_loss) { function logger_log_train (line 50) | void logger_log_train(Logger *logger, int step, float train_loss, float ... FILE: llmc/mfu.h function nvml_check (line 20) | inline void nvml_check(nvmlReturn_t status, const char *file, int line) { type PerfData (line 30) | typedef struct { type GPUEntry (line 48) | typedef struct { function get_flops_promised (line 97) | float get_flops_promised(const char* device, int precision_mode) { type GPUUtilInfo (line 154) | struct GPUUtilInfo { function nvmlDevice_t (line 170) | nvmlDevice_t nvml_get_device() { function GPUUtilInfo (line 196) | GPUUtilInfo get_gpu_utilization_info() { function GPUUtilInfo (line 239) | GPUUtilInfo get_gpu_utilization_info() { FILE: llmc/outlier_detector.h type OutlierDetector (line 18) | typedef struct { function init_detector (line 26) | void init_detector(OutlierDetector *detector) { function update_detector (line 36) | double update_detector(OutlierDetector *detector, double new_value) { FILE: llmc/rand.h type mt19937_state (line 98) | typedef struct { function manual_seed (line 106) | void manual_seed(mt19937_state* state, unsigned int seed) { function next_state (line 118) | void next_state(mt19937_state* state) { function randint32 (line 134) | unsigned int randint32(mt19937_state* state) { function randint64 (line 148) | inline unsigned long long randint64(mt19937_state* state) { function randfloat32 (line 152) | inline float randfloat32(mt19937_state* state) { function randfloat64 (line 156) | inline double randfloat64(mt19937_state* state) { function uniform_ (line 160) | void uniform_(float* data, unsigned int numel, float from, float to, mt1... function normal_fill_16 (line 168) | void normal_fill_16(float* data, float mean, float std) { function normal_fill (line 180) | void normal_fill(float* data, unsigned int numel, float mean, float std,... function normal_ (line 197) | void normal_(float* data, unsigned int numel, float mean, float std, mt1... function init_identity_permutation (line 223) | void init_identity_permutation(int *data, int numel) { function random_permutation (line 229) | void random_permutation(int* data, int numel, mt19937_state* state) { FILE: llmc/sampler.h function random_u32 (line 10) | unsigned int random_u32(unsigned long long *state) { function random_f32 (line 18) | float random_f32(unsigned long long *state) { // random float32 in [0,1) function sample_softmax (line 22) | int sample_softmax(const float* logits, int n, float coin) { FILE: llmc/schedulers.h type LearningRateScheduler (line 11) | typedef struct { function lr_scheduler_init (line 19) | void lr_scheduler_init(LearningRateScheduler *scheduler, const char* sch... function get_learning_rate_cosine (line 28) | float get_learning_rate_cosine(LearningRateScheduler *scheduler, int ste... function get_learning_rate_linear (line 44) | float get_learning_rate_linear(LearningRateScheduler *scheduler, int ste... function get_learning_rate_constant (line 58) | float get_learning_rate_constant(LearningRateScheduler *scheduler, int s... function get_learning_rate_wsd (line 64) | float get_learning_rate_wsd(LearningRateScheduler *scheduler, int step) { function get_learning_rate (line 83) | float get_learning_rate(LearningRateScheduler *scheduler, int step) { FILE: llmc/tokenizer.h type Tokenizer (line 18) | typedef struct { function safe_printf (line 25) | void safe_printf(const char *piece) { function tokenizer_init (line 41) | void tokenizer_init(Tokenizer *tokenizer, const char *filename) { function tokenizer_free (line 98) | void tokenizer_free(Tokenizer *tokenizer) { FILE: llmc/utils.h function FILE (line 26) | extern inline FILE *fopen_check(const char *path, const char *mode, cons... function fread_check (line 44) | extern inline void fread_check(void *ptr, size_t size, size_t nmemb, FIL... function fclose_check (line 66) | extern inline void fclose_check(FILE *fp, const char *file, int line) { function sclose_check (line 78) | extern inline void sclose_check(int sockfd, const char *file, int line) { function closesocket_check (line 91) | extern inline void closesocket_check(int sockfd, const char *file, int l... function fseek_check (line 104) | extern inline void fseek_check(FILE *fp, long off, int whence, const cha... function fwrite_check (line 118) | extern inline void fwrite_check(void *ptr, size_t size, size_t nmemb, FI... function token_check (line 161) | extern inline void token_check(const int* tokens, int token_count, int v... function create_dir_if_not_exists (line 180) | extern inline void create_dir_if_not_exists(const char *dir) { function find_max_step (line 192) | extern inline int find_max_step(const char* output_log_dir) { function ends_with_bin (line 212) | extern inline int ends_with_bin(const char* str) { FILE: test_gpt2.c function check_tensor (line 5) | int check_tensor(float *a, float *b, int n, const char* label) { function main (line 39) | int main(int argc, char *argv[]) { FILE: train_gpt2.c function encoder_forward (line 35) | void encoder_forward(float* out, function encoder_backward (line 60) | void encoder_backward(float* dwte, float* dwpe, function layernorm_forward (line 78) | void layernorm_forward(float* out, float* mean, float* rstd, function layernorm_backward (line 120) | void layernorm_backward(float* dinp, float* dweight, float* dbias, function matmul_forward_naive (line 163) | void matmul_forward_naive(float* out, function matmul_forward (line 184) | void matmul_forward(float* out, function matmul_backward (line 231) | void matmul_backward(float* dinp, float* dweight, float* dbias, function attention_forward (line 271) | void attention_forward(float* out, float* preatt, float* att, function attention_backward (line 347) | void attention_backward(float* dinp, float* dpreatt, float* datt, function gelu_forward (line 408) | void gelu_forward(float* out, float* inp, int N) { function gelu_backward (line 420) | __attribute__((optimize("no-finite-math-only"))) function residual_forward (line 436) | void residual_forward(float* out, float* inp1, float* inp2, int N) { function residual_backward (line 442) | void residual_backward(float* dinp1, float* dinp2, float* dout, int N) { function softmax_forward (line 449) | void softmax_forward(float* probs, float* logits, int B, int T, int V, i... function crossentropy_forward (line 486) | void crossentropy_forward(float* losses, function crossentropy_softmax_backward (line 502) | void crossentropy_softmax_backward(float* dlogits, type GPT2Config (line 526) | typedef struct { type ParameterTensors (line 537) | typedef struct { function fill_in_parameter_sizes (line 556) | void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) { type ActivationTensors (line 602) | typedef struct { function fill_in_activation_sizes (line 628) | void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int ... type GPT2 (line 678) | typedef struct { function gpt2_build_from_checkpoint (line 707) | void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { function gpt2_forward (line 765) | void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size... function gpt2_zero_grad (line 893) | void gpt2_zero_grad(GPT2 *model) { function gpt2_backward (line 898) | void gpt2_backward(GPT2 *model) { function gpt2_update (line 1007) | void gpt2_update(GPT2 *model, float learning_rate, float beta1, float be... function gpt2_free (line 1035) | void gpt2_free(GPT2 *model) { function random_u32 (line 1051) | unsigned int random_u32(uint64_t *state) { function random_f32 (line 1058) | float random_f32(uint64_t *state) { // random float32 in [0,1) function sample_mult (line 1062) | int sample_mult(float* probabilities, int n, float coin) { function main (line 1077) | int main() { FILE: train_gpt2.py class NewGELU (line 40) | class NewGELU(nn.Module): method forward (line 42) | def forward(self, input): class CausalSelfAttention (line 48) | class CausalSelfAttention(nn.Module): method __init__ (line 50) | def __init__(self, config): method forward (line 65) | def forward(self, x): class MLP (line 88) | class MLP(nn.Module): method __init__ (line 90) | def __init__(self, config): method forward (line 97) | def forward(self, x): class Block (line 103) | class Block(nn.Module): method __init__ (line 105) | def __init__(self, config): method forward (line 112) | def forward(self, x): class GPTConfig (line 121) | class GPTConfig: class GPT (line 128) | class GPT(nn.Module): method __init__ (line 130) | def __init__(self, config): method _init_weights (line 149) | def _init_weights(self, module): method forward (line 162) | def forward(self, idx, targets=None, return_logits=True): method from_pretrained (line 193) | def from_pretrained(cls, model_type): method configure_optimizers (line 241) | def configure_optimizers(self, weight_decay, learning_rate, betas, dev... method generate (line 273) | def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None): function _peek_data_shard (line 302) | def _peek_data_shard(filename): function _load_data_shard (line 317) | def _load_data_shard(filename): class DistributedDataLoader (line 329) | class DistributedDataLoader: method __init__ (line 330) | def __init__(self, filename_pattern, B, T, process_rank, num_processes): method reset (line 353) | def reset(self): method advance (line 361) | def advance(self): # advance to next data shard method next_batch (line 366) | def next_batch(self): function write_fp32 (line 383) | def write_fp32(tensor, file): function write_bf16 (line 388) | def write_bf16(tensor, file): function write_tensors (line 395) | def write_tensors(model_tensors, L, file, dtype): function pad_vocab (line 429) | def pad_vocab(tensor, multiple=128, value=0): function write_model (line 449) | def write_model(model, filename, dtype): function write_state (line 479) | def write_state(model, x, y, logits, loss, filename): function write_tokenizer (line 509) | def write_tokenizer(enc, filename): function print0 (line 529) | def print0(*args, **kwargs): function get_lr (line 716) | def get_lr(it): FILE: train_llama3.py function repeat_kv (line 59) | def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: function reshape_for_broadcast (line 73) | def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): function apply_scaling (line 80) | def apply_scaling(freqs: torch.Tensor): function apply_rotary_emb (line 104) | def apply_rotary_emb( function precompute_freqs_cis (line 116) | def precompute_freqs_cis( class RMSNorm (line 133) | class RMSNorm(torch.nn.Module): method __init__ (line 134) | def __init__(self, dim: int, eps: float = 1e-6): method _norm (line 139) | def _norm(self, x): method forward (line 142) | def forward(self, x): class CausalSelfAttention (line 146) | class CausalSelfAttention(nn.Module): method __init__ (line 148) | def __init__(self, config): method forward (line 167) | def forward(self, x, freqs_cis=None, start_pos=None, mask=None): class MLP (line 205) | class MLP(nn.Module): method __init__ (line 207) | def __init__(self, config): method forward (line 219) | def forward(self, x): class Block (line 228) | class Block(nn.Module): method __init__ (line 230) | def __init__(self, config): method forward (line 237) | def forward(self, x, freqs_cis=None, start_pos=None, mask=None): class LlamaConfig (line 246) | class LlamaConfig: method __init__ (line 263) | def __init__(self, **kwargs): class LLaMA (line 271) | class LLaMA(nn.Module): method __init__ (line 273) | def __init__(self, config): method forward (line 295) | def forward(self, idx, targets=None, return_logits=True, start_pos=0): method adapt_llama_state_dict_keys (line 325) | def adapt_llama_state_dict_keys(checkpoint, config: LlamaConfig): method adapt_llama_state_dict_keys_hf (line 361) | def adapt_llama_state_dict_keys_hf(checkpoint, config: LlamaConfig): method from_pretrained_llama3_hf (line 404) | def from_pretrained_llama3_hf(cls, model_id): method from_pretrained_llama3_meta (line 426) | def from_pretrained_llama3_meta(cls, ckpt_dir, tokenizer_path): method configure_optimizers (line 444) | def configure_optimizers(self, weight_decay, learning_rate, betas, dev... method generate (line 476) | def generate( function sample_top_p (line 559) | def sample_top_p(probs, p): class Tokenizer (line 596) | class Tokenizer: method __init__ (line 607) | def __init__(self, model_path: str): method encode (line 661) | def encode( method decode (line 717) | def decode(self, t: Sequence[int]) -> str: method _split_whitespaces_or_nonwhitespaces (line 722) | def _split_whitespaces_or_nonwhitespaces( function _peek_data_shard (line 750) | def _peek_data_shard(filename): function _load_data_shard (line 762) | def _load_data_shard(filename): class DistributedShardedDataLoader (line 774) | class DistributedShardedDataLoader: method __init__ (line 783) | def __init__(self, filename_pattern, B, T, process_rank, num_processes): method reset (line 806) | def reset(self): method advance (line 814) | def advance(self): # advance to next data shard method next_batch (line 819) | def next_batch(self): function write_fp32 (line 836) | def write_fp32(tensor, file): function write_bf16 (line 841) | def write_bf16(tensor, file): function write_tensors (line 848) | def write_tensors(model_tensors, L, file, dtype): function write_model (line 870) | def write_model(model, filename, dtype): function write_state (line 903) | def write_state(model, x, y, logits, loss, filename): function print0 (line 930) | def print0(*args, **kwargs): function get_lr (line 1109) | def get_lr(it):