SYMBOL INDEX (287 symbols across 32 files)

FILE: dev/cpu/matmul_forward.c
  function matmul_forward_cpu (line 21) | void matmul_forward_cpu(float* out,
  function matmul_forward_ngc92 (line 43) | void matmul_forward_ngc92(float* out,
  function matmul_forward (line 92) | void matmul_forward(int kernel_num,
  function main (line 114) | int main(int argc, char **argv) {
  function validate_results_cpu (line 196) | void validate_results_cpu(const float* kernel_result, const float* cpu_r...

FILE: dev/cuda/benchmark_on_modal.py
  function execute_command (line 83) | def execute_command(command: str):
  function run_benchmark (line 103) | def run_benchmark(compile_command: str, run_command: str):
  function inference_main (line 119) | def inference_main(compile_command: str, run_command: str):

FILE: dev/cuda/common.h
  function __device__ (line 16) | __device__ float warpReduceSum(float val) {
  function blockReduce (line 30) | float blockReduce(float val, bool final_sync, float out_of_bounds) {
  function blockReduce (line 52) | float blockReduce(float val) {
  function cuda_check (line 60) | void cuda_check(cudaError_t error, const char *file, int line) {
  function cublasCheck (line 70) | void cublasCheck(cublasStatus_t status, const char *file, int line)
  function __device__ (line 113) | __device__ explicit Packed128(int4 bits) {
  function __device__ (line 118) | __device__  static Packed128 constant(ElementType value) {
  function __device__ (line 126) | __device__ static Packed128 zeros() {
  function __device__ (line 130) | __device__ static Packed128 ones() {
  function __device__ (line 134) | __device__ ElementType& operator[](int index) {
  function __device__ (line 137) | __device__ const ElementType& operator[](int index) const {
  function __device__ (line 140) | __device__ int4 get_bits() const {
  type __nv_bfloat16 (line 186) | typedef __nv_bfloat16 floatX;
  type __nv_bfloat16 (line 187) | typedef __nv_bfloat16 floatN;
  type half (line 194) | typedef half floatX;
  type half (line 195) | typedef half floatN;
  type floatX (line 199) | typedef float floatX;
  type floatN (line 200) | typedef float floatN;
  type Packed128 (line 203) | typedef Packed128<floatX> x128;
  function __device__ (line 211) | __device__ floatX __ldcs(const floatX* address) {

FILE: dev/data/data_common.py
  function download_file (line 10) | def download_file(url: str, fname: str, chunk_size=1024):
  function write_datafile (line 39) | def write_datafile(filename, toks, model_desc="gpt-2"):
  function write_evalfile (line 62) | def write_evalfile(filename, datas):

FILE: dev/data/fineweb.py
  function tokenize_llama (line 67) | def tokenize_llama(doc):
  function tokenize_gpt2 (line 79) | def tokenize_gpt2(doc):

FILE: dev/data/hellaswag.py
  function download (line 52) | def download(split):
  function render_example (line 63) | def render_example(example):
  function iterate_examples (line 102) | def iterate_examples(split):
  function evaluate (line 111) | def evaluate(model_type, device):

FILE: dev/data/mmlu.py
  function download (line 30) | def download():
  function iterate_examples (line 42) | def iterate_examples():
  function render_example (line 61) | def render_example(example):
  function evaluate (line 90) | def evaluate(model_type, device):

FILE: dev/data/tinyshakespeare.py
  function download (line 35) | def download():
  function tokenize (line 47) | def tokenize(model_desc):

FILE: dev/data/tinystories.py
  function download (line 43) | def download():
  function process_shard (line 73) | def process_shard(shard_index, shard_filename, model_desc):
  function tokenize (line 98) | def tokenize(model_desc):

FILE: dev/eval/export_hf.py
  function tensor_bf16 (line 24) | def tensor_bf16(data_int16, transpose=False):
  function tensor_fp32 (line 29) | def tensor_fp32(data_float32, transpose=False):
  function convert (line 37) | def convert(filepath, output, push_to_hub=False, out_dtype="bfloat16"):
  function spin (line 146) | def spin(output):

FILE: dev/loss_checker_ci.py
  function read_numbers_from_file (line 7) | def read_numbers_from_file(file_path, col_start, col_end):
  function compare_numbers (line 32) | def compare_numbers(read_values, fixed_values, percent_accuracy):
  function main (line 44) | def main():

FILE: dev/test/test_dataloader.c
  function check_range (line 18) | void check_range(const int *tokens, const int start, const int end, cons...
  function check_equals (line 35) | void check_equals(const int *tokens, const int n, const int expected, co...
  function test_simple (line 51) | void test_simple(void) {
  function test_multiprocess_simple (line 87) | void test_multiprocess_simple(void) {
  function test_shuffled (line 127) | void test_shuffled(void) {
  function test_multiprocess_shuffled (line 194) | void test_multiprocess_shuffled(void) {
  function main (line 269) | int main(void) {

FILE: dev/test/test_outlier_detector.c
  function main (line 11) | int main(void) {

FILE: dev/unistd.h
  function clock_gettime (line 20) | static inline int clock_gettime(int ignore_variable, struct timespec* tv)
  type glob_t (line 35) | typedef struct glob_t {
  function replace_forward_slashes (line 40) | static inline void replace_forward_slashes(char* str) {
  function globfree (line 49) | static inline void globfree(glob_t *pglob) {
  function glob (line 56) | static inline int glob(const char* pattern, int ignored_flags, int (*ign...
  type dirent (line 114) | typedef struct dirent {
  type DIR (line 118) | typedef struct DIR {
  function DIR (line 124) | static inline DIR *opendir(const char *name) {
  type dirent (line 144) | struct dirent
  type dirent (line 145) | struct dirent
  function closedir (line 160) | static inline int closedir(DIR *directory) {

FILE: doc/layernorm/layernorm.c
  function layernorm_forward (line 9) | void layernorm_forward(float* out, float* mean, float* rstd,
  function layernorm_backward (line 46) | void layernorm_backward(float* dinp, float* dweight, float* dbias,
  function check_tensor (line 90) | int check_tensor(float *a, float *b, int n, char* label) {
  function main (line 105) | int main() {

FILE: doc/layernorm/layernorm.py
  class LayerNorm (line 5) | class LayerNorm:
    method forward (line 8) | def forward(x, w, b):
    method backward (line 21) | def backward(dout, cache):
  function write (line 56) | def write(tensor, handle):

FILE: llmc/cublas_common.h
  function cublasCheck (line 37) | void cublasCheck(cublasStatus_t status, const char *file, int line)

FILE: llmc/cuda_common.h
  function cudaCheck_ (line 52) | inline void cudaCheck_(cudaError_t error, const char *file, int line) {
  function cudaFreeCheck (line 62) | void cudaFreeCheck(T** ptr, const char *file, int line) {
  type PrecisionMode (line 75) | enum PrecisionMode {
  type floatX (line 83) | typedef float floatX;
  type half (line 87) | typedef half floatX;
  type __nv_bfloat16 (line 90) | typedef __nv_bfloat16 floatX;
  function __device__ (line 102) | __device__ floatX __ldcs(const floatX* address) {
  function device_to_file (line 130) | inline void device_to_file(FILE* dest, void* src, size_t num_bytes, size...
  function file_to_device (line 169) | inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size...

FILE: llmc/cudnn_att.cpp
  function cuDNNCheck (line 26) | static void cuDNNCheck(cudnnStatus_t error, const char *file, int line) {
  function checkCudnnFE (line 34) | static void checkCudnnFE(const fe::error_object& e, const char *file, in...
  type UIDs (line 42) | enum UIDs {
  function lookup_cache_or_build_graph_fwd (line 60) | auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_in...
  function attention_forward_cudnn (line 222) | void attention_forward_cudnn(floatX* out,  // output: (B, T, NH, HS)
  function attention_backward_cudnn (line 256) | void attention_backward_cudnn(floatX* dqkvr,                            ...
  function create_cudnn (line 290) | void create_cudnn() {
  function destroy_cudnn (line 294) | void destroy_cudnn() {

FILE: llmc/dataloader.h
  type DataLoader (line 29) | typedef struct {
  function dataloader_load_shard_ (line 61) | int64_t dataloader_load_shard_(DataLoader *loader, int shard_index) {
  function prepare_intra_shard_indices_ (line 99) | void prepare_intra_shard_indices_(DataLoader *loader) {
  function dataloader_reset (line 110) | void dataloader_reset(DataLoader *loader) {
  function dataloader_advance_ (line 125) | void dataloader_advance_(DataLoader *loader) {
  function dataloader_init (line 142) | void dataloader_init(DataLoader *loader,
  function dataloader_load_batch (line 203) | void dataloader_load_batch(DataLoader* loader) {
  function dataloader_next_batch (line 222) | void dataloader_next_batch(DataLoader *loader) {
  function dataloader_resume (line 232) | void dataloader_resume(DataLoader *loader, size_t current_shard_idx, siz...
  function dataloader_free (line 239) | void dataloader_free(DataLoader *loader) {
  type EvalLoader (line 274) | typedef struct {
  function evalloader_reset (line 298) | void evalloader_reset(EvalLoader *loader) {
  function evalloader_init (line 340) | void evalloader_init(EvalLoader *loader,
  function evalloader_next_example_ (line 380) | void evalloader_next_example_(EvalLoader *loader, int example_batch_inde...
  function evalloader_next_batch (line 449) | void evalloader_next_batch(EvalLoader *loader) {
  function evalloader_stat_losses (line 468) | int evalloader_stat_losses(EvalLoader *loader, float* losses) {
  function evalloader_free (line 511) | void evalloader_free(EvalLoader *loader) {

FILE: llmc/logger.h
  type Logger (line 14) | typedef struct {
  function logger_init (line 19) | void logger_init(Logger *logger, const char *log_dir, int process_rank, ...
  function logger_log_eval (line 34) | void logger_log_eval(Logger *logger, int step, float val) {
  function logger_log_val (line 42) | void logger_log_val(Logger *logger, int step, float val_loss) {
  function logger_log_train (line 50) | void logger_log_train(Logger *logger, int step, float train_loss, float ...

FILE: llmc/mfu.h
  function nvml_check (line 20) | inline void nvml_check(nvmlReturn_t status, const char *file, int line) {
  type PerfData (line 30) | typedef struct {
  type GPUEntry (line 48) | typedef struct {
  function get_flops_promised (line 97) | float get_flops_promised(const char* device, int precision_mode) {
  type GPUUtilInfo (line 154) | struct GPUUtilInfo {
  function nvmlDevice_t (line 170) | nvmlDevice_t nvml_get_device() {
  function GPUUtilInfo (line 196) | GPUUtilInfo get_gpu_utilization_info() {
  function GPUUtilInfo (line 239) | GPUUtilInfo get_gpu_utilization_info() {

FILE: llmc/outlier_detector.h
  type OutlierDetector (line 18) | typedef struct {
  function init_detector (line 26) | void init_detector(OutlierDetector *detector) {
  function update_detector (line 36) | double update_detector(OutlierDetector *detector, double new_value) {

FILE: llmc/rand.h
  type mt19937_state (line 98) | typedef struct {
  function manual_seed (line 106) | void manual_seed(mt19937_state* state, unsigned int seed) {
  function next_state (line 118) | void next_state(mt19937_state* state) {
  function randint32 (line 134) | unsigned int randint32(mt19937_state* state) {
  function randint64 (line 148) | inline unsigned long long randint64(mt19937_state* state) {
  function randfloat32 (line 152) | inline float randfloat32(mt19937_state* state) {
  function randfloat64 (line 156) | inline double randfloat64(mt19937_state* state) {
  function uniform_ (line 160) | void uniform_(float* data, unsigned int numel, float from, float to, mt1...
  function normal_fill_16 (line 168) | void normal_fill_16(float* data, float mean, float std) {
  function normal_fill (line 180) | void normal_fill(float* data, unsigned int numel, float mean, float std,...
  function normal_ (line 197) | void normal_(float* data, unsigned int numel, float mean, float std, mt1...
  function init_identity_permutation (line 223) | void init_identity_permutation(int *data, int numel) {
  function random_permutation (line 229) | void random_permutation(int* data, int numel, mt19937_state* state) {

FILE: llmc/sampler.h
  function random_u32 (line 10) | unsigned int random_u32(unsigned long long *state) {
  function random_f32 (line 18) | float random_f32(unsigned long long *state) { // random float32 in [0,1)
  function sample_softmax (line 22) | int sample_softmax(const float* logits, int n, float coin) {

FILE: llmc/schedulers.h
  type LearningRateScheduler (line 11) | typedef struct {
  function lr_scheduler_init (line 19) | void lr_scheduler_init(LearningRateScheduler *scheduler, const char* sch...
  function get_learning_rate_cosine (line 28) | float get_learning_rate_cosine(LearningRateScheduler *scheduler, int ste...
  function get_learning_rate_linear (line 44) | float get_learning_rate_linear(LearningRateScheduler *scheduler, int ste...
  function get_learning_rate_constant (line 58) | float get_learning_rate_constant(LearningRateScheduler *scheduler, int s...
  function get_learning_rate_wsd (line 64) | float get_learning_rate_wsd(LearningRateScheduler *scheduler, int step) {
  function get_learning_rate (line 83) | float get_learning_rate(LearningRateScheduler *scheduler, int step) {

FILE: llmc/tokenizer.h
  type Tokenizer (line 18) | typedef struct {
  function safe_printf (line 25) | void safe_printf(const char *piece) {
  function tokenizer_init (line 41) | void tokenizer_init(Tokenizer *tokenizer, const char *filename) {
  function tokenizer_free (line 98) | void tokenizer_free(Tokenizer *tokenizer) {

FILE: llmc/utils.h
  function FILE (line 26) | extern inline FILE *fopen_check(const char *path, const char *mode, cons...
  function fread_check (line 44) | extern inline void fread_check(void *ptr, size_t size, size_t nmemb, FIL...
  function fclose_check (line 66) | extern inline void fclose_check(FILE *fp, const char *file, int line) {
  function sclose_check (line 78) | extern inline void sclose_check(int sockfd, const char *file, int line) {
  function closesocket_check (line 91) | extern inline void closesocket_check(int sockfd, const char *file, int l...
  function fseek_check (line 104) | extern inline void fseek_check(FILE *fp, long off, int whence, const cha...
  function fwrite_check (line 118) | extern inline void fwrite_check(void *ptr, size_t size, size_t nmemb, FI...
  function token_check (line 161) | extern inline void token_check(const int* tokens, int token_count, int v...
  function create_dir_if_not_exists (line 180) | extern inline void create_dir_if_not_exists(const char *dir) {
  function find_max_step (line 192) | extern inline int find_max_step(const char* output_log_dir) {
  function ends_with_bin (line 212) | extern inline int ends_with_bin(const char* str) {

FILE: test_gpt2.c
  function check_tensor (line 5) | int check_tensor(float *a, float *b, int n, const char* label) {
  function main (line 39) | int main(int argc, char *argv[]) {

FILE: train_gpt2.c
  function encoder_forward (line 35) | void encoder_forward(float* out,
  function encoder_backward (line 60) | void encoder_backward(float* dwte, float* dwpe,
  function layernorm_forward (line 78) | void layernorm_forward(float* out, float* mean, float* rstd,
  function layernorm_backward (line 120) | void layernorm_backward(float* dinp, float* dweight, float* dbias,
  function matmul_forward_naive (line 163) | void matmul_forward_naive(float* out,
  function matmul_forward (line 184) | void matmul_forward(float* out,
  function matmul_backward (line 231) | void matmul_backward(float* dinp, float* dweight, float* dbias,
  function attention_forward (line 271) | void attention_forward(float* out, float* preatt, float* att,
  function attention_backward (line 347) | void attention_backward(float* dinp, float* dpreatt, float* datt,
  function gelu_forward (line 408) | void gelu_forward(float* out, float* inp, int N) {
  function gelu_backward (line 420) | __attribute__((optimize("no-finite-math-only")))
  function residual_forward (line 436) | void residual_forward(float* out, float* inp1, float* inp2, int N) {
  function residual_backward (line 442) | void residual_backward(float* dinp1, float* dinp2, float* dout, int N) {
  function softmax_forward (line 449) | void softmax_forward(float* probs, float* logits, int B, int T, int V, i...
  function crossentropy_forward (line 486) | void crossentropy_forward(float* losses,
  function crossentropy_softmax_backward (line 502) | void crossentropy_softmax_backward(float* dlogits,
  type GPT2Config (line 526) | typedef struct {
  type ParameterTensors (line 537) | typedef struct {
  function fill_in_parameter_sizes (line 556) | void fill_in_parameter_sizes(size_t* param_sizes, GPT2Config config) {
  type ActivationTensors (line 602) | typedef struct {
  function fill_in_activation_sizes (line 628) | void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int ...
  type GPT2 (line 678) | typedef struct {
  function gpt2_build_from_checkpoint (line 707) | void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
  function gpt2_forward (line 765) | void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size...
  function gpt2_zero_grad (line 893) | void gpt2_zero_grad(GPT2 *model) {
  function gpt2_backward (line 898) | void gpt2_backward(GPT2 *model) {
  function gpt2_update (line 1007) | void gpt2_update(GPT2 *model, float learning_rate, float beta1, float be...
  function gpt2_free (line 1035) | void gpt2_free(GPT2 *model) {
  function random_u32 (line 1051) | unsigned int random_u32(uint64_t *state) {
  function random_f32 (line 1058) | float random_f32(uint64_t *state) { // random float32 in [0,1)
  function sample_mult (line 1062) | int sample_mult(float* probabilities, int n, float coin) {
  function main (line 1077) | int main() {

FILE: train_gpt2.py
  class NewGELU (line 40) | class NewGELU(nn.Module):
    method forward (line 42) | def forward(self, input):
  class CausalSelfAttention (line 48) | class CausalSelfAttention(nn.Module):
    method __init__ (line 50) | def __init__(self, config):
    method forward (line 65) | def forward(self, x):
  class MLP (line 88) | class MLP(nn.Module):
    method __init__ (line 90) | def __init__(self, config):
    method forward (line 97) | def forward(self, x):
  class Block (line 103) | class Block(nn.Module):
    method __init__ (line 105) | def __init__(self, config):
    method forward (line 112) | def forward(self, x):
  class GPTConfig (line 121) | class GPTConfig:
  class GPT (line 128) | class GPT(nn.Module):
    method __init__ (line 130) | def __init__(self, config):
    method _init_weights (line 149) | def _init_weights(self, module):
    method forward (line 162) | def forward(self, idx, targets=None, return_logits=True):
    method from_pretrained (line 193) | def from_pretrained(cls, model_type):
    method configure_optimizers (line 241) | def configure_optimizers(self, weight_decay, learning_rate, betas, dev...
    method generate (line 273) | def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
  function _peek_data_shard (line 302) | def _peek_data_shard(filename):
  function _load_data_shard (line 317) | def _load_data_shard(filename):
  class DistributedDataLoader (line 329) | class DistributedDataLoader:
    method __init__ (line 330) | def __init__(self, filename_pattern, B, T, process_rank, num_processes):
    method reset (line 353) | def reset(self):
    method advance (line 361) | def advance(self): # advance to next data shard
    method next_batch (line 366) | def next_batch(self):
  function write_fp32 (line 383) | def write_fp32(tensor, file):
  function write_bf16 (line 388) | def write_bf16(tensor, file):
  function write_tensors (line 395) | def write_tensors(model_tensors, L, file, dtype):
  function pad_vocab (line 429) | def pad_vocab(tensor, multiple=128, value=0):
  function write_model (line 449) | def write_model(model, filename, dtype):
  function write_state (line 479) | def write_state(model, x, y, logits, loss, filename):
  function write_tokenizer (line 509) | def write_tokenizer(enc, filename):
  function print0 (line 529) | def print0(*args, **kwargs):
  function get_lr (line 716) | def get_lr(it):

FILE: train_llama3.py
  function repeat_kv (line 59) | def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
  function reshape_for_broadcast (line 73) | def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
  function apply_scaling (line 80) | def apply_scaling(freqs: torch.Tensor):
  function apply_rotary_emb (line 104) | def apply_rotary_emb(
  function precompute_freqs_cis (line 116) | def precompute_freqs_cis(
  class RMSNorm (line 133) | class RMSNorm(torch.nn.Module):
    method __init__ (line 134) | def __init__(self, dim: int, eps: float = 1e-6):
    method _norm (line 139) | def _norm(self, x):
    method forward (line 142) | def forward(self, x):
  class CausalSelfAttention (line 146) | class CausalSelfAttention(nn.Module):
    method __init__ (line 148) | def __init__(self, config):
    method forward (line 167) | def forward(self, x, freqs_cis=None, start_pos=None, mask=None):
  class MLP (line 205) | class MLP(nn.Module):
    method __init__ (line 207) | def __init__(self, config):
    method forward (line 219) | def forward(self, x):
  class Block (line 228) | class Block(nn.Module):
    method __init__ (line 230) | def __init__(self, config):
    method forward (line 237) | def forward(self, x, freqs_cis=None, start_pos=None, mask=None):
  class LlamaConfig (line 246) | class LlamaConfig:
    method __init__ (line 263) | def __init__(self, **kwargs):
  class LLaMA (line 271) | class LLaMA(nn.Module):
    method __init__ (line 273) | def __init__(self, config):
    method forward (line 295) | def forward(self, idx, targets=None, return_logits=True, start_pos=0):
    method adapt_llama_state_dict_keys (line 325) | def adapt_llama_state_dict_keys(checkpoint, config: LlamaConfig):
    method adapt_llama_state_dict_keys_hf (line 361) | def adapt_llama_state_dict_keys_hf(checkpoint, config: LlamaConfig):
    method from_pretrained_llama3_hf (line 404) | def from_pretrained_llama3_hf(cls, model_id):
    method from_pretrained_llama3_meta (line 426) | def from_pretrained_llama3_meta(cls, ckpt_dir, tokenizer_path):
    method configure_optimizers (line 444) | def configure_optimizers(self, weight_decay, learning_rate, betas, dev...
    method generate (line 476) | def generate(
  function sample_top_p (line 559) | def sample_top_p(probs, p):
  class Tokenizer (line 596) | class Tokenizer:
    method __init__ (line 607) | def __init__(self, model_path: str):
    method encode (line 661) | def encode(
    method decode (line 717) | def decode(self, t: Sequence[int]) -> str:
    method _split_whitespaces_or_nonwhitespaces (line 722) | def _split_whitespaces_or_nonwhitespaces(
  function _peek_data_shard (line 750) | def _peek_data_shard(filename):
  function _load_data_shard (line 762) | def _load_data_shard(filename):
  class DistributedShardedDataLoader (line 774) | class DistributedShardedDataLoader:
    method __init__ (line 783) | def __init__(self, filename_pattern, B, T, process_rank, num_processes):
    method reset (line 806) | def reset(self):
    method advance (line 814) | def advance(self): # advance to next data shard
    method next_batch (line 819) | def next_batch(self):
  function write_fp32 (line 836) | def write_fp32(tensor, file):
  function write_bf16 (line 841) | def write_bf16(tensor, file):
  function write_tensors (line 848) | def write_tensors(model_tensors, L, file, dtype):
  function write_model (line 870) | def write_model(model, filename, dtype):
  function write_state (line 903) | def write_state(model, x, y, logits, loss, filename):
  function print0 (line 930) | def print0(*args, **kwargs):
  function get_lr (line 1109) | def get_lr(it):