SYMBOL INDEX (66 symbols across 18 files)

FILE: ablations/evaluation/launch_evals.py
  function parse_date (line 22) | def parse_date(date_string: Optional[str]) -> Optional[datetime]:
  function checkpoint_exists (line 31) | def checkpoint_exists(logging_dir: str, model_name: str, checkpoint: str...
  function launch_slurm_job (line 49) | def launch_slurm_job(launch_file_contents, *args):
  function get_checkpoints_to_run (line 69) | def get_checkpoints_to_run(s3_path: str, model_name: str, checkpoints: s...

FILE: ablations/evaluation/launch_random_evals.py
  function launch_slurm_job (line 11) | def launch_slurm_job(launch_file_contents, *args):

FILE: ablations/training/launch_exp.py
  function launch_slurm_job (line 70) | def launch_slurm_job(launch_file_contents, *args):
  function print_differences (line 133) | def print_differences(target, updates):

FILE: fineweb-2-pipeline.py
  function above_lang_threshold (line 109) | def above_lang_threshold(doc, threshold):

FILE: misc/precision_filtering/count_common.py
  function generate_tokenizer_identifier (line 11) | def generate_tokenizer_identifier(row):
  function load_and_save_tokenizer_freq (line 44) | def load_and_save_tokenizer_freq(tokenizer_id, selected_language):

FILE: misc/precision_filtering/run_precision_filtering.py
  class Decontaminate (line 11) | class Decontaminate(BaseFilter):
    method __init__ (line 12) | def __init__(self, lang_code, language,
    method wordlist (line 50) | def wordlist(self):
    method wordlist_filter (line 57) | def wordlist_filter(self, doc):
    method filter (line 65) | def filter(self, document):
    method url_filter (line 68) | def url_filter(self, document):

FILE: misc/precision_filtering/slurm_count_word.py
  function generate_tokenizer_identifier (line 9) | def generate_tokenizer_identifier(row):
  class ComputeTermFrequency (line 26) | class ComputeTermFrequency(PipelineStep):
    method __init__ (line 27) | def __init__(self, tokenizer_name: str, example_language: str):
    method run (line 31) | def run(self, _data, rank, world_size):

FILE: misc/precision_filtering/wordlist_gen.py
  function filter_top_percentile (line 7) | def filter_top_percentile(counter, percentile=95):
  function filter_by_ratio (line 24) | def filter_by_ratio(counter1, counter2, threshold=0.85):
  function save (line 42) | def save(tokenizer_id, selected_language):
  function generate_tokenizer_identifier (line 70) | def generate_tokenizer_identifier(row):

FILE: misc/precision_filtering/wordlist_score.py
  function load_words_from_txt (line 3) | def load_words_from_txt(folder_path):
  function filter_score (line 51) | def filter_score(text, lang):

FILE: misc/reference_datasets/monolingual/ar/download_arabicweb24.py
  class ArrowReader (line 8) | class ArrowReader(BaseDiskReader):
    method __init__ (line 12) | def __init__(
    method read_file (line 48) | def read_file(self, filepath: str):

FILE: misc/reference_datasets/monolingual/fr/download_croissant.py
  class ArrowReader (line 7) | class ArrowReader(BaseDiskReader):
    method __init__ (line 11) | def __init__(
    method read_file (line 47) | def read_file(self, filepath: str):

FILE: misc/reference_datasets/monolingual/zh/download_mapcc.py
  class ConcatenatedFileStream (line 10) | class ConcatenatedFileStream:
    method __init__ (line 11) | def __init__(self, filepaths):
    method _open_next_file (line 17) | def _open_next_file(self):
    method read (line 27) | def read(self, size=-1):
    method close (line 42) | def close(self):
  class JsonlPartReader (line 46) | class JsonlPartReader(JsonlReader):
    method __init__ (line 47) | def __init__(
    method read_files_shard (line 67) | def read_files_shard(self, shard: list[str]):
  function open_concatenated_gzip_files (line 105) | def open_concatenated_gzip_files(filepaths):
  class ExtractMapccStep (line 114) | class ExtractMapccStep(PipelineStep):
    method run (line 119) | def run(self, data, rank: int = 0, world_size: int = 1):
  class CollectMapccStep (line 126) | class CollectMapccStep(PipelineStep):
    method run (line 136) | def run(self, data, rank: int = 0, world_size: int = 1):

FILE: misc/reference_datasets/multilingual/copy_raw_data.py
  function adapter (line 8) | def adapter(self, data: dict, path: str, id_in_file: int | str):
  class CachedListReader (line 29) | class CachedListReader(JsonlReader):
    method __init__ (line 30) | def __init__(self,
    method run (line 63) | def run(self, data=None, rank: int = 0, world_size: int = 1):
    method read_file (line 96) | def read_file(self, filepath: str):

FILE: misc/reference_datasets/multilingual/download_cc-100.py
  class CC100Reader (line 5) | class CC100Reader(PipelineStep):
    method run (line 6) | def run(self, data=None, rank: int = 0, world_size: int = 1):

FILE: misc/reference_datasets/multilingual/download_culturax.py
  function adapter (line 6) | def adapter(self, data: dict, path: str, id_in_file: int | str):

FILE: misc/reference_datasets/multilingual/download_hplt.py
  class HPLTReader (line 6) | class HPLTReader(JsonlReader):
    method run (line 8) | def run(self, data=None, rank: int = 0, world_size: int = 1):

FILE: misc/reference_datasets/multilingual/download_mc4.py
  function adapter (line 7) | def adapter(self, data: dict, path: str, id_in_file: int | str):

FILE: misc/reference_datasets/multilingual/part jsons.py
  class ConcatenatedFileStream (line 9) | class ConcatenatedFileStream:
    method __init__ (line 10) | def __init__(self, filepaths):
    method _open_next_file (line 16) | def _open_next_file(self):
    method read (line 26) | def read(self, size=-1):
    method close (line 41) | def close(self):
  function open_concatenated_gzip_files (line 46) | def open_concatenated_gzip_files(filepaths):
  class JsonlPartReader (line 56) | class JsonlPartReader(JsonlReader):
    method __init__ (line 57) | def __init__(
    method read_files_shard (line 77) | def read_files_shard(self, shard: list[str]):
    method read_file (line 115) | def read_file(self, filepath: str):