SYMBOL INDEX (66 symbols across 18 files) FILE: ablations/evaluation/launch_evals.py function parse_date (line 22) | def parse_date(date_string: Optional[str]) -> Optional[datetime]: function checkpoint_exists (line 31) | def checkpoint_exists(logging_dir: str, model_name: str, checkpoint: str... function launch_slurm_job (line 49) | def launch_slurm_job(launch_file_contents, *args): function get_checkpoints_to_run (line 69) | def get_checkpoints_to_run(s3_path: str, model_name: str, checkpoints: s... FILE: ablations/evaluation/launch_random_evals.py function launch_slurm_job (line 11) | def launch_slurm_job(launch_file_contents, *args): FILE: ablations/training/launch_exp.py function launch_slurm_job (line 70) | def launch_slurm_job(launch_file_contents, *args): function print_differences (line 133) | def print_differences(target, updates): FILE: fineweb-2-pipeline.py function above_lang_threshold (line 109) | def above_lang_threshold(doc, threshold): FILE: misc/precision_filtering/count_common.py function generate_tokenizer_identifier (line 11) | def generate_tokenizer_identifier(row): function load_and_save_tokenizer_freq (line 44) | def load_and_save_tokenizer_freq(tokenizer_id, selected_language): FILE: misc/precision_filtering/run_precision_filtering.py class Decontaminate (line 11) | class Decontaminate(BaseFilter): method __init__ (line 12) | def __init__(self, lang_code, language, method wordlist (line 50) | def wordlist(self): method wordlist_filter (line 57) | def wordlist_filter(self, doc): method filter (line 65) | def filter(self, document): method url_filter (line 68) | def url_filter(self, document): FILE: misc/precision_filtering/slurm_count_word.py function generate_tokenizer_identifier (line 9) | def generate_tokenizer_identifier(row): class ComputeTermFrequency (line 26) | class ComputeTermFrequency(PipelineStep): method __init__ (line 27) | def __init__(self, tokenizer_name: str, example_language: str): method run (line 31) | def run(self, _data, rank, world_size): FILE: misc/precision_filtering/wordlist_gen.py function filter_top_percentile (line 7) | def filter_top_percentile(counter, percentile=95): function filter_by_ratio (line 24) | def filter_by_ratio(counter1, counter2, threshold=0.85): function save (line 42) | def save(tokenizer_id, selected_language): function generate_tokenizer_identifier (line 70) | def generate_tokenizer_identifier(row): FILE: misc/precision_filtering/wordlist_score.py function load_words_from_txt (line 3) | def load_words_from_txt(folder_path): function filter_score (line 51) | def filter_score(text, lang): FILE: misc/reference_datasets/monolingual/ar/download_arabicweb24.py class ArrowReader (line 8) | class ArrowReader(BaseDiskReader): method __init__ (line 12) | def __init__( method read_file (line 48) | def read_file(self, filepath: str): FILE: misc/reference_datasets/monolingual/fr/download_croissant.py class ArrowReader (line 7) | class ArrowReader(BaseDiskReader): method __init__ (line 11) | def __init__( method read_file (line 47) | def read_file(self, filepath: str): FILE: misc/reference_datasets/monolingual/zh/download_mapcc.py class ConcatenatedFileStream (line 10) | class ConcatenatedFileStream: method __init__ (line 11) | def __init__(self, filepaths): method _open_next_file (line 17) | def _open_next_file(self): method read (line 27) | def read(self, size=-1): method close (line 42) | def close(self): class JsonlPartReader (line 46) | class JsonlPartReader(JsonlReader): method __init__ (line 47) | def __init__( method read_files_shard (line 67) | def read_files_shard(self, shard: list[str]): function open_concatenated_gzip_files (line 105) | def open_concatenated_gzip_files(filepaths): class ExtractMapccStep (line 114) | class ExtractMapccStep(PipelineStep): method run (line 119) | def run(self, data, rank: int = 0, world_size: int = 1): class CollectMapccStep (line 126) | class CollectMapccStep(PipelineStep): method run (line 136) | def run(self, data, rank: int = 0, world_size: int = 1): FILE: misc/reference_datasets/multilingual/copy_raw_data.py function adapter (line 8) | def adapter(self, data: dict, path: str, id_in_file: int | str): class CachedListReader (line 29) | class CachedListReader(JsonlReader): method __init__ (line 30) | def __init__(self, method run (line 63) | def run(self, data=None, rank: int = 0, world_size: int = 1): method read_file (line 96) | def read_file(self, filepath: str): FILE: misc/reference_datasets/multilingual/download_cc-100.py class CC100Reader (line 5) | class CC100Reader(PipelineStep): method run (line 6) | def run(self, data=None, rank: int = 0, world_size: int = 1): FILE: misc/reference_datasets/multilingual/download_culturax.py function adapter (line 6) | def adapter(self, data: dict, path: str, id_in_file: int | str): FILE: misc/reference_datasets/multilingual/download_hplt.py class HPLTReader (line 6) | class HPLTReader(JsonlReader): method run (line 8) | def run(self, data=None, rank: int = 0, world_size: int = 1): FILE: misc/reference_datasets/multilingual/download_mc4.py function adapter (line 7) | def adapter(self, data: dict, path: str, id_in_file: int | str): FILE: misc/reference_datasets/multilingual/part jsons.py class ConcatenatedFileStream (line 9) | class ConcatenatedFileStream: method __init__ (line 10) | def __init__(self, filepaths): method _open_next_file (line 16) | def _open_next_file(self): method read (line 26) | def read(self, size=-1): method close (line 41) | def close(self): function open_concatenated_gzip_files (line 46) | def open_concatenated_gzip_files(filepaths): class JsonlPartReader (line 56) | class JsonlPartReader(JsonlReader): method __init__ (line 57) | def __init__( method read_files_shard (line 77) | def read_files_shard(self, shard: list[str]): method read_file (line 115) | def read_file(self, filepath: str):