SYMBOL INDEX (34 symbols across 10 files) FILE: analysis_scripts/duplicates.py function same_adjacent_entry (line 37) | def same_adjacent_entry(entry, index): FILE: analysis_scripts/term_counts.py function build_count_dict (line 75) | def build_count_dict(examples): function term_counts (line 133) | def term_counts(text): FILE: analysis_scripts/timestamp_dist.py function reject_outliers (line 41) | def reject_outliers(data, m = 10.): FILE: analysis_scripts/url_dist.py function build_count_dict (line 60) | def build_count_dict(examples): FILE: pipeline_scripts/common_crawl/combine_last_modified_with_text_dataset.py function build_last_modified_dict (line 47) | def build_last_modified_dict(examples): FILE: pipeline_scripts/common_crawl/deduplicate.py function check_for_ending_example_in_cluster (line 38) | def check_for_ending_example_in_cluster(example, index, column, last_ind... function check_for_ending_example_in_cluster (line 72) | def check_for_ending_example_in_cluster(example, index, column, last_ind... class DatasetColumnIterator (line 96) | class DatasetColumnIterator(): method __init__ (line 97) | def __init__(self, dataset, column): method __iter__ (line 101) | def __iter__(self): method __next__ (line 104) | def __next__(self): function remove_slice_list (line 111) | def remove_slice_list(string, slice_list): FILE: pipeline_scripts/common_crawl/experimental/filter_for_only_updated_websites.py function check_for_adjacent_duplicate_url (line 32) | def check_for_adjacent_duplicate_url(example, index): function check_for_ending_or_beginning_example_in_url_cluster (line 47) | def check_for_ending_or_beginning_example_in_url_cluster(example, index): function check_for_updated_example_in_url_pair (line 58) | def check_for_updated_example_in_url_pair(example, index): FILE: pipeline_scripts/common_crawl/experimental/kenlm/model.py class SentencePiece (line 11) | class SentencePiece: method __init__ (line 12) | def __init__( method do (line 20) | def do(self, text: dict) -> dict: class KenlmModel (line 25) | class KenlmModel: method __init__ (line 70) | def __init__( method from_pretrained (line 87) | def from_pretrained( method pp (line 101) | def pp(self, log_score, length): method get_perplexity (line 104) | def get_perplexity(self, doc: str, normalize_cc_net: bool = True): method normalize (line 123) | def normalize( method strip_accents (line 147) | def strip_accents(self, line: str) -> str: method replace_unicode_punct (line 155) | def replace_unicode_punct(self, text: str) -> str: method remove_unicode_punct (line 158) | def remove_unicode_punct(self, text: str) -> str: method remove_non_printing_char (line 162) | def remove_non_printing_char(self, text: str) -> str: FILE: pipeline_scripts/common_crawl/get_last_modified_dataset_from_wat_downloads.py function split_a_into_n_parts (line 28) | def split_a_into_n_parts(a, n): function get_dataset (line 36) | def get_dataset(filenames): FILE: pipeline_scripts/common_crawl/get_text_dataset_from_wet_downloads.py function split_a_into_n_parts (line 30) | def split_a_into_n_parts(a, n): function do_parallel_pipeline_processing (line 38) | def do_parallel_pipeline_processing(dirs_awaiting_processing): function convert_to_parquet_and_reformat (line 74) | def convert_to_parquet_and_reformat(ungoliant_pipeline_output_dir):