SYMBOL INDEX (34 symbols across 10 files)

FILE: analysis_scripts/duplicates.py
  function same_adjacent_entry (line 37) | def same_adjacent_entry(entry, index):

FILE: analysis_scripts/term_counts.py
  function build_count_dict (line 75) | def build_count_dict(examples):
  function term_counts (line 133) | def term_counts(text):

FILE: analysis_scripts/timestamp_dist.py
  function reject_outliers (line 41) | def reject_outliers(data, m = 10.):

FILE: analysis_scripts/url_dist.py
  function build_count_dict (line 60) | def build_count_dict(examples):

FILE: pipeline_scripts/common_crawl/combine_last_modified_with_text_dataset.py
  function build_last_modified_dict (line 47) | def build_last_modified_dict(examples):

FILE: pipeline_scripts/common_crawl/deduplicate.py
  function check_for_ending_example_in_cluster (line 38) | def check_for_ending_example_in_cluster(example, index, column, last_ind...
  function check_for_ending_example_in_cluster (line 72) | def check_for_ending_example_in_cluster(example, index, column, last_ind...
  class DatasetColumnIterator (line 96) | class DatasetColumnIterator():
    method __init__ (line 97) | def __init__(self, dataset, column):
    method __iter__ (line 101) | def __iter__(self):
    method __next__ (line 104) | def __next__(self):
  function remove_slice_list (line 111) | def remove_slice_list(string, slice_list):

FILE: pipeline_scripts/common_crawl/experimental/filter_for_only_updated_websites.py
  function check_for_adjacent_duplicate_url (line 32) | def check_for_adjacent_duplicate_url(example, index):
  function check_for_ending_or_beginning_example_in_url_cluster (line 47) | def check_for_ending_or_beginning_example_in_url_cluster(example, index):
  function check_for_updated_example_in_url_pair (line 58) | def check_for_updated_example_in_url_pair(example, index):

FILE: pipeline_scripts/common_crawl/experimental/kenlm/model.py
  class SentencePiece (line 11) | class SentencePiece:
    method __init__ (line 12) | def __init__(
    method do (line 20) | def do(self, text: dict) -> dict:
  class KenlmModel (line 25) | class KenlmModel:
    method __init__ (line 70) | def __init__(
    method from_pretrained (line 87) | def from_pretrained(
    method pp (line 101) | def pp(self, log_score, length):
    method get_perplexity (line 104) | def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
    method normalize (line 123) | def normalize(
    method strip_accents (line 147) | def strip_accents(self, line: str) -> str:
    method replace_unicode_punct (line 155) | def replace_unicode_punct(self, text: str) -> str:
    method remove_unicode_punct (line 158) | def remove_unicode_punct(self, text: str) -> str:
    method remove_non_printing_char (line 162) | def remove_non_printing_char(self, text: str) -> str:

FILE: pipeline_scripts/common_crawl/get_last_modified_dataset_from_wat_downloads.py
  function split_a_into_n_parts (line 28) | def split_a_into_n_parts(a, n):
  function get_dataset (line 36) | def get_dataset(filenames):

FILE: pipeline_scripts/common_crawl/get_text_dataset_from_wet_downloads.py
  function split_a_into_n_parts (line 30) | def split_a_into_n_parts(a, n):
  function do_parallel_pipeline_processing (line 38) | def do_parallel_pipeline_processing(dirs_awaiting_processing):
  function convert_to_parquet_and_reformat (line 74) | def convert_to_parquet_and_reformat(ungoliant_pipeline_output_dir):