SYMBOL INDEX (272 symbols across 78 files) FILE: DomainSpecific/core/data.py class DataType (line 6) | class DataType(Enum): method belong (line 44) | def belong(a, b): class Data (line 52) | class Data: method __init__ (line 56) | def __init__(self, type=DataType.Mem_Any, value=None): FILE: DomainSpecific/core/layer.py class JointType (line 12) | class JointType(Enum): class Layer (line 17) | class Layer: method __init__ (line 18) | def __init__(self, type, joint=JointType.Default, repetition=1, param=... method __call__ (line 27) | def __call__(self, inputs, worker_id=0, worker_num=1, variables=dict()): FILE: DomainSpecific/core/layers/__init__.py class LayerType (line 21) | class LayerType(Enum): FILE: DomainSpecific/core/layers/control/data_concat_layer.py function data_concat_layer (line 9) | def data_concat_layer(lists, variables=dict()): FILE: DomainSpecific/core/layers/control/data_filter_layer.py function data_filter_layer (line 9) | def data_filter_layer(lines, variables=dict(), IN=False, FILTERS=(None,)): FILE: DomainSpecific/core/layers/control/data_order_layer.py function data_order_layer (line 9) | def data_order_layer(lines, variables=dict(), REVERSE=False): FILE: DomainSpecific/core/layers/control/data_partition_layer.py function data_partition_layer (line 9) | def data_partition_layer(lines, variables=dict(), WORKER_ID=-1): FILE: DomainSpecific/core/layers/control/data_sample_layer.py function data_sample_layer (line 10) | def data_sample_layer(lines, variables=dict(), N=-1, SEED=1): FILE: DomainSpecific/core/layers/control/data_shuffle_layer.py function data_shuffle_layer (line 10) | def data_shuffle_layer(lines, variables=dict(), SEED=1): FILE: DomainSpecific/core/layers/extract/build_index_layer.py function build_index_layer (line 9) | def build_index_layer(base_vectors, variables=dict(), SEED=1, DIM=4096, ... FILE: DomainSpecific/core/layers/extract/extract_article_layer.py function filter_tags_in_html (line 16) | def filter_tags_in_html(soup): function lid (line 43) | def lid(soup, model): function get_main_text_html (line 59) | def get_main_text_html(soup): function remove_dup_newline (line 68) | def remove_dup_newline(text): class User_MarkdownConverter (line 74) | class User_MarkdownConverter(MarkdownConverter): method convert_tr (line 75) | def convert_tr(self, el, text, convert_as_inline): method convert_a (line 98) | def convert_a(self, el, text, convert_as_inline): method convert_pre (line 117) | def convert_pre(self, el, text, convert_as_inline): function html2text (line 127) | def html2text(soup, **options): function trans2md (line 147) | def trans2md(html): function _patch_newspaper_parser_clean (line 157) | def _patch_newspaper_parser_clean(cls, node): function extract (line 161) | def extract(soup): function extract_article_layer (line 166) | def extract_article_layer(id_html, variables=dict()): FILE: DomainSpecific/core/layers/extract/search_index_layer.py function search_index_layer (line 10) | def search_index_layer(index, query_vectors, variables=dict(), TOPK=1): FILE: DomainSpecific/core/layers/io/from_binary_file_layer.py function from_binary_file_layer (line 10) | def from_binary_file_layer(file_path, variables=dict(), STORAGE_PATH=None): FILE: DomainSpecific/core/layers/io/from_index_file_layer.py function from_index_file_layer (line 11) | def from_index_file_layer(file_path, variables=dict(), STORAGE_PATH=None): FILE: DomainSpecific/core/layers/io/from_jsonl_file_layer.py function from_jsonl_file_layer (line 11) | def from_jsonl_file_layer(file_path, variables=dict(), STORAGE_PATH=None): FILE: DomainSpecific/core/layers/io/from_line_file_layer.py function from_line_file_layer (line 10) | def from_line_file_layer(file_path, variables=dict(), STORAGE_PATH=None): FILE: DomainSpecific/core/layers/io/from_parquet_file_layer.py function from_parquet_file_layer (line 12) | def from_parquet_file_layer(file_path, variables=dict(), STORAGE_PATH=No... FILE: DomainSpecific/core/layers/io/from_warc_file_layer.py function from_warc_file_layer (line 11) | def from_warc_file_layer(file_path, variables=dict(), STORAGE_PATH=None): FILE: DomainSpecific/core/layers/io/from_wet_file_layer.py function from_wet_file_layer (line 11) | def from_wet_file_layer(file_path, variables=dict(), STORAGE_PATH=None): FILE: DomainSpecific/core/layers/io/to_binary_file_layer.py function to_binary_file_layer (line 10) | def to_binary_file_layer(bytes, file_path, variables=dict(), STORAGE_PAT... FILE: DomainSpecific/core/layers/io/to_index_file_layer.py function to_index_file_layer (line 11) | def to_index_file_layer(index, file_path, variables=dict(), STORAGE_PATH... FILE: DomainSpecific/core/layers/io/to_jsonl_file_layer.py function to_jsonl_file_layer (line 11) | def to_jsonl_file_layer(data, file_path, variables=dict(), STORAGE_PATH=... FILE: DomainSpecific/core/layers/io/to_line_file_layer.py function to_line_file_layer (line 10) | def to_line_file_layer(lines, file_path, variables=dict(), STORAGE_PATH=... FILE: DomainSpecific/core/layers/io/to_parquet_file_layer.py function to_parquet_file_layer (line 12) | def to_parquet_file_layer(data, file_path, variables=dict(), STORAGE_PAT... FILE: DomainSpecific/core/layers/network/download_bytes_from_blob_layer.py function download_bytes_from_blob_layer (line 11) | def download_bytes_from_blob_layer(blob_path, variables=dict(), STORAGE_... FILE: DomainSpecific/core/layers/network/download_bytes_from_internet_layer.py function download_bytes_from_internet_layer (line 11) | def download_bytes_from_internet_layer(url, variables=dict(), TRIES=1): FILE: DomainSpecific/core/layers/network/download_file_from_blob_layer.py function download_file_from_blob_layer (line 11) | def download_file_from_blob_layer(blob_path, variables=dict(), DOWNLOAD_... FILE: DomainSpecific/core/layers/network/download_file_from_internet_layer.py function download_file_from_internet_layer (line 11) | def download_file_from_internet_layer(url, variables=dict(), DOWNLOAD_PA... FILE: DomainSpecific/core/layers/network/download_starcoder_layer.py function download_contents (line 19) | def download_contents(blob_id, src_encoding): function download_starcoder_layer (line 25) | def download_starcoder_layer(data_repo, variables=dict(), OUTPUT_FOLDER=... FILE: DomainSpecific/core/layers/network/download_url_list_layer.py function download_url_list_layer (line 13) | def download_url_list_layer(index_url, variables=dict(), FILTER_SUFFIXES... FILE: DomainSpecific/core/layers/network/download_urls_from_website_layer.py function download_urls_from_website_layer (line 12) | def download_urls_from_website_layer(website_url, variables=dict(), FILT... FILE: DomainSpecific/core/layers/network/download_warc_file_layer.py function download_warc_file_layer (line 11) | def download_warc_file_layer(warc_url, variables=dict(), DOWNLOAD_FOLDER... FILE: DomainSpecific/core/layers/network/download_warc_indice_layer.py function download_warc_indice_layer (line 12) | def download_warc_indice_layer(index_url, variables=dict(), TRIES=1, URL... FILE: DomainSpecific/core/layers/network/upload_bytes_to_blob_layer.py function upload_bytes_to_blob_layer (line 11) | def upload_bytes_to_blob_layer(bytes, blob_path, variables=dict(), STORA... FILE: DomainSpecific/core/layers/network/upload_file_to_blob_layer.py function upload_file_to_blob_layer (line 11) | def upload_file_to_blob_layer(file_path, blob_path, variables=dict(), ST... FILE: DomainSpecific/core/layers/template_layer.py function template_layer (line 18) | def template_layer(input, variables=dict(), PARAM=None): FILE: DomainSpecific/core/layers/transform/lsh_minhash_layer.py class LSH (line 16) | class LSH: method __init__ (line 17) | def __init__(self): method false_positive_probability (line 24) | def false_positive_probability(self, threshold, b, r): method false_negative_probability (line 29) | def false_negative_probability(self, threshold, b, r): method optimal_param (line 34) | def optimal_param(self, threshold, num_perm, false_positive_weight, method gen_lsh (line 53) | def gen_lsh(self, minhash): function lsh_minhash_layer (line 58) | def lsh_minhash_layer(minhash, variables=dict()): FILE: DomainSpecific/core/layers/transform/math_filter_layer.py function ismath_by_model (line 28) | def ismath_by_model(text, model, thred=0.5): function math_filter_layer (line 42) | def math_filter_layer(pq_name, variables=dict(), INPUT_FOLDER="./", OUTP... FILE: DomainSpecific/core/layers/transform/mcq_filter_layer.py function detect_lang (line 21) | def detect_lang(text): function detect_choice_exercise_by_rule (line 40) | def detect_choice_exercise_by_rule(uri, html): function detect_choice_exercise_by_ft_model (line 76) | def detect_choice_exercise_by_ft_model(uri, text, thred=0.5): function detect_choice_exercise_by_LLM (line 101) | def detect_choice_exercise_by_LLM(text, engine=None): function LCS (line 115) | def LCS(str1, str2): function localize_choice_exercise_by_LLM (line 131) | def localize_choice_exercise_by_LLM(text, engine=None): function mcq_filter_layer (line 172) | def mcq_filter_layer(wet_file_name, variables=dict(), INPUT_FOLDER="./",... FILE: DomainSpecific/core/layers/transform/minhash_tokens_layer.py class MinHasher (line 16) | class MinHasher: method __init__ (line 17) | def __init__(self): method _sha1_hash (line 23) | def _sha1_hash(self, val): method hash (line 28) | def hash(self, sequence): function minhash_tokens_layer (line 39) | def minhash_tokens_layer(tokens, variables=dict()): FILE: DomainSpecific/core/layers/transform/ngrams_layer.py function ngrams_layer (line 12) | def ngrams_layer(sequence, variables=dict()): FILE: DomainSpecific/core/layers/transform/openquestion_filter_layer.py function is_openquestion_by_model (line 31) | def is_openquestion_by_model(text, model, thred=0.5): function check_yes_no_question (line 45) | def check_yes_no_question(text_before, text_after): function check_multiple_choise_question (line 53) | def check_multiple_choise_question(text_before, text_after): function check_fill_in_question (line 83) | def check_fill_in_question(text_before, text_after): function check_quality (line 89) | def check_quality(item): function openquestion_filter_layer (line 141) | def openquestion_filter_layer(pq_name, variables=dict(), INPUT_FOLDER=".... FILE: DomainSpecific/core/layers/transform/tokenize_article_layer.py function tokenize_article_layer (line 13) | def tokenize_article_layer(article, variables=dict(), SPM_MODEL_PATH="./... FILE: DomainSpecific/core/layers/transform/warc_encode_layer.py function tex_in_script_tag (line 26) | def tex_in_script_tag(text): function tex_in_math_tag (line 36) | def tex_in_math_tag(text): function tex_in_math_tag2 (line 40) | def tex_in_math_tag2(text): function mathml_in_script_tag (line 43) | def mathml_in_script_tag(text): function mathml_in_math_tag (line 47) | def mathml_in_math_tag(text): function is_tex (line 53) | def is_tex(text): function contain_tex (line 56) | def contain_tex(text): function check_latex (line 59) | def check_latex(latex): function remove_hidden_content (line 67) | def remove_hidden_content(html): function remove_attr (line 84) | def remove_attr(text, attr): function mathml_to_latex1 (line 97) | def mathml_to_latex1(text): function mathml_to_latex2 (line 105) | def mathml_to_latex2(text): function separate_content_and_tag (line 142) | def separate_content_and_tag(html, start_str, end_str, s=0): function detect_code (line 151) | def detect_code(text): function encode_code (line 173) | def encode_code(node, code_tag, not_code_tag): function filter_code (line 218) | def filter_code(html, code_tag, not_code_tag): function encode_image (line 233) | def encode_image(uri, node, image_tag): function filter_image (line 259) | def filter_image(uri, html, image_tag): function encode_video (line 274) | def encode_video(uri, node, video_tag): function filter_video (line 300) | def filter_video(uri, html, video_tag): function encode_math_html (line 315) | def encode_math_html(uri, html, encoding): function get_tag_info (line 433) | def get_tag_info(tag): function encode_code_html (line 441) | def encode_code_html(uri, html, encoding): function encode_image_html (line 463) | def encode_image_html(uri, html, encoding): function encode_video_html (line 481) | def encode_video_html(uri, html, encoding): function encode_html (line 499) | def encode_html(uri, html, encoding, TAG): function warc_encode_layer (line 514) | def warc_encode_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./... FILE: DomainSpecific/core/layers/transform/warc_filter_layer.py function warc_filter_layer (line 15) | def warc_filter_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./... FILE: DomainSpecific/core/layers/transform/warc_to_wet_layer.py function warc_to_wet_layer (line 10) | def warc_to_wet_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./... FILE: DomainSpecific/core/layers/transform/wet_decode_layer.py function decode_tag (line 17) | def decode_tag(tag): function latex2text (line 20) | def latex2text(latex, encoding="utf-8"): function separate_content_and_tag (line 27) | def separate_content_and_tag(html, start_str, end_str): function remove_number_and_merge_snippet (line 36) | def remove_number_and_merge_snippet(html, NumberThred = 7): function identify_code (line 107) | def identify_code(text): function decode_html (line 118) | def decode_html(uri, html, encoding, TAG): function wet_decode_layer (line 256) | def wet_decode_layer(wet_file_name, variables=dict(), INPUT_FOLDER="./",... FILE: DomainSpecific/core/layers/util.py function load_yaml (line 17) | def load_yaml(config_path): function save_yaml (line 24) | def save_yaml(config, config_path): function str2bytes (line 29) | def str2bytes(data): function md5 (line 33) | def md5(data): function sha256 (line 39) | def sha256(data): function suffix (line 45) | def suffix(path): function relative2absolute_path (line 49) | def relative2absolute_path(prefix, link): function create_folder_by_file_path (line 63) | def create_folder_by_file_path(local_file_path): function to_real_path (line 71) | def to_real_path(path, variables): function get_container_client (line 79) | def get_container_client(storage_config): function get_blob_client (line 101) | def get_blob_client(storage_config, blob_path): function exist_blob (line 106) | def exist_blob(container_client, blob_path): function get_blob_size (line 111) | def get_blob_size(container_client, blob_path): function list_blob_dir (line 117) | def list_blob_dir(container_client, blob_path): function create_blob_dir (line 123) | def create_blob_dir(container_client, blob_path): function upload_bytes_to_blob (line 126) | def upload_bytes_to_blob(storage_config, content, blob_path): function upload_file_to_blob (line 131) | def upload_file_to_blob(storage_config, local_path, blob_path): function upload_bytes_to_internet (line 136) | def upload_bytes_to_internet(content, blob_path): function upload_file_to_internet (line 140) | def upload_file_to_internet(local_path, blob_path): function download_bytes_from_blob (line 144) | def download_bytes_from_blob(storage_config, blob_path): function download_file_from_blob (line 149) | def download_file_from_blob(storage_config, blob_path, local_path): function download_bytes_from_internet (line 156) | def download_bytes_from_internet(url, timeout=3): function download_file_from_internet (line 167) | def download_file_from_internet(url, local_path): FILE: DomainSpecific/core/network.py class Network (line 10) | class Network: method __init__ (line 11) | def __init__(self): method set_input_names (line 18) | def set_input_names(self, input_names): method set_output_names (line 21) | def set_output_names(self, output_names): method add_data (line 24) | def add_data(self, name, value): method add_layer (line 27) | def add_layer(self, name, value): method next_layer (line 30) | def next_layer(self, invisited_layer_names): method __call__ (line 40) | def __call__(self, inputs=list(), worker_id=0, worker_num=1, variables... FILE: DomainSpecific/dependency/gpt_api.py class GPTAPI (line 16) | class GPTAPI: method __init__ (line 17) | def __init__(self, engine, endpoint, identity_id): method switch_api (line 48) | def switch_api(self, api_idx=-1): method get_tokens (line 52) | def get_tokens(self, text): method run (line 56) | def run(self, system, question, engine=None, uid=None, temperature=0.0... FILE: DomainSpecific/dependency/install.py function install (line 16) | def install(local_id, storage_path): FILE: DomainSpecific/submit.py function submit_job (line 7) | def submit_job(network_path, run_mode, docker_path, computation_path, st... FILE: DomainSpecific/tools/submit_batch_job.py function submit_batch_job (line 18) | def submit_batch_job(network_path, run_mode, docker_path, computation_pa... FILE: DomainSpecific/tools/submit_local_job.py function submit_local_job (line 10) | def submit_local_job(network_path, run_mode, docker_path, computation_pa... FILE: DomainSpecific/wrapper/interpreter.py class Interpreter (line 14) | class Interpreter: method __init__ (line 15) | def __init__(self): method check_config (line 19) | def check_config(self, config): method __call__ (line 88) | def __call__(self, config_path): FILE: DomainSpecific/wrapper/parser.py class Parser (line 9) | class Parser: method __init__ (line 10) | def __init__(self): method __call__ (line 13) | def __call__(self, config_path): FILE: DomainSpecific/wrapper/runner.py class RunMode (line 15) | class RunMode(Enum): class Runner (line 20) | class Runner: method __init__ (line 21) | def __init__(self, network_path): method __call__ (line 25) | def __call__(self, run_mode, worker_id, worker_num, workspace_dir): FILE: DomainSpecific/wrapper/utility/azure_env.py function get_local_rank (line 6) | def get_local_rank(): function get_world_rank (line 12) | def get_world_rank(): function get_world_size (line 21) | def get_world_size(): function get_process_per_node (line 33) | def get_process_per_node(): FILE: DomainSpecific/wrapper/utility/cpu_count.py function cpu_count (line 8) | def cpu_count(): FILE: DomainSpecific/wrapper/utility/load_yaml.py function load_yaml (line 7) | def load_yaml(config_path): FILE: DomainSpecific/wrapper/utility/logger.py class Logger (line 8) | class Logger: method __init__ (line 9) | def __init__(): method init (line 13) | def init(log_path=None): method debug (line 28) | def debug(msg): method info (line 32) | def info(msg): method warning (line 36) | def warning(msg): method error (line 40) | def error(msg): method critical (line 44) | def critical(msg): FILE: DomainSpecific/wrapper/utility/save_yaml.py function save_yaml (line 7) | def save_yaml(config, config_path): FILE: GeneralDomain/redstone_cc/__main__.py function main (line 10) | def main(): FILE: GeneralDomain/redstone_cc/algos/deduplication/minhash.py function gen_lsh_param (line 10) | def gen_lsh_param(num_perm, lsh_threshold): class CalcMinhash (line 14) | class CalcMinhash: method __init__ (line 15) | def __init__(self, num_perm, seed=DEFAULT_SEED, mer=DEFAULT_MER): method _sha1_hash (line 23) | def _sha1_hash(self, val): method hash (line 28) | def hash(self, sequence: list[str]) -> np.ndarray: class CalcLsh (line 38) | class CalcLsh: method __init__ (line 39) | def __init__(self, b, r): method gen_lsh (line 44) | def gen_lsh(self, minhash) -> list[bytearray]: class CalcMinhashLsh (line 48) | class CalcMinhashLsh: method __init__ (line 49) | def __init__(self, b, r, seed=DEFAULT_SEED, mer=DEFAULT_MER): method hash (line 54) | def hash(self, tokens) -> list[bytearray]: class LocalMinhashLshDedup (line 60) | class LocalMinhashLshDedup: method __init__ (line 61) | def __init__(self, b, r, seed=DEFAULT_SEED, mer=DEFAULT_MER): method add (line 66) | def add(self, id, tokens): method dedup (line 70) | def dedup(self): FILE: GeneralDomain/redstone_cc/algos/deduplication/sha1.py function sha1_hash (line 8) | def sha1_hash(line, hash_size=DEFAULT_HASH_SIZE) -> bytes: class LocalSha1Dedup (line 14) | class LocalSha1Dedup: method __init__ (line 15) | def __init__(self, hash_size): method add_line (line 20) | def add_line(self, line_id, line): method add_hashes (line 24) | def add_hashes(self, line_id, hval): method dedup (line 28) | def dedup(self): FILE: GeneralDomain/redstone_cc/algos/deduplication/utils.py function ccnet_normalize (line 13) | def ccnet_normalize(line) -> str: function slimpajama_tokenize (line 31) | def slimpajama_tokenize(text, num_ngrams=13): function spm_tokenize (line 42) | def spm_tokenize(text, spm_model, num_ngrams=5): FILE: GeneralDomain/redstone_cc/algos/fasttext_classifier.py class FastTextClassifier (line 10) | class FastTextClassifier: method __init__ (line 11) | def __init__(self, model_path): method predict (line 14) | def predict(self, text): FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/func/document.py function document_word_count (line 4) | def document_word_count(words): function document_mean_word_length (line 8) | def document_mean_word_length(words): function document_alpha_words (line 15) | def document_alpha_words(words): function document_start_with_bullet (line 33) | def document_start_with_bullet(lines): function document_end_with_ellipsis (line 47) | def document_end_with_ellipsis(lines): function document_gopher_symbols (line 54) | def document_gopher_symbols(text): function document_gopher_stopwords (line 61) | def document_gopher_stopwords(words): FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/func/line.py function line_uppercase_ratio (line 7) | def line_uppercase_ratio(line): function line_all_numeric (line 18) | def line_all_numeric(line): function line_refinedweb_counter (line 25) | def line_refinedweb_counter(line): function line_regex_match (line 29) | def line_regex_match(line, patterns): function test_line_uppercase_ratio (line 36) | def test_line_uppercase_ratio(): function test_line_all_numeric (line 46) | def test_line_all_numeric(): function test_line_refinedweb_counter (line 53) | def test_line_refinedweb_counter(): function test_line_regex_match (line 60) | def test_line_regex_match(): FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/func/repetition.py function repetition_ngram_top_char_frac (line 7) | def repetition_ngram_top_char_frac(words, n: int): function repetition_ngram_dup_char_frac (line 22) | def repetition_ngram_dup_char_frac(words, n: int): function repetition_line_dup_frac (line 35) | def repetition_line_dup_frac(lines): function test_ngram_top (line 53) | def test_ngram_top(): function test_ngram_dup (line 67) | def test_ngram_dup(): function test_dup_line (line 77) | def test_dup_line(): FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/model/document.py class Document (line 17) | class Document: method __init__ (line 18) | def __init__(self, text, lang): method sents (line 23) | def sents(self): method paragraphs (line 33) | def paragraphs(self): method normalized_text (line 37) | def normalized_text(self): method normalized_sents (line 41) | def normalized_sents(self): method normalized_words (line 45) | def normalized_words(self): FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/model/violations.py class Violations (line 6) | class Violations: method __init__ (line 7) | def __init__(self): method doc (line 12) | def doc(self, key): method line (line 17) | def line(self, key, lines: List[int]): method apply_to_doc (line 25) | def apply_to_doc(self, doc: Document) -> str | None: FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/ruleset/gopher.py function gopher_filter (line 24) | def gopher_filter(doc: Document): function apply_gopher_rules (line 79) | def apply_gopher_rules(text, lang): FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/ruleset/refinedweb.py function refinedweb_filter (line 22) | def refinedweb_filter(doc: Document): function apply_refinedweb_rules (line 68) | def apply_refinedweb_rules(text, lang): FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/utils.py function remove_url (line 14) | def remove_url(text): function remove_consecutive_new_lines (line 18) | def remove_consecutive_new_lines(text): function remove_punct (line 22) | def remove_punct(text): function normalize (line 26) | def normalize(text): FILE: GeneralDomain/redstone_cc/algos/trafilatura_process.py class EmptyResultException (line 15) | class EmptyResultException(Exception): function _remove_dup_newline (line 19) | def _remove_dup_newline(text): function _normalize_whitespace (line 29) | def _normalize_whitespace(tree): function _traf_xml_to_html (line 46) | def _traf_xml_to_html(tree): function _build_traf_doc_full (line 111) | def _build_traf_doc_full(traf_bare_res): function _build_traf_doc (line 131) | def _build_traf_doc(traf_bare_res): function _reset_caches (line 146) | def _reset_caches(): function _detect_zip_bomb (line 154) | def _detect_zip_bomb(data): function trafilatura_process (line 189) | def trafilatura_process(html): FILE: GeneralDomain/redstone_cc/download_utils.py function _url_basename (line 12) | def _url_basename(url): function _normalize_dst (line 17) | def _normalize_dst(src, dst): function detect_aria2 (line 25) | def detect_aria2(): function download_with_aria2 (line 30) | def download_with_aria2(src, dst, num_connections=16, quiet=False, extra... function download_with_requests (line 71) | def download_with_requests(src, dst): function download (line 81) | def download(src, dst): FILE: GeneralDomain/redstone_cc/process.py function process_items (line 17) | def process_items(remote_cc_path, items, disable_tqdm=False): function process_file (line 70) | def process_file(index_path):