SYMBOL INDEX (126 symbols across 17 files) FILE: data_selection/base.py function default_load_dataset_fn (line 18) | def default_load_dataset_fn(path: str) -> Iterable[Dict]: function default_parse_example_fn (line 30) | def default_parse_example_fn(ex: Dict) -> str: function _iterate_virtually_sharded_dataset (line 39) | def _iterate_virtually_sharded_dataset(dataset: Iterable, num_shards: in... class DSIR (line 46) | class DSIR(): method __init__ (line 50) | def __init__(self, method _get_virtually_sharded_datasets (line 97) | def _get_virtually_sharded_datasets(self, datasets: List[str]): method featurizer (line 116) | def featurizer(self, text: str) -> np.ndarray: method importance_estimator (line 120) | def importance_estimator(self, features: np.ndarray) -> Union[float, n... method get_perexample_metadata (line 124) | def get_perexample_metadata(self, ex: Dict, features: np.ndarray) -> n... method fit_importance_estimator (line 133) | def fit_importance_estimator(self) -> None: method compute_importance_weights (line 140) | def compute_importance_weights(self) -> None: method perexample_metadata_filter (line 181) | def perexample_metadata_filter(self, concat_metadata: np.ndarray) -> n... method resample (line 185) | def resample(self, out_dir: str, num_to_sample: int, cache_dir: str = ... method save (line 304) | def save(self, path: str) -> None: method load (line 310) | def load(self, path: str, exclude_keys: Optional[List[str]] = None) ->... FILE: data_selection/hashed_ngram_dsir.py function hash_buckets (line 22) | def hash_buckets(text: str, num_buckets: int = 10000) -> int: function get_ngram_counts (line 26) | def get_ngram_counts(line: str, class HashedNgramDSIR (line 54) | class HashedNgramDSIR(DSIR): method __init__ (line 57) | def __init__(self, method featurizer (line 116) | def featurizer(self, text: str) -> np.ndarray: method importance_estimator (line 119) | def importance_estimator(self, features: np.ndarray) -> Union[float, n... method get_perexample_metadata (line 122) | def get_perexample_metadata(self, ex: Dict, features: np.ndarray) -> int: method perexample_metadata_filter (line 127) | def perexample_metadata_filter(self, concat_metadata: np.ndarray) -> n... method _fit_bow (line 131) | def _fit_bow(self, method fit_importance_estimator (line 168) | def fit_importance_estimator(self, num_tokens_to_fit: Union[str, int] ... FILE: data_selection/utils.py function parallelize (line 5) | def parallelize(fn: Callable, args: List[Any], num_proc: int): FILE: experimental/data_selection/dsir_general/data_selection.py function compute_ngrams_raw (line 16) | def compute_ngrams_raw(args, in_path: str, cache_path: Path): function compute_importance_weights (line 41) | def compute_importance_weights(args, in_path: str): function resample (line 70) | def resample(args, data_files, cache_ds_dir, streaming=False): FILE: experimental/data_selection/dsir_general/utils.py function hash_buckets (line 15) | def hash_buckets(text, num_buckets=1e4): function get_ngram_info (line 22) | def get_ngram_info(line, n=2, num_buckets=10000): function linecount (line 34) | def linecount(filename): function transform_text (line 44) | def transform_text(text): function repeating_filter (line 48) | def repeating_filter(x_tok, n=1): function mostly_uninformative_filter (line 59) | def mostly_uninformative_filter(x_tok): function numeric_filter (line 66) | def numeric_filter(x_tok): FILE: experimental/data_selection/dsir_pipeline.py function get_quality_mask (line 64) | def get_quality_mask(quality_scores): function hash_buckets (line 77) | def hash_buckets(string, num_buckets=10e4): function unigrams_bigrams (line 81) | def unigrams_bigrams(text): function get_ngram_info (line 87) | def get_ngram_info(line, n=2, num_buckets=10000): function grouper (line 99) | def grouper(iterable, n, *, incomplete='fill', fillvalue=None): function compute_ngrams_hf (line 115) | def compute_ngrams_hf(ds_name, ds_dir, cache_dir, ngrams, num_buckets): function compute_ngrams_pile (line 141) | def compute_ngrams_pile( function compute_importance_weights (line 174) | def compute_importance_weights( function compute_domain_idxs (line 198) | def compute_domain_idxs(filter_domains): function resample (line 231) | def resample(ds_dir, cache_ds_dir, num_to_retrieve): function linecount (line 327) | def linecount(filename): FILE: experimental/data_selection/heuristic_cls_pipeline.py function transform_text (line 25) | def transform_text(text): function batch_process (line 29) | def batch_process(e, text_cols, label_col, fixed_label=None): function reformat_dataset (line 41) | def reformat_dataset(ds_name, output_dir, cache_dir, num_proc=10, fixed_... function replace_label (line 100) | def replace_label(line, label): function mix_dataset (line 109) | def mix_dataset(ds_dir, pile_val_dir): function prepare_fasttext_dataset (line 159) | def prepare_fasttext_dataset(ds_dir): function make_prediction (line 195) | def make_prediction(line, model): function process (line 210) | def process(line): function make_prediction_chunk (line 214) | def make_prediction_chunk(ds_path, model_path, chunk_idx): function predict_chunk (line 230) | def predict_chunk(model_path, ds_dir, chunk_idx): function compute_domain_idxs (line 264) | def compute_domain_idxs(filter_domains): function retrieve_from_pile (line 296) | def retrieve_from_pile(model_path, num_to_retrieve, ds_dir): FILE: experimental/glue_eval/read_glue_results.py function read_file (line 16) | def read_file(path, task_name): function parse_file_name (line 24) | def parse_file_name(name): FILE: experimental/glue_eval/run_glue.py class DataTrainingArguments (line 74) | class DataTrainingArguments: method __post_init__ (line 139) | def __post_init__(self): class ModelArguments (line 158) | class ModelArguments: function main (line 193) | def main(): function _mp_fn (line 566) | def _mp_fn(index): FILE: experimental/preprocessing/quality_scores/compute_quality_stats.py function transform_text (line 22) | def transform_text(text): function length_filter (line 26) | def length_filter(x_tok): function repeating_filter (line 30) | def repeating_filter(x_tok): function mostly_uninformative_filter (line 38) | def mostly_uninformative_filter(x_tok): function numeric_filter (line 45) | def numeric_filter(x_tok): function process (line 55) | def process(example): FILE: experimental/preprocessing/reformat_and_chunk_data.py function chunk_examples (line 20) | def chunk_examples(examples, chunk_length=CHUNK_LENGTH): function add_id (line 29) | def add_id(examples, idx): function main (line 33) | def main(args): FILE: experimental/train/collator.py class DataCollatorForLanguageModeling (line 11) | class DataCollatorForLanguageModeling: method __post_init__ (line 18) | def __post_init__(self): method __call__ (line 25) | def __call__( method mask_tokens (line 46) | def mask_tokens( FILE: experimental/train/model.py class BertForMaskedLM (line 15) | class BertForMaskedLM(BertPreTrainedModel): method __init__ (line 20) | def __init__(self, config): method get_output_embeddings (line 34) | def get_output_embeddings(self): method set_output_embeddings (line 37) | def set_output_embeddings(self, new_embeddings): method set_args (line 40) | def set_args(self, args): method forward (line 43) | def forward( class RobertaForMaskedLM (line 93) | class RobertaForMaskedLM(RobertaPreTrainedModel): method __init__ (line 98) | def __init__(self, config): method get_output_embeddings (line 112) | def get_output_embeddings(self): method set_output_embeddings (line 115) | def set_output_embeddings(self, new_embeddings): method set_args (line 118) | def set_args(self, args): method forward (line 121) | def forward( FILE: experimental/train/run_pipeline.py function parse_args (line 39) | def parse_args(): function get_logger (line 236) | def get_logger(args, accelerator=None): function get_dataset (line 279) | def get_dataset(args, preprocessed_cache): function preprocess (line 294) | def preprocess(args, raw_datasets, tokenizer, logger, preprocessed_cache): function get_model (line 352) | def get_model(args, load_model=True): function main (line 406) | def main(): FILE: experimental/train/trainer.py class PretrainTrainer (line 12) | class PretrainTrainer: method __init__ (line 14) | def __init__(self, method _move_to_device (line 40) | def _move_to_device(self, batch): method _save_model (line 45) | def _save_model(self, save_path=None): method _save_trained (line 53) | def _save_trained(self, save_path=None): method evaluate (line 69) | def evaluate(self): method _get_batch (line 72) | def _get_batch(self): method compute_loss (line 85) | def compute_loss(self): method _prepare_from_checkpoint (line 94) | def _prepare_from_checkpoint(self): method update (line 116) | def update(self, tr_loss, loss_step): method train (line 149) | def train(self): FILE: tests/test_hashed_ngram.py function parse_example_fn (line 16) | def parse_example_fn(ex): function dsir_obj (line 21) | def dsir_obj(): function dsir_obj_diffparams (line 39) | def dsir_obj_diffparams(): function dsir_obj_septarget (line 57) | def dsir_obj_septarget(): function test_hash_buckets (line 76) | def test_hash_buckets(): function test_get_ngram_counts (line 84) | def test_get_ngram_counts(): function test_virtual_shards (line 98) | def test_virtual_shards(dsir_obj): function test_length_metadata (line 102) | def test_length_metadata(dsir_obj): function test_fit (line 109) | def test_fit(dsir_obj): function test_compute (line 150) | def test_compute(dsir_obj): function test_resample (line 160) | def test_resample(dsir_obj): function test_resample_diffparams (line 188) | def test_resample_diffparams(dsir_obj_diffparams): function test_resample_septarget (line 218) | def test_resample_septarget(dsir_obj_septarget): function test_resample_virtual_sharding (line 254) | def test_resample_virtual_sharding(): function test_smoothing (line 295) | def test_smoothing(dsir_obj): function test_save_load (line 336) | def test_save_load(dsir_obj): FILE: tests/test_utils.py function job (line 4) | def job(arg): function test_parallelize (line 8) | def test_parallelize():