SYMBOL INDEX (126 symbols across 17 files)

FILE: data_selection/base.py
  function default_load_dataset_fn (line 18) | def default_load_dataset_fn(path: str) -> Iterable[Dict]:
  function default_parse_example_fn (line 30) | def default_parse_example_fn(ex: Dict) -> str:
  function _iterate_virtually_sharded_dataset (line 39) | def _iterate_virtually_sharded_dataset(dataset: Iterable, num_shards: in...
  class DSIR (line 46) | class DSIR():
    method __init__ (line 50) | def __init__(self,
    method _get_virtually_sharded_datasets (line 97) | def _get_virtually_sharded_datasets(self, datasets: List[str]):
    method featurizer (line 116) | def featurizer(self, text: str) -> np.ndarray:
    method importance_estimator (line 120) | def importance_estimator(self, features: np.ndarray) -> Union[float, n...
    method get_perexample_metadata (line 124) | def get_perexample_metadata(self, ex: Dict, features: np.ndarray) -> n...
    method fit_importance_estimator (line 133) | def fit_importance_estimator(self) -> None:
    method compute_importance_weights (line 140) | def compute_importance_weights(self) -> None:
    method perexample_metadata_filter (line 181) | def perexample_metadata_filter(self, concat_metadata: np.ndarray) -> n...
    method resample (line 185) | def resample(self, out_dir: str, num_to_sample: int, cache_dir: str = ...
    method save (line 304) | def save(self, path: str) -> None:
    method load (line 310) | def load(self, path: str, exclude_keys: Optional[List[str]] = None) ->...

FILE: data_selection/hashed_ngram_dsir.py
  function hash_buckets (line 22) | def hash_buckets(text: str, num_buckets: int = 10000) -> int:
  function get_ngram_counts (line 26) | def get_ngram_counts(line: str,
  class HashedNgramDSIR (line 54) | class HashedNgramDSIR(DSIR):
    method __init__ (line 57) | def __init__(self,
    method featurizer (line 116) | def featurizer(self, text: str) -> np.ndarray:
    method importance_estimator (line 119) | def importance_estimator(self, features: np.ndarray) -> Union[float, n...
    method get_perexample_metadata (line 122) | def get_perexample_metadata(self, ex: Dict, features: np.ndarray) -> int:
    method perexample_metadata_filter (line 127) | def perexample_metadata_filter(self, concat_metadata: np.ndarray) -> n...
    method _fit_bow (line 131) | def _fit_bow(self,
    method fit_importance_estimator (line 168) | def fit_importance_estimator(self, num_tokens_to_fit: Union[str, int] ...

FILE: data_selection/utils.py
  function parallelize (line 5) | def parallelize(fn: Callable, args: List[Any], num_proc: int):

FILE: experimental/data_selection/dsir_general/data_selection.py
  function compute_ngrams_raw (line 16) | def compute_ngrams_raw(args, in_path: str, cache_path: Path):
  function compute_importance_weights (line 41) | def compute_importance_weights(args, in_path: str):
  function resample (line 70) | def resample(args, data_files, cache_ds_dir, streaming=False):

FILE: experimental/data_selection/dsir_general/utils.py
  function hash_buckets (line 15) | def hash_buckets(text, num_buckets=1e4):
  function get_ngram_info (line 22) | def get_ngram_info(line, n=2, num_buckets=10000):
  function linecount (line 34) | def linecount(filename):
  function transform_text (line 44) | def transform_text(text):
  function repeating_filter (line 48) | def repeating_filter(x_tok, n=1):
  function mostly_uninformative_filter (line 59) | def mostly_uninformative_filter(x_tok):
  function numeric_filter (line 66) | def numeric_filter(x_tok):

FILE: experimental/data_selection/dsir_pipeline.py
  function get_quality_mask (line 64) | def get_quality_mask(quality_scores):
  function hash_buckets (line 77) | def hash_buckets(string, num_buckets=10e4):
  function unigrams_bigrams (line 81) | def unigrams_bigrams(text):
  function get_ngram_info (line 87) | def get_ngram_info(line, n=2, num_buckets=10000):
  function grouper (line 99) | def grouper(iterable, n, *, incomplete='fill', fillvalue=None):
  function compute_ngrams_hf (line 115) | def compute_ngrams_hf(ds_name, ds_dir, cache_dir, ngrams, num_buckets):
  function compute_ngrams_pile (line 141) | def compute_ngrams_pile(
  function compute_importance_weights (line 174) | def compute_importance_weights(
  function compute_domain_idxs (line 198) | def compute_domain_idxs(filter_domains):
  function resample (line 231) | def resample(ds_dir, cache_ds_dir, num_to_retrieve):
  function linecount (line 327) | def linecount(filename):

FILE: experimental/data_selection/heuristic_cls_pipeline.py
  function transform_text (line 25) | def transform_text(text):
  function batch_process (line 29) | def batch_process(e, text_cols, label_col, fixed_label=None):
  function reformat_dataset (line 41) | def reformat_dataset(ds_name, output_dir, cache_dir, num_proc=10, fixed_...
  function replace_label (line 100) | def replace_label(line, label):
  function mix_dataset (line 109) | def mix_dataset(ds_dir, pile_val_dir):
  function prepare_fasttext_dataset (line 159) | def prepare_fasttext_dataset(ds_dir):
  function make_prediction (line 195) | def make_prediction(line, model):
  function process (line 210) | def process(line):
  function make_prediction_chunk (line 214) | def make_prediction_chunk(ds_path, model_path, chunk_idx):
  function predict_chunk (line 230) | def predict_chunk(model_path, ds_dir, chunk_idx):
  function compute_domain_idxs (line 264) | def compute_domain_idxs(filter_domains):
  function retrieve_from_pile (line 296) | def retrieve_from_pile(model_path, num_to_retrieve, ds_dir):

FILE: experimental/glue_eval/read_glue_results.py
  function read_file (line 16) | def read_file(path, task_name):
  function parse_file_name (line 24) | def parse_file_name(name):

FILE: experimental/glue_eval/run_glue.py
  class DataTrainingArguments (line 74) | class DataTrainingArguments:
    method __post_init__ (line 139) | def __post_init__(self):
  class ModelArguments (line 158) | class ModelArguments:
  function main (line 193) | def main():
  function _mp_fn (line 566) | def _mp_fn(index):

FILE: experimental/preprocessing/quality_scores/compute_quality_stats.py
  function transform_text (line 22) | def transform_text(text):
  function length_filter (line 26) | def length_filter(x_tok):
  function repeating_filter (line 30) | def repeating_filter(x_tok):
  function mostly_uninformative_filter (line 38) | def mostly_uninformative_filter(x_tok):
  function numeric_filter (line 45) | def numeric_filter(x_tok):
  function process (line 55) | def process(example):

FILE: experimental/preprocessing/reformat_and_chunk_data.py
  function chunk_examples (line 20) | def chunk_examples(examples, chunk_length=CHUNK_LENGTH):
  function add_id (line 29) | def add_id(examples, idx):
  function main (line 33) | def main(args):

FILE: experimental/train/collator.py
  class DataCollatorForLanguageModeling (line 11) | class DataCollatorForLanguageModeling:
    method __post_init__ (line 18) | def __post_init__(self):
    method __call__ (line 25) | def __call__(
    method mask_tokens (line 46) | def mask_tokens(

FILE: experimental/train/model.py
  class BertForMaskedLM (line 15) | class BertForMaskedLM(BertPreTrainedModel):
    method __init__ (line 20) | def __init__(self, config):
    method get_output_embeddings (line 34) | def get_output_embeddings(self):
    method set_output_embeddings (line 37) | def set_output_embeddings(self, new_embeddings):
    method set_args (line 40) | def set_args(self, args):
    method forward (line 43) | def forward(
  class RobertaForMaskedLM (line 93) | class RobertaForMaskedLM(RobertaPreTrainedModel):
    method __init__ (line 98) | def __init__(self, config):
    method get_output_embeddings (line 112) | def get_output_embeddings(self):
    method set_output_embeddings (line 115) | def set_output_embeddings(self, new_embeddings):
    method set_args (line 118) | def set_args(self, args):
    method forward (line 121) | def forward(

FILE: experimental/train/run_pipeline.py
  function parse_args (line 39) | def parse_args():
  function get_logger (line 236) | def get_logger(args, accelerator=None):
  function get_dataset (line 279) | def get_dataset(args, preprocessed_cache):
  function preprocess (line 294) | def preprocess(args, raw_datasets, tokenizer, logger, preprocessed_cache):
  function get_model (line 352) | def get_model(args, load_model=True):
  function main (line 406) | def main():

FILE: experimental/train/trainer.py
  class PretrainTrainer (line 12) | class PretrainTrainer:
    method __init__ (line 14) | def __init__(self,
    method _move_to_device (line 40) | def _move_to_device(self, batch):
    method _save_model (line 45) | def _save_model(self, save_path=None):
    method _save_trained (line 53) | def _save_trained(self, save_path=None):
    method evaluate (line 69) | def evaluate(self):
    method _get_batch (line 72) | def _get_batch(self):
    method compute_loss (line 85) | def compute_loss(self):
    method _prepare_from_checkpoint (line 94) | def _prepare_from_checkpoint(self):
    method update (line 116) | def update(self, tr_loss, loss_step):
    method train (line 149) | def train(self):

FILE: tests/test_hashed_ngram.py
  function parse_example_fn (line 16) | def parse_example_fn(ex):
  function dsir_obj (line 21) | def dsir_obj():
  function dsir_obj_diffparams (line 39) | def dsir_obj_diffparams():
  function dsir_obj_septarget (line 57) | def dsir_obj_septarget():
  function test_hash_buckets (line 76) | def test_hash_buckets():
  function test_get_ngram_counts (line 84) | def test_get_ngram_counts():
  function test_virtual_shards (line 98) | def test_virtual_shards(dsir_obj):
  function test_length_metadata (line 102) | def test_length_metadata(dsir_obj):
  function test_fit (line 109) | def test_fit(dsir_obj):
  function test_compute (line 150) | def test_compute(dsir_obj):
  function test_resample (line 160) | def test_resample(dsir_obj):
  function test_resample_diffparams (line 188) | def test_resample_diffparams(dsir_obj_diffparams):
  function test_resample_septarget (line 218) | def test_resample_septarget(dsir_obj_septarget):
  function test_resample_virtual_sharding (line 254) | def test_resample_virtual_sharding():
  function test_smoothing (line 295) | def test_smoothing(dsir_obj):
  function test_save_load (line 336) | def test_save_load(dsir_obj):

FILE: tests/test_utils.py
  function job (line 4) | def job(arg):
  function test_parallelize (line 8) | def test_parallelize():