Repository: mcleish7/arithmetic
Branch: main
Commit: 86022a57d38c
Files: 132
Total size: 479.5 KB

Directory structure:
gitextract_shohcgjg/

├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── abacus.py
├── arithmetic_eval_quicker.py
├── cramming/
│   ├── __init__.py
│   ├── architectures/
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── components.py
│   │   ├── construction.py
│   │   ├── crammed_depthrecurrent.py
│   │   ├── crammed_transformer.py
│   │   ├── embeddings.py
│   │   ├── huggingface_interface.py
│   │   ├── losses.py
│   │   └── sanity_check.py
│   ├── backend/
│   │   ├── __init__.py
│   │   ├── optimizers/
│   │   │   ├── __init__.py
│   │   │   ├── optimizer_modifiers.py
│   │   │   ├── progressive_batching.py
│   │   │   └── schedulers.py
│   │   ├── prepare_backend.py
│   │   ├── torch_default.py
│   │   └── utils.py
│   ├── config/
│   │   ├── __init__.py
│   │   ├── arch/
│   │   │   ├── __init__.py
│   │   │   ├── albert.yaml
│   │   │   ├── crammed-depthrecurrent.yaml
│   │   │   ├── crammed-fakeRNN.yaml
│   │   │   ├── crammed-janus.yaml
│   │   │   ├── crammed-rnn.yaml
│   │   │   ├── crammed-stack-janus.yaml
│   │   │   ├── crammed-tiny.yaml
│   │   │   ├── crammed-transformer.yaml
│   │   │   ├── gpt2-base.yaml
│   │   │   ├── hf-gpt2.yaml
│   │   │   └── sanitycheck.yaml
│   │   ├── cfg_eval.yaml
│   │   ├── cfg_pretrain.yaml
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── arithmetic.yaml
│   │   │   ├── c4-subset-processed.yaml
│   │   │   ├── openweb.yaml
│   │   │   ├── proofpile.yaml
│   │   │   ├── sanity-check-1.yaml
│   │   │   ├── sanity-check-2.yaml
│   │   │   └── sources/
│   │   │       ├── ag_news.yaml
│   │   │       ├── arithmetic.yaml
│   │   │       ├── bookcorpus.yaml
│   │   │       ├── c4.yaml
│   │   │       ├── dash_books.yaml
│   │   │       ├── fake.yaml
│   │   │       ├── iwslt.yaml
│   │   │       ├── local.yaml
│   │   │       ├── no_code_stackexchange.yaml
│   │   │       ├── openwebtext.yaml
│   │   │       ├── oscar.yaml
│   │   │       ├── proofpiledata.yaml
│   │   │       ├── the_pile.yaml
│   │   │       ├── the_pileCC.yaml
│   │   │       ├── the_pile_dedup.yaml
│   │   │       ├── the_pile_natural.yaml
│   │   │       ├── the_pile_stream.yaml
│   │   │       ├── uncorpus.yaml
│   │   │       ├── uspto.yaml
│   │   │       ├── wikibooks.yaml
│   │   │       ├── wikinews.yaml
│   │   │       ├── wikipedia.yaml
│   │   │       ├── wikiquote.yaml
│   │   │       ├── wikiversity.yaml
│   │   │       └── wikivoyage.yaml
│   │   ├── eval/
│   │   │   ├── __init__.py
│   │   │   ├── pythia.yaml
│   │   │   └── tasks/
│   │   │       ├── lambada_openai.yaml
│   │   │       └── winogrande.yaml
│   │   ├── hydra/
│   │   │   ├── __init__.py
│   │   │   └── job_logging/
│   │   │       └── custom.yaml
│   │   ├── impl/
│   │   │   ├── __init__.py
│   │   │   ├── _default.yaml
│   │   │   └── torch-default.yaml
│   │   ├── train/
│   │   │   ├── __init__.py
│   │   │   ├── common.yaml
│   │   │   ├── cramming.yaml
│   │   │   ├── janus-regime.yaml
│   │   │   ├── optim/
│   │   │   │   ├── adafactor.yaml
│   │   │   │   ├── adahessian.yaml
│   │   │   │   ├── adam.yaml
│   │   │   │   ├── adam8bit.yaml
│   │   │   │   ├── adam_classic.yaml
│   │   │   │   ├── adamscale.yaml
│   │   │   │   ├── agd.yaml
│   │   │   │   ├── lion.yaml
│   │   │   │   ├── radam.yaml
│   │   │   │   ├── sgd.yaml
│   │   │   │   └── shampoo.yaml
│   │   │   └── optim_mod/
│   │   │       ├── disabled.yaml
│   │   │       ├── larc.yaml
│   │   │       ├── lars.yaml
│   │   │       ├── progressive.yaml
│   │   │       └── sam.yaml
│   │   └── wandb/
│   │       ├── default.yaml
│   │       └── none.yaml
│   ├── data/
│   │   ├── __init__.py
│   │   ├── arithmetic_tokenizers.py
│   │   ├── curriculum_sorting.py
│   │   ├── deduplicate.py
│   │   ├── pretraining_preparation.py
│   │   ├── tokenizer_preparation.py
│   │   └── utils.py
│   └── utils.py
├── create_data_split.py
├── create_pos_or_variants.py
├── dataset_analysis.py
├── gen_eval_script.py
├── load_local_model.py
├── pretrain.py
├── pretty_plotter.py
├── pretty_plotter_big.py
├── pretty_plotter_sort.py
├── pyproject.toml
├── setup.cfg
├── shells/
│   ├── addition_ff.sh
│   ├── addition_lt.sh
│   ├── bitwise_or.sh
│   ├── evaluation.sh
│   ├── generate_and_tokenize_data.sh
│   ├── multiplication.sh
│   └── sorting.sh
├── sort_eval.py
└── upload_processed_dataset.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
outputs
tables/*/*.csv
tables/*/*.csv#
tables/*.csv
tables/*.csv#
tables/*.ods
*.png
*.pdf

# torchdynamo debug
isolate
repro.py

checkpoints
wandb-metadata.json

torch_compile_debug/

dedup

.vs/

*.pdf
images

*.temp.sh

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

*.csv
*.txt
*.pth

cramming-data/
sanity.sh
log/
del.sh
del.py
sort_plots/

================================================
FILE: .pre-commit-config.yaml
================================================
# precommit hooks from https://github.com/ashleve/lightning-hydra-template
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.4.0
    hooks:
      # list of supported hooks: https://pre-commit.com/hooks.html
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
      - id: debug-statements
      - id: detect-private-key

  # python code formatting
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
        args: [--line-length, "140", "--fast"] # ;>

  # yaml formatting
  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v2.3.0
    hooks:
      - id: prettier
        types: [yaml]

  # python code analysis
  - repo: https://github.com/PyCQA/flake8
    rev: 4.0.1
    hooks:
      - id: flake8


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 Sean McLeish, Jonas Geiping

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: MANIFEST.in
================================================
# added by check-manifest
include *.py
include *.yaml
recursive-include cramming *.md
recursive-include cramming *.yaml
global-exclude *.pyc
global-exclude __pycache__


================================================
FILE: README.md
================================================
# Transformers Can Do Arithmetic with the Right Embeddings! [Link to arXiv paper](https://arxiv.org/abs/2405.17399)

A joint project by: Sean McLeish, Arpit Bansal, Alex Stein,  Neel Jain, John Kirchenbauer, Brian R. Bartoldson, Bhavya Kailkhura, Abhinav Bhatele, Jonas Geiping, Avi Schwarzschild and Tom Goldstein


This repository contains code to replicate our research. It is a fork of the language model training framework [cramming](https://github.com/JonasGeiping/cramming), edited for a next-token prediction objective.

We provide a standalone implementation of Abacus Embeddings in [abacus.py](abacus.py).

## Citing Our Work
To cite our work, please use this bibtex.
```
@article{mcleish2024transformers,
    title={Transformers Can Do Arithmetic with the Right Embeddings}, 
    author={Sean McLeish and Arpit Bansal and Alex Stein and Neel Jain and John Kirchenbauer and Brian R. Bartoldson and Bhavya Kailkhura and Abhinav Bhatele and Jonas Geiping and Avi Schwarzschild and Tom Goldstein},
    journal={arXiv preprint arXiv:2405.17399},
    year={2024}
}
```

# Getting Started
We developed with Python 3.10.4. To install, run:
```
git clone git@github.com:mcleish7/arithmetic.git
cd arithmetic
pip install .
```

On some machines you will need to run:
1. `pip install multiprocess -U`
2. `pip install dill -U`
3. `pip install apache-beam -U`

# Arithmetic
## Datasets
We release our datasets on [Google Drive](https://drive.google.com/drive/folders/1DqjCrUM1cNV7069Zl25_qBw2Px2xAw9j?usp=sharing) both in zipped format. We recommend you work with the zipped version until it is correctly placed in your file system.

Alternatively, you can make your own datasets using [create_data_split.py](create_data_split.py) using the commands from [shells/generate_and_tokenize_data.sh](shells/generate_and_tokenize_data.sh).

## File Structure
We recommend creating another directory `cramming-data` inside of arithmetic. This is where the models, logs and data will be stored.

You can either export your cramming base directory path in your `.bashrc` or replace `$cramming_base_dir` manually in the provided shells.
```
cd arithmetic
mkdir cramming-data
echo 'export cramming_base_dir=MY_BASE_DIR' >> ~/.bashrc
source ~/.bashrc
```
For example, this may look like: `echo 'export cramming_base_dir=~/arithmetic/cramming-data' >> ~/.bashrc`

For example our file system looks like:
```
cramming-generative
└── cramming-data
    ├── addition-train-one
    │    ├── pretrain/<DATE>/<TIME>
    │    │    ├── .hydra
    │    │    │   ├── config.yaml
    │    │    │   ├── hydra.yaml
    │    │    │   └── overrides.yaml
    │    │    └── addition-train-one_pretrain.log
    │    ├── checkpoints/FINAL_<LOSS_VAL>
    │    │    ├── model_config.json
    │    │    ├── model.safetensors
    │    │    └── state_dict.pth
    │    └── downstream
    └── data
        └── arithmetic_data
            ├── +_grid_eval_dataset_reverse_all_tokenized
            └── ... other datasets ...
```

## Training
Example commands are in the [shells](shells) directory, organised by task.

### Explanation of Some Commands
1. Give samples instead of tokens equal importance in loss: `arch.loss_reduction=none`
2. Divide the gradients in the recurrent block by the number of recurrences: `arch.throttle=True`
3. Mask before the equals sign: `arch.mask_before_equals=True`
4. Skip connections inside of the recurrent block: `arch.forward_only_model_with_skip=True`
5. Multi-GPU: `python` -> `torchrun --nproc_per_node=<NUM GPUS> --standalone ` and add `impl.fullgraph=false`

### Positional Embeddings:
#### Absolute
1. Learned: `arch.embedding.pos_embedding=learned`
2. Abacus: `arch.embedding.pos_embedding=abacus`
* If you want the maximum k in abacus to be larger, set `arch.embedding.max_abacus_len` (e.g. `arch.embedding.max_abacus_len=100`); by default this value is 100. Abacus is also implemented in a standalone manner in [abacus.py](abacus.py).

#### Relative
1. NoPE: `arch.embedding.pos_embedding=None`
2. FIRE: `arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"`
3. FIRE randomised: e.g:`arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" arch.attention.max_length=128` by default `arch.attention.max_length=0` so setting this longer than the max sequence length gives some randomness in the embedding.
4. RoPE: `arch.attention.type="self-attention" arch.attention.rotary_embedding=true`

### Checkpointing
We have implemented *single* GPU training checkpointing, to do this use:
`impl.save_every_n_minutes=60 impl.save_intermediate_model_name='last'`
This saves a checkpoint every 60 minutes under the name 'last'

Caution: This feature is not fully tested for multi-GPU cases. We also cannot currently train models which have used their full budget for longer.

### WandB
You can log runs to your weights&biases account. To do so, simply modify `wandb.entity` and `wandb.project` on the command line or at [cramming/config/wandb/default.yaml](cramming/config/wandb/default.yaml).

## Testing
We show examples in [shells/evaluation.sh](shells/evaluation.sh). 

We provide a very basic automation in [gen_eval_script.py](gen_eval_script.py). This prints the basic commands; you may need to edit these further.

### Addition
For addition we have a very large possible evaluation set, we do a grid search over a 100x100 grid which we split into 20 pieces with the aim of balancing the number of forward calls across all 20 pieces.
We then have a further eval for operand lengths 100->160.

### Multiplication
We only evaluate up to 25x25, which we do in a single job.

### Sorting
Sorting uses a separate evaluation file [sort_eval.py](sort_eval.py), this is because the evaluation calls cannot be parallelised, making evaluation much longer.
The evaluation cannot be parallelised because the place of the equals sign is not fixed for a batch.
We currently evaluate across 30 jobs for a 30x30 grid but this can be reduced to a smaller number of jobs using these flags: `max_size_given, start_ind_1_given, start_ind_2_given`

### Bitwise OR
We use the same framework as for addition but the process is quicker as some of the batches do not contain 100 samples as there are not 100 possibilities for some batches. Unlike addition we do not sample with replacement for this task.

# Analysis
1. We provide [pretty_plotter.py](pretty_plotter.py) to combine the small evaluation grids together into one plot.
Use this by putting the model name into the string at the top of the `main` function.
2. For the large 100x100 grids we provide [pretty_plotter_big.py](pretty_plotter_big.py).
These are designed to be as flexible as possible but may need to be edited to fit your file set up.
3. For sorting, we provide [pretty_plotter_sort.py](pretty_plotter_sort.py), this allows us to read the individual `.txt` files created during testing and merge them all together into a nice plot.

# Contact
Please, feel free to contact us with any questions, or open an issue on Github.

================================================
FILE: abacus.py
================================================
"""Implementation of abacus embeddings"""
import random

import torch

# Example of how to extract digit tokens to pass into constructor
# digit_tokens = tokenizer.convert_tokens_to_ids(['0','1','2','3','4','5','6','7','8','9'])

class Abacus(torch.nn.Module):
    """
    Abacus Embeddings, learned embeddings reused for each digit.

    Every run of consecutive digit tokens is numbered 1, 2, 3, ... starting at its
    first token, and that number is looked up in a learned embedding table.
    Non-digit tokens are looked up at index 0.
    Integers must be reversed for this to work correctly.
    Transformers Can Do Arithmetic with the Right Embeddings, McLeish et al. (2024)
    """
    def __init__(self, digit_tokens, embedding_dim, max_seq_length=1024, max_k=99):
        """
        digit_tokens (list): list of the tokens for each of the 10 digits, `digit_tokens = tokenizer.convert_tokens_to_ids(['0','1','2','3','4','5','6','7','8','9'])`
        embedding_dim (int): dimension to embed into
        max_seq_length (int): maximum number of embeddings that can be trained,
            i.e. the number of rows of the embedding table; the largest index
            produced in `forward` (position in a digit run plus the random
            shift) has to stay below this value
        max_k (int): maximum k value which we randomly shift by during training
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(max_seq_length, embedding_dim)
        # Non-persistent buffer: follows the module across devices but is not
        # written into the state dict.
        self.register_buffer("digits", torch.tensor(digit_tokens), persistent=False)

        self.max_k = max_k

    def helper(self, mask, device):
        """
        Converts a binary mask of digit locations into spans of consecutive digits

        mask (tensor): 2D mask (one row per sample) that is set where the token
            is a digit; `forward` passes the boolean output of `torch.isin`
        device: device on which the intermediate tensors are created
        returns (tensor): integer tensor shaped like `mask` holding the 1-based
            position of every digit inside its run and 0 everywhere else
        """
        batch_size, seq_len = mask.shape

        # Create a shifted version of the mask to detect changes from 0 to 1
        left_pad = torch.zeros((batch_size, 1), device=device, dtype=mask.dtype)
        shifted_mask = torch.cat([left_pad, mask[:, :-1]], dim=1)
        starts = (shifted_mask != mask) & mask

        # Generate IDs for each segment of 1s, processing row-wise
        # (id 0 is everything before the first run, id s is the s-th run and
        # whatever follows it up to the next run)
        segment_ids = torch.cumsum(starts, dim=1)

        # Generate an index array row-wise; it is created directly on the target
        # device and broadcast instead of being built on the CPU and copied over
        index = torch.arange(seq_len, device=device).expand(batch_size, seq_len)

        # Reset index at the start of each segment: slot s of every row receives
        # the column at which run s begins (only start columns contribute)
        reset_index = torch.zeros_like(mask).long()
        second_term = index * starts.long()
        reset_index = reset_index.scatter_add(1, segment_ids, second_term)

        # Calculate positions in segment
        positions = index - reset_index.gather(1, segment_ids) + 1

        # Ensure only values within 1-segments are non-zero
        return positions * mask

    def forward(self, input_ids):
        """
        input_ids (tensor): a batch of inputs, each row is a sample
        returns (tensor): the embedding of every token's abacus position, with
            one trailing dimension of size `embedding_dim` added to `input_ids`
        """
        mask = torch.isin(input_ids, self.digits)
        output = self.helper(mask, input_ids.device)

        if self.training:
            # One random shift per forward call, applied to digit positions only
            k = random.randint(0, self.max_k)
            output[output>0] += k # as we already have ones in the tensor, the tensor values will be k+1

        return self.embedding(output)


================================================
FILE: arithmetic_eval_quicker.py
================================================
import logging
import hydra
from omegaconf import OmegaConf
import cramming
import torch
from safetensors.torch import load_file
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np
import re
import pandas as pd
import datasets
import os
from typing import List, Dict
from cramming.data.tokenizer_preparation import get_tokenizer
import random

log = logging.getLogger(__name__)

def grid_plotter(data, type="accs", name='_large', extra_path=None):
    """Plot a 2d accuracy grid as a heatmap and save it to disk.

    Args:
        data: 2D array-like of values; they are multiplied by 100 before plotting.
        type: first part of the output file name (the parameter name shadows
            the builtin but is kept because callers may pass it by keyword).
        name: second part of the output file name.
        extra_path: optional prefix that is prepended verbatim to the file
            name (no path separator is inserted).

    The figure is written to ``{extra_path}{type}{name}_grid_plot`` and is
    always closed afterwards, so repeated calls do not accumulate open
    matplotlib figures.
    """
    percentages = np.array(data) * 100
    frame = pd.DataFrame(percentages)

    # Create the heatmap on a figure we own so that it can be released below
    fig = plt.figure(figsize=(10, 8))
    try:
        # NOTE(review): `annot` is not enabled here, so `fmt`/`annot_kws` look
        # like they have no visible effect - confirm whether annotations were intended.
        sns.heatmap(frame, cmap="YlGnBu", fmt=".1f", annot_kws={'size': 8, 'rotation': 0})

        # Customize the plot
        plt.title("Accuracy - percetange, rounded to 1dp")
        plt.ylabel("1st Number Length")
        plt.xlabel("2nd Number Length")

        # Ticks sit in the middle of each cell and are labelled 1..size
        size = percentages.shape[0]
        tick_positions = np.arange(0.5, size + 0.5, 1)
        tick_labels = np.arange(1, size + 1, 1)
        plt.xticks(tick_positions, labels=tick_labels)
        plt.yticks(tick_positions, labels=tick_labels)

        if extra_path is not None:
            target = f"{extra_path}{type}{name}_grid_plot"
        else:
            target = f"{type}{name}_grid_plot"
        plt.savefig(target, bbox_inches='tight')
    finally:
        # plt.clf() would only clear the figure and keep it registered with
        # pyplot; closing it frees the memory even when plotting or saving fails.
        plt.close(fig)

def index_hints_helper(num, tokenizer):
    """Interleave index-hint tokens with a batch of tokenized numbers.

    Column ``i`` of ``num`` is preceded by a column filled with the id of
    ``tokenizer.char_set[i]``, so an input with ``w`` columns comes back with
    ``2 * w`` columns laid out as hint, token, hint, token, ...
    The dtype and device of ``num`` are kept.
    """
    alphabet = tokenizer.char_set
    rows, width = num.shape[0], num.shape[1]

    pieces = []
    for col in range(width):
        hint_id = tokenizer._convert_token_to_id(alphabet[col])
        # one column holding the hint id for every row of the batch
        pieces.append(hint_id * torch.ones((rows, 1), dtype=num.dtype, device=num.device))
        pieces.append(num[:, col:col + 1])

    if not pieces:
        # nothing to insert for an input without columns
        return num
    return torch.cat(pieces, dim=1)

def grid_logic(cfg):
    """Select which part of the 2d evaluation grid this run is responsible for.

    The boolean flags on ``cfg`` are examined in a fixed order (ood_only,
    up_to_40, up_to_50, then big_eval_step_1 ... big_eval_step_10) and every
    true flag overwrites the previous selection, so the last true flag wins.
    With no flag set the default '_large' selection is returned.

    ``cfg.checkerboard`` (None, 'even' or 'odd') additionally restricts the
    big-eval steps to cells whose two sizes sum to an even/odd number; any
    other value prints a message and exits.

    Returns:
        (logic_func, name, max_size) where ``logic_func(data_size_1, data_size_2)``
        says whether a grid cell belongs to this run, ``name`` is the suffix
        string for the selection and ``max_size`` is the bound paired with it
        (largest size + 1 for the small grids, 100 + 1 for every big-eval step).
    """

    def make_band(lower, upper):
        # cell is selected when either size reaches `lower` and either size is within `upper`
        def in_band(data_size_1, data_size_2):
            return (data_size_1 >= lower or data_size_2 >= lower) and (data_size_1 <= upper or data_size_2 <= upper)
        return in_band

    # original testing region, used when no other flag is set
    def default_region(data_size_1, data_size_2):
        return (data_size_1 <= 23 or data_size_2 <= 23)

    logic_func, name, max_size = default_region, '_large', 23 + 1

    # (flag, name, lower bound, upper bound); later true entries override earlier ones
    small_grids = (
        (cfg.ood_only, '_ood_only', 24, 30),
        (cfg.up_to_40, '_up_to_40', 31, 40),
        (cfg.up_to_50, '_up_to_50', 41, 50),
    )
    for enabled, label, lower, upper in small_grids:
        if enabled:
            logic_func, name, max_size = make_band(lower, upper), label, upper + 1

    # checkerboarding: for the large eval we can checkerboard
    board = cfg.checkerboard
    if board is None:
        wanted_remainder, checkerboard_str = None, ""
    elif board == 'even':
        wanted_remainder, checkerboard_str = 0, "_even"
    elif board == 'odd':
        wanted_remainder, checkerboard_str = 1, "_odd"
    else:
        print("checkerboard config not allowed")
        exit()

    def checkerboard_func(data_size_1, data_size_2):
        # without checkerboarding every cell passes
        if wanted_remainder is None:
            return True
        return ((data_size_1 + data_size_2) % 2 == wanted_remainder)

    def make_big_step(lower, upper):
        # both sizes within `upper`; when `lower` is given at least one size has to reach it
        def in_step(data_size_1, data_size_2):
            if lower is not None and not (data_size_1 >= lower or data_size_2 >= lower):
                return False
            return (data_size_1 <= upper and data_size_2 <= upper) and checkerboard_func(data_size_1, data_size_2)
        return in_step

    # if we are testing up to 100, split into 10 steps each of approximately equal number of forward passes required
    big_steps = (
        (cfg.big_eval_step_1, '_big_eval_1', None, 46),  # 1 -> 46
        (cfg.big_eval_step_2, '_big_eval_2', 47, 58),
        (cfg.big_eval_step_3, '_big_eval_3', 59, 67),
        (cfg.big_eval_step_4, '_big_eval_4', 68, 74),
        (cfg.big_eval_step_5, '_big_eval_5', 75, 80),
        (cfg.big_eval_step_6, '_big_eval_6', 81, 85),
        (cfg.big_eval_step_7, '_big_eval_7', 86, 90),
        (cfg.big_eval_step_8, '_big_eval_8', 91, 94),
        (cfg.big_eval_step_9, '_big_eval_9', 95, 97),
        (cfg.big_eval_step_10, '_big_eval_10', 98, 100),
    )
    for enabled, label, lower, upper in big_steps:
        if enabled:
            logic_func = make_big_step(lower, upper)
            name = label + checkerboard_str
            max_size = 100 + 1

    # report every flag in precedence order
    log.info(f"large = {cfg.large}")
    log.info(f"ood only = {cfg.ood_only}")
    log.info(f"up to 40 = {cfg.up_to_40}")
    log.info(f"up to 50 = {cfg.up_to_50}")
    log.info(f"big eval 1 = {cfg.big_eval_step_1}")
    log.info(f"big eval 2 = {cfg.big_eval_step_2}")
    log.info(f"big eval 3 = {cfg.big_eval_step_3}")
    log.info(f"big eval 4 = {cfg.big_eval_step_4}")
    log.info(f"big eval 5 = {cfg.big_eval_step_5}")
    log.info(f"big eval 6 = {cfg.big_eval_step_6}")
    log.info(f"big eval 7 = {cfg.big_eval_step_7}")
    log.info(f"big eval 8 = {cfg.big_eval_step_8}")
    log.info(f"big eval 9 = {cfg.big_eval_step_9}")
    log.info(f"big eval 10 = {cfg.big_eval_step_10}")
    log.info(f"the last true value in the above list will be run, mul and pos arith can take control after this")

    return logic_func, name, max_size

def main(cfg):
    """Evaluate a pretrained checkpoint on the arithmetic eval datasets.

    Two modes, selected by ``cfg.extended_eval``:

    * grid eval (default): loops over every ``(data_size_1, data_size_2)`` pair
      below ``max_size`` that ``logic_func`` accepts, generates answers with the
      model and records exact-match accuracy in ``acc_grid``.
    * extended eval: evaluates square problems of the sizes selected by the
      first number found in ``name`` and stores the accuracies in a dict.

    Args:
        cfg: hydra config. Fields read here include ``base_dir``, ``name``,
            ``eval``, ``max_rec``, ``mul``, ``pos_arth``, ``pos_arth_ood``,
            ``data.sources``, ``greedy``, ``temp``, ``max_size_given``,
            ``start_ind_1_given``, ``start_ind_2_given``, ``extended_eval``,
            ``reverse_inputs`` and ``remove_padding``.

    Side effects:
        Writes ``outputs/+_n_{n}_m_{m}.json``, ``accs_grid_quick{name}.json``
        (and a resumable copy two directories up for "big_eval" runs) or
        ``over_100_{number}.json`` relative to the current working directory.
        Calls ``exit()`` when the data provider is not "arithmetic" or the
        extended-eval number is unsupported.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    local_checkpoint_folder = os.path.join(cfg.base_dir, cfg.name, "checkpoints")
    tokenizer, cfg_arch, model_file = cramming.utils.find_pretrained_checkpoint(cfg.eval.checkpoint,
                                                                                local_checkpoint_folder,
                                                                                cfg.eval.arch_modifications)
    if cfg.max_rec is not None: # can have more/less recurrences for eval
        cfg_arch.maximal_recurrence_in_eval = cfg.max_rec
    else:
        cfg_arch.maximal_recurrence_in_eval = cfg_arch.maximal_recurrence
    log.info(f"cfg_arch.maximal_recurrence_in_eval changed to {cfg_arch.maximal_recurrence_in_eval}")
    cfg_arch.throttle = False # turn throttle off

    logic_func, name, max_size = grid_logic(cfg)

    # The flags below override whatever grid_logic selected; the last true one wins.
    if cfg.mul: # multiplication
        def logic_func_for_mul(data_size_1, data_size_2):
            return (data_size_1 <= 25 or data_size_2 <= 25)
        logic_func = logic_func_for_mul
        name = '_large'
        max_size = 25+1
    log.info(f"mul = {cfg.mul}")

    if cfg.pos_arth: # bitwise OR
        def logic_func_for_pos(data_size_1, data_size_2):
            return (data_size_1 <= 25 or data_size_2 <= 25)
        logic_func = logic_func_for_pos
        name = '_large'
        max_size = 25+1
    log.info(f"pos_arth = {cfg.pos_arth}")

    if cfg.pos_arth_ood:
        def logic_func_for_pos_ood(data_size_1, data_size_2):
            return (data_size_1 >= 26 or data_size_2 >=26) and (data_size_1 <=40 and data_size_2 <=40)
        logic_func = logic_func_for_pos_ood
        name = '_ood_only'
        max_size = 40+1
    log.info(f"pos_arth_ood = {cfg.pos_arth_ood}")

    # import tokeniser (replaces the tokenizer returned with the checkpoint)
    cfg_data_sources_values_list = list(cfg.data.sources.values())[0]
    if cfg_data_sources_values_list["provider"] == "arithmetic":
        tokenizer = get_tokenizer(cfg_data_sources_values_list["tokenizer_type"])
    else:
        log.info("exiting as this is only for arithmetic")
        exit()
    vocab = tokenizer.ids_to_tokens
    EOS_token = tokenizer._convert_token_to_id(tokenizer.eos_token)
    PAD_token = tokenizer._convert_token_to_id(tokenizer.pad_token)
    assert PAD_token == 0, "PAD token must be token zero for our code to work"

    # Load model
    if 'alpha' not in cfg_arch:
        cfg_arch['alpha'] = 1.0
    model = cramming.construct_model(cfg_arch, tokenizer).to(device)
    model = cramming.backend.load_model_checkpoint(model, model_file)
    model.to(device)
    model.eval()

    log.info(f"greedy = {cfg.greedy}, note: if greedy = True this overrides any temperature arguments")
    ## Greedy decoding will overide any temperature arguments

    if cfg.max_size_given is not None: # allows unique splits for eval
        # Fixed: the original read the undefined bare name `max_size_given` (NameError).
        max_size = cfg.max_size_given

    # Grid plots - grid search over operand lengths 1 .. max_size-1 in both dimensions
    data_sizes = list(range(1, max_size))
    acc_grid = np.zeros((len(data_sizes),len(data_sizes)))
    start_ind_1 = 0
    start_ind_2 = 0
    tuple_method = False
    completed_one = False
    if "big_eval" in name:
        tuple_method = True
        # go up two layers and search for grid
        try:
            with open(f"../../accs_grid_quick{name}.json", 'r') as file:
                data = json.load(file)
            start_ind_1 = data[1]
            start_ind_2 = data[2]
            acc_grid = np.array(data[0])
            log.info("loaded grid from previous run")
        except Exception as err:
            # Best-effort resume: a missing or malformed file just means we start fresh.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            log.info(f"no previous grid loaded ({type(err).__name__}: {err})")

    if cfg.start_ind_1_given is not None: # allows unique splits for eval
        start_ind_1 = cfg.start_ind_1_given
    if cfg.start_ind_2_given is not None:
        start_ind_2 = cfg.start_ind_2_given
    log.info(f"start_ind_1 = {start_ind_1}, start_ind_2 = {start_ind_2}")

    os.makedirs("outputs", exist_ok=True)

    if not cfg.extended_eval:
        # main 2d loop
        for data_size_1 in data_sizes:
            for data_size_2 in data_sizes:
                # Skip cells before the resume point until the first cell has been evaluated.
                if (data_size_1 < start_ind_1 or data_size_2 < start_ind_2) and not completed_one:
                    continue
                else:
                    proceed = False
                    # if both data sizes are less than the start indices, then dont proceed
                    # but if one of them is greater than the start indices, then proceed
                    if data_size_1 >= start_ind_1 or data_size_2 >= start_ind_2:
                        proceed = True

                    if not proceed:
                        continue

                print(f"evaluating for {data_size_1} and {data_size_2}")

                if logic_func(data_size_1, data_size_2):
                    completed_one = True
                    log.info(f"Starting iteration in grid eval for size: {data_size_1} and {data_size_2}")
                    correct_total = 0

                    # get the correct dataset, these names may need to be changed if you make new datasets
                    file_path = f"../../../../data/arithmetic_data/+_grid_eval_dataset_padded_tokenized/+_n_{data_size_1}_m_{data_size_2}_examples_100_diff_lens_seed_42/hf_tokenized_dataset"
                    if cfg.reverse_inputs:
                        file_path = f"../../../../data/arithmetic_data/+_grid_eval_dataset_reverse_all_tokenized/+_n_{data_size_1}_m_{data_size_2}_examples_100_diff_lens_seed_42/hf_tokenized_dataset"
                    if cfg.mul:
                        file_path = f"../../../../data/arithmetic_data/x_grid_eval_dataset_2_reverse_all_tokenized/x_n_{data_size_1}_m_{data_size_2}_examples_100_diff_lens_exact_seed_91/hf_tokenized_dataset"
                    if cfg.pos_arth or cfg.pos_arth_ood:
                        file_path = f"../../../../data/arithmetic_data/pos_or_one_vec_zeros_eval/or_one_vec_zeros_{data_size_1}_{data_size_2}/hf_tokenized_dataset"
                    tokenized_dataset = datasets.load_from_disk(file_path)["test"]
                    data_loader = torch.utils.data.DataLoader(tokenized_dataset, batch_size=100, shuffle=False)
                    # number of leading positions that form the prompt; the rest is the answer
                    equals_tensor = data_size_1+data_size_2+6
                    if cfg.pos_arth or cfg.pos_arth_ood:
                        equals_tensor = data_size_1+data_size_2+2

                    for batch in data_loader:
                        # split prompt and answer; batch["input_ids"] is a list of per-position
                        # tensors, so stack + transpose yields [batch, positions]
                        tokenized_prompts = batch["input_ids"][:equals_tensor]
                        tokenized_prompts = torch.stack(tokenized_prompts).to(device)
                        tokenized_prompts = torch.transpose(tokenized_prompts, 0, 1)
                        tokenized_answers = batch["input_ids"][equals_tensor:]
                        tokenized_answers = torch.stack(tokenized_answers).to(device)
                        tokenized_answers = torch.transpose(tokenized_answers, 0, 1)

                        if cfg.remove_padding and (cfg_data_sources_values_list["tokenizer_type"] != "index"):
                            # removes the padding from the eval data
                            num1 = tokenized_prompts[:,:data_size_1]
                            op = tokenized_prompts[:,data_size_1+1:data_size_1+2]
                            num2 = tokenized_prompts[:,data_size_1+3:data_size_1+data_size_2+3]
                            equals = tokenized_prompts[:,data_size_1+data_size_2+4:data_size_1+data_size_2+5]
                            tokenized_prompts = torch.cat((num1, op, num2, equals), dim=1)

                        if cfg_data_sources_values_list["tokenizer_type"] == "index":
                            # adding in the index hints to the input numbers
                            num1 = tokenized_prompts[:,:data_size_1]
                            num1 = index_hints_helper(num1, tokenizer)
                            op = tokenized_prompts[:,data_size_1+1:data_size_1+2]
                            num2 = tokenized_prompts[:,data_size_1+3:data_size_1+data_size_2+3]
                            num2 = index_hints_helper(num2, tokenizer)
                            equals = tokenized_prompts[:,data_size_1+data_size_2+4:data_size_1+data_size_2+5]
                            tokenized_prompts = torch.cat((num1, op, num2, equals), dim=1)

                            predicted_ids = None

                            ## below inserts the characters for the model, we decided against this in the end
                            # generate up to twice the answer length so that tokens dropped by the
                            # filter below do not shorten the answer
                            predicted_ids = model._generate(tokenized_prompts, token_limit=(tokenized_answers.shape[1]*2), temperature=cfg.temp, steps_at_generation_time=cfg_arch.maximal_recurrence_in_eval, greedy=cfg.greedy, quick=True)
                            predicted_ids = torch.transpose(predicted_ids, 0, 1)

                            new_tensor = torch.zeros_like(predicted_ids)
                            for i in range(predicted_ids.size(0)): # inefficient!!
                                # Filter out values greater than 17
                                # (presumably the index-hint token ids -- TODO confirm against the tokenizer)
                                filtered_values = predicted_ids[i][predicted_ids[i] <= 17]
                                # Place filtered values in new tensor and pad with zeros
                                new_tensor[i, :len(filtered_values)] = filtered_values

                            predicted_ids = new_tensor[:, :tokenized_answers.shape[1]] # trim off the excess
                            predicted_ids = torch.transpose(predicted_ids, 0, 1)

                        else:
                            predicted_ids = model._generate(tokenized_prompts, token_limit=tokenized_answers.shape[1], temperature=cfg.temp, steps_at_generation_time=cfg_arch.maximal_recurrence_in_eval, greedy=cfg.greedy, quick=True)

                        if len(predicted_ids.shape) > 1: # i.e. we have a batch of more than one
                            predicted_ids = torch.transpose(predicted_ids, 0, 1)
                        else:
                            predicted_ids = predicted_ids.reshape((1,-1)) # add a batch dim otherwise

                    # NOTE(review): scoring runs once after the batch loop, i.e. on the last batch
                    # only. The loader uses batch_size=100 and the dataset paths say "examples_100",
                    # so this is presumably a single batch -- confirm before changing dataset size.
                    # ignore everything after EOS on eval by replacing all after EOS with PAD
                    eval_tensor = predicted_ids.clone()
                    input_tensor_EOS = (eval_tensor == EOS_token).int()
                    indices_of_EOS = torch.argmax(input_tensor_EOS, dim=1)
                    mask = torch.arange(eval_tensor.size(1)).to(device) > indices_of_EOS[:, None]
                    eval_tensor[mask] = PAD_token

                    # compare eval tensor to correct outputs
                    elementwise_equal = torch.eq(eval_tensor, tokenized_answers)
                    rows_equal = torch.all(elementwise_equal, dim=1)
                    num_equal_rows = torch.sum(rows_equal).item()
                    correct_total += (num_equal_rows/tokenized_prompts.shape[0])
                    log.info(f"accuracy for {data_size_1}, {data_size_2}: {num_equal_rows} = {correct_total*100}%")

                    # combine the prompts and outputs
                    complete_lines = torch.cat((tokenized_prompts,predicted_ids), dim=1)
                    tokens_list = complete_lines.tolist()
                    decoded_batch = list(map(lambda seq: list(map(lambda token: vocab[token], seq)), tokens_list)) # map token ids to tokens
                    log.info(f"example for {data_size_1}, {data_size_2}: {decoded_batch[0]}")
                    # save the answers down so we don't eval twice ever
                    with open(f"outputs/+_n_{data_size_1}_m_{data_size_2}.json", 'w') as json_file:
                        json.dump(decoded_batch, json_file)

                    acc_grid[(data_size_1-1),(data_size_2-1)] = correct_total

                    if tuple_method:
                        # checkpoint (grid, last sizes) so an interrupted big eval can resume
                        with open(f"../../accs_grid_quick{name}.json", "w") as file:
                            tuple_to_save = (acc_grid.tolist(),data_size_1,data_size_2)
                            json.dump(tuple_to_save, file)

        log.info(f"acc grid: {acc_grid}")

        with open(f"accs_grid_quick{name}.json", "w") as file:
            json.dump(acc_grid.tolist(), file)

        # Grid plots - one for accs one for contains
        grid_plotter(acc_grid, name=name)

    if cfg.extended_eval:
        # extended eval to eval large numbers easily, uses the big eval step number to split the work into multiple parts

        number = int(re.findall(r'\d+', name)[0])
        log.info("starting extended eval")
        # this is hard coded for reverse all, addition past 100x100 grid, removing the padding

        accs = dict()
        batch_size_extended_eval = 100

        # look for results of an earlier run anywhere below ../.. (the last match wins)
        old_data_path = None
        for root, dirs, files in os.walk("../.."):
            if f"over_100_{number}.json" in files:
                old_data_path = os.path.join(root, f"over_100_{number}.json")

        if number == 1:
            start = 101
            list_to_do = range(start,161)
        elif number == 2:
            list_to_do = [1000, 800]
        elif number == 3:
            list_to_do = [200, 700, 900]
        elif number == 4:
            list_to_do = [300, 400, 500, 600]
        else:
            print("number too high")
            exit()

        if old_data_path is not None: # read the old accs dict and don't repeat what we have already done
            with open(old_data_path, 'r') as file:
                data = json.load(file)
            accs = {int(k): v for k, v in data.items()}
            to_do = set(list_to_do).difference(set(accs.keys()))
            list_to_do = list(to_do)

        log.info(f"In extended eval with number {number}")

        for data_size in list_to_do:
            log.info(f"Extended eval {data_size}")
            correct_total = 0
            file_path = f"../../../../data/arithmetic_data/+_grid_eval_dataset_reverse_all_tokenized_over_100/+_n_{data_size}_m_{data_size}_examples_100_diff_lens_exact_seed_42/hf_tokenized_dataset"
            tokenized_dataset = datasets.load_from_disk(file_path)["test"]
            data_loader = torch.utils.data.DataLoader(tokenized_dataset, batch_size=batch_size_extended_eval, shuffle=False)
            equals_tensor = data_size+data_size+6

            for batch in data_loader:
                # get prompt and answer
                tokenized_prompts = batch["input_ids"][:equals_tensor]
                tokenized_prompts = torch.stack(tokenized_prompts).to(device)
                tokenized_prompts = torch.transpose(tokenized_prompts, 0, 1)
                tokenized_answers = batch["input_ids"][equals_tensor:]
                tokenized_answers = torch.stack(tokenized_answers).to(device)
                tokenized_answers = torch.transpose(tokenized_answers, 0, 1)

                # remove the padding
                num1 = tokenized_prompts[:,:data_size]
                op = tokenized_prompts[:,data_size+1:data_size+2]
                num2 = tokenized_prompts[:,data_size+3:data_size+data_size+3]
                equals = tokenized_prompts[:,data_size+data_size+4:data_size+data_size+5]
                tokenized_prompts = torch.cat((num1, op, num2, equals), dim=1)

                # get the output from the model
                predicted_ids = model._generate(tokenized_prompts, token_limit=tokenized_answers.shape[1], temperature=cfg.temp, steps_at_generation_time=cfg_arch.maximal_recurrence_in_eval, greedy=cfg.greedy, quick=True)
                predicted_ids = torch.transpose(predicted_ids, 0, 1) # swap the two axes so they line up with tokenized_answers

                # replace everything after the first EOS with PAD before comparing
                eval_tensor = predicted_ids.clone()
                input_tensor_EOS = (eval_tensor == EOS_token).int()
                indices_of_EOS = torch.argmax(input_tensor_EOS, dim=1)
                mask = torch.arange(eval_tensor.size(1)).to(device) > indices_of_EOS[:, None]
                eval_tensor[mask] = PAD_token
                elementwise_equal = torch.eq(eval_tensor, tokenized_answers)

                rows_equal = torch.all(elementwise_equal, dim=1)
                num_equal_rows = torch.sum(rows_equal).item()
                correct_total += (num_equal_rows/tokenized_prompts.shape[0])
                log.info(f"accuracy for {data_size}, {data_size}: {num_equal_rows} = {correct_total*100}%")

                # combine the prompts and outputs
                complete_lines = torch.cat((tokenized_prompts,predicted_ids), dim=1)
                tokens_list = complete_lines.tolist()
                decoded_batch = list(map(lambda seq: list(map(lambda token: vocab[token], seq)), tokens_list)) # map token ids to tokens
                log.info(f"example for {data_size}, {data_size}: {decoded_batch[0]}")

            # persist after every size so a crash does not lose finished work
            accs[data_size] = correct_total
            with open(f"over_100_{number}.json", 'w') as json_file:
                json.dump(accs, json_file)

    log.info("Eval complete")

@hydra.main(version_base="1.3", config_path="cramming/config", config_name="cfg_eval")
def launch(cfg):
    """Hydra entry point: prepare the config, log it, then run ``main``.

    The incoming config is passed through ``cramming.utils.pathfinder`` and the
    result is dumped (with interpolations resolved) to the log before evaluation.
    """
    log.info("calling main launch")
    prepared_cfg = cramming.utils.pathfinder(cfg)
    rendered_cfg = OmegaConf.to_yaml(prepared_cfg, resolve=True)
    log.info(rendered_cfg)
    main(prepared_cfg)

if __name__ == "__main__":
    launch()

================================================
FILE: cramming/__init__.py
================================================
"""Initialize cramming"""

from cramming import utils
from cramming.architectures import construct_model
from cramming.backend import load_backend
from cramming.data import load_pretraining_corpus, prepare_dataloaders


__all__ = [
    "construct_model",
    "load_backend",
    "prepare_dataloaders",
    "load_pretraining_corpus",
    "utils",
]


import hydra

"""Construct interfaces to some cfg folders for use in packaged installations:"""


def get_config(overrides=None):
    """Return default hydra config.

    Args:
        overrides: optional list of hydra override strings. ``None`` (the
            default) means "no overrides".

    Returns:
        The config composed from the ``cfg`` entry of the ``config`` directory.
    """
    # A fresh list per call: a literal `[]` default would be one shared object.
    overrides = [] if overrides is None else overrides
    # NOTE(review): the repository listing shows cfg_eval.yaml / cfg_pretrain.yaml
    # but no cfg.yaml under config/ -- confirm that config_name="cfg" still resolves.
    with hydra.initialize(config_path="config"):
        cfg = hydra.compose(config_name="cfg", overrides=overrides)
        print(f"Loading default config {cfg.name}.")
    return cfg


def get_model_config(arch="hf-bert-tiny", overrides=None):
    """Return the hydra config for a given model architecture.

    Args:
        arch: name of the config under ``config/arch`` to compose.
        overrides: optional list of hydra override strings. ``None`` (the
            default) means "no overrides".

    Returns:
        The composed architecture config.
    """
    # A fresh list per call: a literal `[]` default would be one shared object.
    overrides = [] if overrides is None else overrides
    # NOTE(review): "hf-bert-tiny" does not appear in the config/arch listing of
    # this repository -- confirm the default still names an existing file.
    with hydra.initialize(config_path="config/arch"):
        cfg = hydra.compose(config_name=arch, overrides=overrides)
        print(f"Loading model configuration {cfg.architecture}.")
    return cfg


def get_backend_config(backend="torch-default", overrides=None):
    """Return the hydra config for a given backend implementation.

    Args:
        backend: name of the config under ``config/impl`` to compose.
        overrides: optional list of hydra override strings. ``None`` (the
            default) means "no overrides".

    Returns:
        The composed backend config.
    """
    # A fresh list per call: a literal `[]` default would be one shared object.
    overrides = [] if overrides is None else overrides
    with hydra.initialize(config_path="config/impl"):
        cfg = hydra.compose(config_name=backend, overrides=overrides)
        print(f"Loading backend {cfg.name}.")
    return cfg


================================================
FILE: cramming/architectures/__init__.py
================================================
"""This module handles all questions of model architecture."""

from .construction import construct_model

__all__ = ["construct_model"]


================================================
FILE: cramming/architectures/attention.py
================================================
"""Attention modules. Most code heavily stolen from the GPT-neoX implementation"""
import torch
from transformers.models.bert.modeling_bert import BertSelfAttention

from .embeddings import Rotary, RotarySanityCheck, RotaryEleutherAI, RotaryLLAMA, FIRE
from typing import Optional

from torch.nn.modules.linear import NonDynamicallyQuantizableLinear  # use to mark output projections of attn while it exists


def get_attention_mechanism(idx, hidden_size, cfg_attention, norm_fn: torch.nn.Identity):
    """Build the attention module named by ``cfg_attention.type``.

    Args:
        idx: not used by this function.
        hidden_size: passed to the selected module's constructor.
        cfg_attention: attention config; ``type`` selects the implementation.
        norm_fn: forwarded only to the "self-attention" implementation.

    Returns:
        The constructed attention module.

    Raises:
        ValueError: if ``cfg_attention.type`` names no known implementation.
    """
    # Ordered (type name, lazy constructor) pairs; only the matching one is ever built.
    candidates = (
        ("self-attention", lambda: SeqFirstSelfAttention(hidden_size, cfg_attention, norm_fn)),  # neox, main implementation
        ("pytorch", lambda: SelfAttentionPyTorch(hidden_size, cfg_attention)),  # torch default
        ("pytorch-seqfirst", lambda: SeqFirstSelfAttentionPyTorch(hidden_size, cfg_attention)),  # torch default
        ("huggingface", lambda: BertAttentionWrapper(hidden_size, cfg_attention)),  # always includes bias!
        ("fourier", lambda: FourierMixing(hidden_size, cfg_attention)),
        ("none", lambda: Identity(hidden_size)),
        ("rn", lambda: RandomNoise(hidden_size)),  # i.e. no signal on where to look
    )
    for type_name, build in candidates:
        if cfg_attention.type == type_name:
            return build()
    raise ValueError(f"Invalid attention type {cfg_attention.type} given.")


class Identity(torch.nn.Module):
    """Pass-through stand-in for an attention layer (selected by attention type "none").

    ``forward`` hands back ``hidden_states`` unchanged and ignores the mask.
    """

    __constants__ = ["LAYOUT"]
    LAYOUT = "[B S H]"

    def __init__(self, hidden_size):
        """Record ``hidden_size`` as ``output_dim``; the module has no parameters."""
        super().__init__()
        self.output_dim = hidden_size

    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor] = None):
        """Return ``hidden_states`` as-is; ``attention_mask`` is accepted but unused."""
        return hidden_states

class RandomNoise(torch.nn.Module):
    """Attention replacement that only perturbs its input with Gaussian noise.

    ``forward`` adds noise drawn with mean 0 and std 0.1 to ``hidden_states``
    and ignores the attention mask.
    """

    __constants__ = ["LAYOUT"]
    LAYOUT = "[B S H]"

    def __init__(self, hidden_size):
        """Record ``hidden_size`` as ``output_dim``; the module has no parameters."""
        super().__init__()
        self.output_dim = hidden_size

    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor] = None):
        """Return ``hidden_states`` plus fresh noise; prints a marker on every call."""
        print("using rn")
        # The noise is sampled first and then moved to the input's device.
        noise = torch.normal(0, 0.1, hidden_states.shape)
        return hidden_states + noise.to(hidden_states.device)

class BertAttentionWrapper(BertSelfAttention):
    """Thin adapter around huggingface's ``BertSelfAttention`` for sanity checks.

    Builds the minimal config object the parent constructor is given here and
    adds an output projection (``dense``) applied to the parent's first output.
    """

    __constants__ = ["LAYOUT"]
    LAYOUT = "[B S H]"

    def __init__(self, hidden_size, cfg_attention):
        """Initialise the parent from a throwaway config class and set up ``dense``."""
        # Ad-hoc class carrying the settings as class attributes.
        config = type(
            "config",
            (),
            dict(
                hidden_size=hidden_size,
                num_attention_heads=cfg_attention.num_attention_heads,
                attention_probs_dropout_prob=0.0,
                is_decoder=True,
            ),
        )

        super().__init__(config)
        self.dense = (
            torch.nn.Identity()
            if cfg_attention.skip_output_projection
            else torch.nn.Linear(hidden_size, hidden_size, bias=cfg_attention.bias_in_proj)
        )

    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor] = None):
        """Run the parent attention and project the first element of its output."""
        parent_outputs = super().forward(hidden_states, attention_mask)
        return self.dense(parent_outputs[0])


class SelfAttentionPyTorch(torch.nn.Module):
    """Batch-first self-attention backed by ``torch.nn.MultiheadAttention``."""

    __constants__ = ["LAYOUT"]
    LAYOUT = "[B S H]"

    def __init__(self, hidden_size, cfg_attention):
        """Create the wrapped attention module (dropout disabled, ``batch_first=True``)."""
        super().__init__()
        mha_options = dict(
            dropout=0.0,
            batch_first=True,
            bias=cfg_attention.bias_in_proj,
            add_bias_kv=cfg_attention.qkv_bias,
        )
        self.attn = torch.nn.MultiheadAttention(hidden_size, cfg_attention.num_attention_heads, **mha_options)

    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor] = None):
        """Self-attend over ``hidden_states`` and return only the attention output.

        The mask is indexed as ``attention_mask[0, 0]``, so a 4-d mask must be
        supplied despite the ``None`` default.
        """
        mask_2d = attention_mask[0, 0, :, :]
        attn_result = self.attn(
            hidden_states,
            hidden_states,
            hidden_states,
            attn_mask=mask_2d,
            need_weights=False,
            is_causal=True,
        )
        return attn_result[0]


class SeqFirstSelfAttentionPyTorch(torch.nn.Module):
    """Sequence-first self-attention backed by ``torch.nn.MultiheadAttention``."""

    __constants__ = ["LAYOUT"]
    LAYOUT = "[S B H]"

    def __init__(self, hidden_size, cfg_attention):
        """Create the wrapped attention module (dropout disabled, ``batch_first=False``)."""
        super().__init__()
        mha_options = dict(
            dropout=0.0,
            batch_first=False,
            bias=cfg_attention.bias_in_proj,
            add_bias_kv=cfg_attention.qkv_bias,
        )
        self.attn = torch.nn.MultiheadAttention(hidden_size, cfg_attention.num_attention_heads, **mha_options)

    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor] = None):
        """Self-attend over ``hidden_states`` and return only the attention output.

        The mask is indexed as ``attention_mask[0, 0]``, so a 4-d mask must be
        supplied despite the ``None`` default.
        """
        mask_2d = attention_mask[0, 0, :, :]
        attn_result = self.attn(
            hidden_states,
            hidden_states,
            hidden_states,
            attn_mask=mask_2d,
            need_weights=False,
            is_causal=True,
        )
        return attn_result[0]


class SeqFirstSelfAttention(torch.nn.MultiheadAttention):
    """Sequence-first multi-head self-attention.

    Derived from the gpt neo-x implementation (itself a megatron variant):
    https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py

    Modified from the neo-x version so that it compiles without graph breaks.

    Subclasses MultiheadAttention so it is caught by the same initialization,
    but only ``torch.nn.Module.__init__`` is run; every attribute is set up here.
    """

    __constants__ = ["LAYOUT"]
    LAYOUT: str = "[S B H]"

    def __init__(self, hidden_size: int, cfg_attention, norm_module=torch.nn.Identity):
        """Set up projections, positional embedding, sequence op and output layer.

        Args:
            hidden_size: model width; split evenly across the attention heads.
            cfg_attention: attention config (heads, biases, rotary/FIRE choice,
                sequence operation, output projection settings).
            norm_module: zero-argument factory for the norm applied before ``out_proj``.

        Raises:
            ValueError: if ``cfg_attention.sequence_op`` is not a known operation.
        """
        torch.nn.Module.__init__(self)
        head_count = cfg_attention.num_attention_heads
        self.hidden_size = hidden_size
        self.num_attention_heads = head_count
        self.hidden_per_head = self.hidden_size // head_count
        # 1 / sqrt(head width), multiplied into the raw attention scores
        self.register_buffer("norm_factor", torch.tensor(self.hidden_per_head).rsqrt())
        self.cfg_attention = cfg_attention
        self.use_fire = False

        self.norm = norm_module()

        # Fused q/k/v projection, laid out per head as [q | k | v].
        self.in_proj_weight = torch.nn.Parameter(torch.randn(3 * self.hidden_size, self.hidden_size))
        self.in_proj_bias = torch.nn.Parameter(torch.zeros(3 * self.hidden_size)) if cfg_attention.qkv_bias else None
        self.bias_k, self.bias_v = None, None  # for compat with MultiheadAttention

        self.output_dim = hidden_size

        # Positional scheme: a named variant, the default Rotary for any other truthy value, or none.
        rotary_kind = cfg_attention.rotary_embedding
        if rotary_kind == "sanity":
            self.rotary_emb = RotarySanityCheck(self.hidden_per_head, seq_dim=0)
        elif rotary_kind == "v2":
            self.rotary_emb = RotaryEleutherAI(self.hidden_per_head)
        elif rotary_kind == "llama":
            self.rotary_emb = RotaryLLAMA(self.hidden_per_head)
        elif rotary_kind == "fire":
            self.rotary_emb = FIRE(cfg_attention.num_attention_heads, max_length=cfg_attention.max_length)
            self.use_fire = True
        elif rotary_kind:
            self.rotary_emb = Rotary(self.hidden_per_head, seq_dim=0)
        else:
            self.rotary_emb = None

        # Ordered (name, lazy constructor) pairs; only the matching op is instantiated.
        sequence_op_builders = (
            ("torch-softmax", lambda: TorchSoftmax(cfg_attention.seq_op_in_fp32)),
            ("shaped-attention", lambda: TorchShaped(cfg_attention.seq_op_in_fp32, hidden_size=self.hidden_size)),
            ("swin-cosine", lambda: SwinCosine(cfg_attention.seq_op_in_fp32)),
            ("torch-norm", lambda: TorchNormalize(self.num_attention_heads, cfg_attention.seq_op_in_fp32)),
            ("none", lambda: ScaledIdentity(cfg_attention.seq_op_in_fp32)),
            ("cumsum", lambda: Cumsum(cfg_attention.seq_op_in_fp32)),
            ("cumsumexp", lambda: CumsumExp(cfg_attention.seq_op_in_fp32)),
        )
        for op_name, build_op in sequence_op_builders:
            if cfg_attention.sequence_op == op_name:
                self.sequence_op = build_op()
                break
        else:
            raise ValueError(f"Invalid sequence operation {cfg_attention.sequence_op} given.")

        self.out_proj = (
            torch.nn.Identity()
            if cfg_attention.skip_output_projection
            else NonDynamicallyQuantizableLinear(hidden_size, hidden_size, bias=cfg_attention.bias_in_proj)
        )

        self.attention_func = self.attention

    def attention(self, query_layer, key_layer, value_layer, attention_mask: Optional[torch.Tensor] = None, training: bool = False, fire: Optional[torch.Tensor] = None):
        """Compute attention for per-head q/k/v laid out as [s, b, np, hn].

        Args:
            query_layer, key_layer, value_layer: tensors of shape [s, b, np, hn].
            attention_mask: handed to the sequence op together with the scores.
            training: accepted for interface compatibility; not used here.
            fire: optional bias added in place to the raw scores.

        Returns:
            Context tensor of shape [b, np, sq, hn].
        """
        seq_q, batch, heads = query_layer.shape[0], query_layer.shape[1], query_layer.shape[2]
        seq_k = key_layer.shape[0]

        # ---- raw scores: fold (b, np) into one batch axis for bmm ----
        # [s, b, np, hn] -> [s, b * np, hn]
        query_layer = query_layer.view(seq_q, batch * heads, -1)
        key_layer = key_layer.view(seq_k, batch * heads, -1)

        # [b * np, sq, hn] x [b * np, hn, sk] -> [b * np, sq, sk], scaled by 1/sqrt(hn)
        matmul_result = torch.bmm(query_layer.transpose(0, 1), key_layer.permute(1, 2, 0)) * self.norm_factor

        # back to [b, np, sq, sk]
        attention_scores = matmul_result.view(batch, heads, seq_q, seq_k)
        if fire is not None:
            attention_scores += fire

        # ---- scores -> probabilities (mask handling lives in the sequence op) ----
        attention_probs = self.sequence_op(attention_scores, attention_mask)

        # ---- weighted sum of the values ----
        v_batch, v_heads, v_width = value_layer.shape[1], value_layer.shape[2], value_layer.shape[3]

        # [sk, b, np, hn] -> [sk, b * np, hn]
        value_layer = value_layer.view(value_layer.size(0), v_batch * v_heads, -1)

        # [b, np, sq, sk] -> [b * np, sq, sk]
        attention_probs = attention_probs.view(v_batch * v_heads, seq_q, -1)

        # [b * np, sq, sk] x [b * np, sk, hn] -> [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))

        # [b * np, sq, hn] -> [b, np, sq, hn]
        return context_layer.view(v_batch, v_heads, seq_q, v_width)

    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor] = None):
        """Self-attend over ``hidden_states`` ([sq, b, h]) and return a tensor of the same layout."""
        seq_len, batch = hidden_states.shape[0], hidden_states.shape[1]
        head_width = self.hidden_per_head

        # Fused projection: [sq, b, h] -> [sq, b, 3 * h]
        mixed_x_layer = torch.nn.functional.linear(hidden_states, self.in_proj_weight, self.in_proj_bias)

        # [sq, b, 3 * h] -> [sq, b, np, 3 * hn]
        mixed_x_layer = mixed_x_layer.view(seq_len, batch, self.num_attention_heads, 3 * head_width)

        # [sq, b, np, 3 * hn] -> three tensors of [sq, b, np, hn]
        query_layer, key_layer, value_layer = torch.split(mixed_x_layer, [head_width, head_width, head_width], dim=3)

        # FIRE yields an additive score bias; every other embedding transforms q and k.
        fire = None
        if self.rotary_emb is not None:
            if self.use_fire:
                fire = self.rotary_emb(query_layer.size(0), query_layer.device)
            else:
                query_layer, key_layer = self.rotary_emb(query_layer, key_layer)

        context_layer = self.attention_func(query_layer, key_layer, value_layer, attention_mask, self.training, fire)

        # [b, np, sq, hn] -> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] -> [sq, b, h]
        context_layer = context_layer.view(context_layer.shape[0], context_layer.shape[1], self.hidden_size)
        return self.out_proj(self.norm(context_layer))


class FourierMixing(torch.nn.Module):
    """Token mixing through a 2D Fourier transform, as described in the FNet paper.

    Maps an input of size [Batch, Seq, Hidden] to an output of the same size. An attention mask
    is accepted so the layer is call-compatible with attention modules, but it is never used.
    """

    __constants__ = ["LAYOUT"]
    LAYOUT = "[B S H]"

    def __init__(self, hidden_size, cfg_attention):
        """Store the output width and optionally build a rotary embedding.

        Args:
            hidden_size: Width of the hidden dimension; also exposed as ``output_dim``.
            cfg_attention: Config providing ``rotary_embedding`` and, when that is truthy,
                ``low_level_fusion``.
        """
        super().__init__()
        self.fft_op_in_fp32 = True  # Always necessary (at least on pytorch 1.12)
        self.output_dim = hidden_size
        # NOTE(review): the fused variant uses seq_dim=1 while the plain one uses seq_dim=0 --
        # confirm which matches the [B S H] layout.
        if not cfg_attention.rotary_embedding:
            self.rotary_emb = None
        elif cfg_attention.low_level_fusion:
            self.rotary_emb = torch.jit.script(Rotary(hidden_size, seq_dim=1))
        else:
            self.rotary_emb = Rotary(hidden_size, seq_dim=0)

    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor] = None):
        """Mix ``hidden_states`` over dims 1 and 2; ``attention_mask`` is ignored."""
        rotary = self.rotary_emb
        if rotary is not None:
            # full rotary (mostly on for compatibility, no guarantees on this being non-terrible)
            cos, sin = rotary.get_cos_sin_cache(hidden_states)
            scaled = hidden_states * cos[:, 0]
            rotated = rotary.rotate_half(hidden_states) * sin[:, 0]
            hidden_states = scaled + rotated

        # Remember the incoming dtype only when the FFT is forced to run in fp32.
        restore_dtype = hidden_states.dtype if self.fft_op_in_fp32 else None
        if restore_dtype is not None:
            hidden_states = hidden_states.float()

        # A single 2D transform over (seq, hidden); only the real part is kept.
        # (An equivalent formulation chains two 1D FFTs, one per dimension.)
        mixed = torch.fft.fftn(hidden_states, dim=(1, 2), norm="ortho").real

        if restore_dtype is not None:
            mixed = mixed.to(restore_dtype)
        return mixed


class TorchSoftmax(torch.nn.Module):
    """Softmax over the last dimension, with optional fp32 upcast and boolean masking."""

    def __init__(self, seq_op_in_fp32=False):
        super().__init__()
        self.seq_op_in_fp32 = seq_op_in_fp32

    def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
        """Return softmax probabilities in the dtype of ``inputs``.

        Positions where ``attention_mask`` is True are filled with -10000.0 before the softmax.
        The fill is in-place, so unless the fp32 cast produced a copy the caller's tensor is
        modified as well.
        """
        original_dtype = inputs.dtype
        scores = inputs.to(dtype=torch.float) if self.seq_op_in_fp32 else inputs
        if attention_mask is not None:
            scores.masked_fill_(attention_mask, -10000.0)
        return torch.softmax(scores, dim=-1).to(dtype=original_dtype)


class TorchShaped(torch.nn.Module):
    """Shaped attention following Noci et al.

    The scores are scaled by ``hidden_size ** -0.5``, passed through a softmax and then
    re-centred: the identity matrix is added and the uniform value ``1 / n`` is subtracted,
    where ``n`` is the size of the last dimension.
    """

    def __init__(self, seq_op_in_fp32=False, hidden_size=768):
        """
        Args:
            seq_op_in_fp32: Upcast the scores to fp32 before masking and softmax.
            hidden_size: Width whose inverse square root scales the scores.
        """
        super().__init__()
        self.seq_op_in_fp32 = seq_op_in_fp32
        # float() keeps the buffer floating point regardless of how hidden_size was passed in.
        self.register_buffer("nfactor", torch.tensor(float(hidden_size)).rsqrt())

    def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
        """Return the shaped attention weights in the dtype of ``inputs``.

        Positions where ``attention_mask`` is True are filled in-place with -10000.0 before
        the softmax.
        """
        input_dtype = inputs.dtype
        if self.seq_op_in_fp32:
            inputs = inputs.to(dtype=torch.float)
        if attention_mask is not None:
            inputs = inputs.masked_fill_(attention_mask, -10000.0)
        probs = torch.softmax(inputs * self.nfactor, dim=-1).to(dtype=input_dtype)
        # Broadcast one identity matrix over the two leading dimensions.
        identity = torch.eye(probs.shape[-1], dtype=probs.dtype, device=probs.device)[None, None, :, :]
        return probs + identity - 1 / probs.shape[-1]


class SwinCosine(torch.nn.Module):
    """kind of SwinCosine, but not quite (normalizations scaled by mean(q) and mean(k))"""

    def __init__(self, seq_op_in_fp32=False, tau=0.1, eps=1e-8):
        super().__init__()
        self.seq_op_in_fp32 = seq_op_in_fp32
        self.tau = 0.1
        self.eps = eps

    def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
        """inputs are q_i, k_j -> o_ij. Normalize"""
        input_dtype = inputs.dtype
        if self.seq_op_in_fp32:
            inputs = inputs.to(dtype=torch.float)
        row_norm = inputs.mean(dim=-1, keepdim=True).norm(dim=-2, keepdim=True)
        col_norm = inputs.mean(dim=-2, keepdim=True).norm(dim=-1, keepdim=True)
        outputs = inputs / torch.clamp(row_norm * col_norm * self.tau, min=self.eps)

        if attention_mask is not None:
            outputs[:, :, attention_mask[0, 0]] = 0

        return outputs.to(dtype=input_dtype)


class TorchNormalize(torch.nn.Module):
    """Normalized attention pooling as described in Richter&Wattenhofer, 2020.

    Applies a layer norm across every non-leading dimension of the scores, followed by a
    learnable scale and shift with one entry per attention head.
    """

    def __init__(self, num_attention_heads=1, seq_op_in_fp32=False):
        super().__init__()
        self.seq_op_in_fp32 = seq_op_in_fp32
        per_head_shape = (1, num_attention_heads, 1, 1)
        self.seq_gamma = torch.nn.Parameter(torch.ones(per_head_shape))
        self.seq_beta = torch.nn.Parameter(torch.zeros(per_head_shape))

    def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
        """Normalize scores laid out as [b, np, sq, sk] and return them in the input dtype.

        Masked positions are zeroed in-place before the normalization.
        """
        original_dtype = inputs.dtype
        scores = inputs.to(dtype=torch.float) if self.seq_op_in_fp32 else inputs

        if attention_mask is not None:
            scores.masked_fill_(attention_mask, 0.0)

        normalized = torch.nn.functional.layer_norm(scores, scores.shape[1:], eps=1e-05)
        return (normalized * self.seq_gamma + self.seq_beta).to(dtype=original_dtype)


class ScaledIdentity(torch.nn.Module):
    """Pass the scores through unchanged except for a ``shape[2] ** -0.5`` scaling."""

    def __init__(self, seq_op_in_fp32):
        super().__init__()
        self.seq_op_in_fp32 = seq_op_in_fp32

    def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
        """Sequence-scaled input. ``attention_mask`` is accepted but not used."""
        original_dtype = inputs.dtype
        if self.seq_op_in_fp32:
            inputs = inputs.to(dtype=torch.float)
        # Inverse square root of the size of dimension 2, held as a 0-dim tensor.
        scale = torch.as_tensor(inputs.shape[2]).rsqrt()
        scaled = inputs * scale
        return scaled.to(dtype=original_dtype)


class Cumsum(torch.nn.Module):
    """Replace the softmax with a running sum over the last dimension."""

    def __init__(self, seq_op_in_fp32):
        super().__init__()
        self.seq_op_in_fp32 = seq_op_in_fp32

    def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
        """Sequence-scaled input cumulative sum. ``attention_mask`` is accepted but not used."""
        original_dtype = inputs.dtype
        if self.seq_op_in_fp32:
            inputs = inputs.to(dtype=torch.float)
        scale = inputs.shape[2] ** -0.5
        running_sum = torch.cumsum(inputs, dim=-1)
        return (running_sum * scale).to(dtype=original_dtype)


class CumsumExp(torch.nn.Module):
    """Replace the softmax with a running log-sum-exp over the last dimension."""

    def __init__(self, seq_op_in_fp32):
        super().__init__()
        # The argument is accepted for interface parity but overridden on purpose.
        self.seq_op_in_fp32 = True  # Required as of pytorch 1.13

    def forward(self, inputs, attention_mask: Optional[torch.Tensor] = None):
        """Sequence-scaled cumulative log-sum-exp. ``attention_mask`` is accepted but not used."""
        original_dtype = inputs.dtype
        if self.seq_op_in_fp32:
            inputs = inputs.to(dtype=torch.float)
        scale = inputs.shape[2] ** -0.5
        running_lse = torch.logcumsumexp(inputs, dim=-1)
        return (running_lse * scale).to(dtype=original_dtype)


================================================
FILE: cramming/architectures/components.py
================================================
"""Basic transformer components."""

import torch

from typing import Tuple
from functools import partial

from .embeddings import SinusoidalPositional, LearnablePositional, ScaledSinosoidal, Abacus
from torch.nn.modules.linear import NonDynamicallyQuantizableLinear  # use to mark output projections of attn while it exists

INPLACE = False


class EmbeddingComponent(torch.nn.Module):
    """Word embedding with an optional additive positional embedding and optional norm.

    The positional scheme is selected through ``cfg_embedding.pos_embedding``; any value that
    is not recognised disables positional embeddings.
    """

    def __init__(self, cfg_embedding, norm, norm_eps):
        """
        Args:
            cfg_embedding: Config with ``vocab_size``, ``embedding_dim``, ``pos_embedding`` and
                ``normalization``; the positional variants also read ``max_seq_length`` (and
                ``max_abacus_len`` for "abacus").
            norm: Name of the normalization layer, resolved through ``_get_norm_fn``.
            norm_eps: Epsilon forwarded to the normalization layer.
        """
        super().__init__()

        self.word_embedding = torch.nn.Embedding(cfg_embedding.vocab_size, cfg_embedding.embedding_dim)

        pos_kind = cfg_embedding.pos_embedding
        if pos_kind == "learned":
            self.pos_embedding = LearnablePositional(cfg_embedding.embedding_dim, cfg_embedding.max_seq_length)
        elif pos_kind == "learned_rand":
            # NOTE(review): LearnablePositionalRand is not among the names imported from
            # .embeddings at the top of this module -- confirm it is in scope.
            self.pos_embedding = LearnablePositionalRand(cfg_embedding.embedding_dim, cfg_embedding.max_seq_length)
        elif pos_kind == "sinusoidal":
            self.pos_embedding = SinusoidalPositional(cfg_embedding.embedding_dim, cfg_embedding.max_seq_length)
        elif pos_kind == "scaled-sinusoidal":
            self.pos_embedding = ScaledSinosoidal(cfg_embedding.embedding_dim, cfg_embedding.max_seq_length)
        elif pos_kind == "abacus":
            self.pos_embedding = Abacus(cfg_embedding.embedding_dim, cfg_embedding.max_seq_length, max_k=cfg_embedding.max_abacus_len)
        else:
            self.pos_embedding = None

        if not cfg_embedding.normalization:
            self.stabilize_low_precision = False
            self.norm = torch.nn.Identity()
        else:
            self.stabilize_low_precision = cfg_embedding.get("stable_low_precision", False)
            self.norm = _get_norm_fn(norm)(cfg_embedding.embedding_dim, eps=norm_eps)

    def forward(self, input_ids):
        """Embed ``input_ids``, add positional information if configured, then normalize."""
        embeds = self.word_embedding(input_ids)

        if self.pos_embedding is not None:
            embeds += self.pos_embedding(input_ids)

        if not self.stabilize_low_precision:
            return self.norm(embeds)

        # Stabilize as in bnb StableEmbedding: normalize in the default dtype, then cast back.
        return self.norm(embeds.to(torch.get_default_dtype())).to(embeds.dtype)


class PredictionHeadComponent(torch.nn.Module):
    """Dense layer, nonlinearity and norm that map hidden states to the embedding width."""

    def __init__(self, cfg_arch):
        """
        Args:
            cfg_arch: Architecture config; reads ``hidden_size``, ``embedding.embedding_dim``,
                ``use_bias``, ``nonlin``, ``norm`` and ``norm_eps``.
        """
        super().__init__()

        hidden = cfg_arch.hidden_size
        embedding_dim = cfg_arch.embedding.embedding_dim
        # The head always ends at the embedding width; the two only differ when the model
        # uses an embedding dimension unequal to its hidden size.
        output_size = hidden if embedding_dim == hidden else embedding_dim

        self.dense = torch.nn.Linear(hidden, output_size, bias=cfg_arch.use_bias)
        # Gating is disabled here, so a "*glu" name resolves to its plain activation.
        self.nonlin = _get_nonlin_fn(cfg_arch.nonlin, use_gating=False)()
        self.norm = _get_norm_fn(cfg_arch.norm)(output_size, eps=cfg_arch.norm_eps)

    def forward(self, hidden_states):
        """Apply dense -> nonlinearity -> norm."""
        projected = self.dense(hidden_states)
        activated = self.nonlin(projected)
        return self.norm(activated)


class NormalizedResidualConnection(torch.nn.Module):
    """Implement variations on residual connection types, especially stabilized versions and deep/shaped propagation.

    The scheme is picked once at construction time from ``cfg_arch.norm_scheme`` and stored as
    a bound method, so ``forward`` is a plain dispatch. Supported schemes: "pre", "post",
    "simple", "deepnorm", "shaped" and "sandwich". Every scheme routes the layer output
    through ``self.dropout`` before it is added to the residual.
    """

    def __init__(self, input_dim, cfg_arch, output_dim=None, dropout=0.0):
        """
        Args:
            input_dim: Width seen by norms that are applied before the layer.
            cfg_arch: Architecture config; reads ``norm_scheme`` and, depending on the scheme,
                ``norm``, ``norm_eps``, ``num_transformer_layers`` or
                ``layers_in_recurrent_block`` / ``maximal_recurrence``.
            output_dim: Width seen by norms applied after the layer; defaults to ``input_dim``.
            dropout: Dropout probability for the layer output; 0 disables dropout entirely.

        Raises:
            ValueError: If the scheme is unknown, or "deepnorm" is requested without a layer
                count in the config.
        """
        super().__init__()
        output_dim = input_dim if output_dim is None else output_dim
        self.dropout = torch.nn.Dropout(dropout) if dropout > 0 else torch.nn.Identity()
        if cfg_arch.norm_scheme == "pre":
            self.norm = _get_norm_fn(cfg_arch.norm)(input_dim, eps=cfg_arch.norm_eps)
            self._chosen_forward_impl = self._prenormalization_residual
        elif cfg_arch.norm_scheme == "post":
            self.norm = _get_norm_fn(cfg_arch.norm)(output_dim, eps=cfg_arch.norm_eps)
            self._chosen_forward_impl = self._postnormalization_residual
        elif cfg_arch.norm_scheme == "simple":
            self._chosen_forward_impl = self._simple_residual
        elif cfg_arch.norm_scheme == "deepnorm":
            self.norm = _get_norm_fn(cfg_arch.norm)(output_dim, eps=cfg_arch.norm_eps)
            # alpha = (2 * effective depth) ** 0.25, with the depth taken from whichever
            # layer-count setting the config provides.
            if "num_transformer_layers" in cfg_arch:
                self.alpha = (2.0 * cfg_arch.num_transformer_layers) ** 0.25
            elif "layers_in_recurrent_block" in cfg_arch:
                self.alpha = (2.0 * cfg_arch.layers_in_recurrent_block * cfg_arch.maximal_recurrence) ** 0.25
            else:
                raise ValueError("Need to define `num_transformer_layers` in config for deepnorm.")
            self._chosen_forward_impl = self._deepnorm_residual
        elif cfg_arch.norm_scheme == "shaped":
            self.norm = _get_norm_fn(cfg_arch.norm)(input_dim, eps=cfg_arch.norm_eps)
            self.gamma = 0.214  # Noci et al., could make this into a parameter
            # alpha and gamma satisfy alpha**2 + gamma**2 == 1.
            self.alpha = torch.as_tensor(1 - self.gamma**2).sqrt().item()
            self._chosen_forward_impl = self._prenorm_equalized_residual
        elif cfg_arch.norm_scheme == "sandwich":
            self.norm = _get_norm_fn(cfg_arch.norm)(input_dim, eps=cfg_arch.norm_eps)
            self.norm2 = _get_norm_fn(cfg_arch.norm)(output_dim, eps=cfg_arch.norm_eps)
            self._chosen_forward_impl = self._sandwich_residual
        else:
            raise ValueError(f"Invalid type of residual connection {cfg_arch.norm_scheme} given.")

    def _simple_residual(self, residual, layer, states, *args, **kwargs):
        """residual + dropout(layer(states)), without any normalization."""
        return residual + self.dropout(layer(states, *args, **kwargs))

    def _prenormalization_residual(self, residual, layer, states, *args, **kwargs):
        """residual + dropout(layer(norm(states)))."""
        return residual + self.dropout(layer(self.norm(states), *args, **kwargs))

    def _postnormalization_residual(self, residual, layer, states, *args, **kwargs):
        """norm(residual + dropout(layer(states)))."""
        # Dropout is applied here as in every other scheme; without it a configured dropout
        # rate would be silently ignored for post-norm models.
        return self.norm(residual + self.dropout(layer(states, *args, **kwargs)))

    def _deepnorm_residual(self, residual, layer, states, *args, **kwargs):
        """norm(alpha * residual + dropout(layer(states)))."""
        return self.norm(residual * self.alpha + self.dropout(layer(states, *args, **kwargs)))

    def _prenorm_equalized_residual(self, residual, layer, states, *args, **kwargs):
        """alpha * residual + gamma * dropout(layer(norm(states)))."""
        return residual * self.alpha + self.dropout(layer(self.norm(states), *args, **kwargs)) * self.gamma

    def _sandwich_residual(self, residual, layer, states, *args, **kwargs):
        """norm2(residual + dropout(layer(norm(states))))."""
        return self.norm2(residual + self.dropout(layer(self.norm(states), *args, **kwargs)))

    def forward(self, residual: torch.Tensor, layer_callable: torch.nn.Module, states: torch.Tensor, *args, **kwargs):
        """Argument might look weird here, but I find it nicer because it reads like the pre/post schemes from left to right,
        as
        residual + layer ( state )

        Additional args are passed directly into the layer callable
        """
        return self._chosen_forward_impl(residual, layer_callable, states, *args, **kwargs)


def _get_norm_fn(norm_name):
    if norm_name == "ScaleNorm":
        norm_fn = ScaleNorm
    elif norm_name == "RMSNorm":
        norm_fn = RMSNorm
    elif norm_name == "ApexLayerNorm":
        from apex.normalization import FusedLayerNorm

        norm_fn = FusedLayerNorm
    else:
        norm_fn = getattr(torch.nn, norm_name)
    return norm_fn


def _get_nonlin_fn(nonlin_name, use_gating=True):
    """Build a zero-argument factory for the named activation.

    A name containing "glu" (case-insensitive) denotes a gated variant: everything before the
    first lowercase "glu" names the inner ``torch.nn`` activation, which is wrapped in ``GLU``
    when ``use_gating`` is true. The activation is created with ``inplace=INPLACE`` whenever
    its constructor accepts that keyword.
    """
    is_glu_variant = "glu" in nonlin_name.lower()
    if is_glu_variant:
        nonlin_name = nonlin_name.split("glu")[0]
    wrap_in_glu = use_gating if is_glu_variant else False

    base_cls = getattr(torch.nn, nonlin_name)  # dont mess this up :<
    try:
        # Probe by instantiating once: activations without an ``inplace`` keyword raise
        # TypeError and fall back to the bare class.
        factory = partial(base_cls, inplace=INPLACE)
        factory()
    except TypeError:
        factory = getattr(torch.nn, nonlin_name)

    return partial(GLU, factory) if wrap_in_glu else factory


class GLU(torch.nn.Module):
    """*-GLU activation functions.

    Splits the last dimension in half and multiplies the first half by the activated second
    half. Implementation mostly following megatron.
    """

    def __init__(self, sub_activation):
        """``sub_activation`` is a factory (class or partial) for the gate activation."""
        super().__init__()
        self.sub_activation = sub_activation()

    def forward(self, inputs):
        value, gate = torch.chunk(inputs, 2, dim=-1)
        activated_gate = self.sub_activation(gate)
        return activated_gate * value


class ScaleNorm(torch.nn.Module):
    """Quick and simple scale norm implementation. "elementwise_affine" is not the ideal name but for compat with LayerNorm

    A single scalar, initialized to ``hidden_size ** -0.5``, rescales the input after it has
    been divided by its norm over the last dimension.

    Do we also need FixNorm (cosine in the last layer)? It's a maybe here:
    https://github.com/lucidrains/performer-pytorch/issues/55#issuecomment-762544686
    """

    def __init__(self, hidden_size: int, eps: float = 1e-5, elementwise_affine: bool = True):
        super().__init__()
        self.eps = eps
        initial_scale = torch.tensor(float(hidden_size) ** -0.5)
        # Trainable parameter when affine, fixed buffer otherwise -- same attribute name.
        if elementwise_affine:
            self.learnable_scale = torch.nn.Parameter(initial_scale)
        else:
            self.register_buffer("learnable_scale", initial_scale)

    def forward(self, inputs):
        """This is the same eps clipping as in the original ScaleNorm implementation."""
        clipped_norm = torch.norm(inputs, dim=-1, keepdim=True).clamp(min=self.eps)
        return inputs * self.learnable_scale / clipped_norm


class RMSNorm(torch.nn.Module):
    """The RMS variant of scaling norms.  "elementwise_affine" is not the ideal name but for compat with LayerNorm

    The per-feature scale starts out as a vector of ones.
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6, elementwise_affine: bool = True):
        super().__init__()
        self.eps = eps
        initial_scale = torch.ones(hidden_size)
        # Trainable parameter when affine, fixed buffer otherwise -- same attribute name.
        if elementwise_affine:
            self.learnable_scale = torch.nn.Parameter(initial_scale)
        else:
            self.register_buffer("learnable_scale", initial_scale)

    def _legacy_forward(self, inputs):
        """This is the same eps clipping as in the original ScaleNorm implementation."""
        clipped_norm = torch.norm(inputs, dim=-1, keepdim=True).clamp(min=1e-8)
        return inputs * self.learnable_scale / clipped_norm

    def _norm(self, x):
        """LLama implementation"""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """Normalize in fp32, cast back to the dtype of ``x``, then apply the scale."""
        normalized = self._norm(x.float()).type_as(x)
        return normalized * self.learnable_scale


def get_causal_attention_mask(input_ids) -> torch.Tensor:
    """Simplified triangular causal mask. Adapted for multiple heads.

    Returns a boolean tensor of shape [1, 1, seq, seq] that is True strictly above the
    diagonal, where ``seq`` is ``input_ids.shape[1]``.
    """
    seq_length = input_ids.shape[1]  # not transposed yet
    # Ones on and below the diagonal, zeros above it.
    lower_triangle = torch.ones((1, 1, seq_length, seq_length), device=input_ids.device).tril()
    # convert to binary: the zeros above the diagonal become True
    return lower_triangle < 0.5


def get_extended_attention_mask(attention_mask: torch.Tensor, input_shape: Tuple[int], causal_attention: bool = False) -> torch.Tensor:
    """
    Build a broadcastable additive mask so that future and masked tokens are ignored.

    Arguments:
        attention_mask (`torch.Tensor`):
            Mask with ones indicating tokens to attend to, zeros for tokens to ignore. Either
            [batch_size, seq_length] or [batch_size, from_seq_length, to_seq_length].
        input_shape (`Tuple[int]`):
            The shape of the input to the model, read as (batch_size, seq_length).
        causal_attention (`bool`):
            Combine a 2D padding mask with a lower-triangular causal mask.
    Returns:
        `torch.Tensor` The extended attention mask: 0.0 where attention is allowed and
        -10000.0 where it is not.
    Raises:
        ValueError: If `attention_mask` is neither 2- nor 3-dimensional.

    Method adapted from huggingface :)
    """
    rank = attention_mask.dim()
    if rank not in (2, 3):
        raise ValueError(f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})")

    if rank == 3:
        # A full [batch, from, to] mask only needs a head dimension to broadcast over.
        keep = attention_mask[:, None, :, :]
    elif not causal_attention:
        # Encoder-style padding mask: broadcast over heads and query positions.
        keep = attention_mask[:, None, None, :]
    else:
        # Decoder-style: combine the padding mask with a lower-triangular causal mask.
        batch_size, seq_length = input_shape
        positions = torch.arange(seq_length, device=attention_mask.device)
        causal = positions[None, None, :].expand(batch_size, seq_length, seq_length) <= positions[None, :, None]
        # causal and attention masks must have same type with pytorch version < 1.3
        causal = causal.to(attention_mask.dtype)

        # in case past_key_values are used we need to add a prefix ones mask to the causal mask
        prefix_seq_len = attention_mask.shape[1] - causal.shape[1]
        if prefix_seq_len > 0:
            prefix = torch.ones((batch_size, seq_length, prefix_seq_len), device=attention_mask.device, dtype=causal.dtype)
            causal = torch.cat([prefix, causal], dim=-1)
        keep = causal[:, None, :, :] * attention_mask[:, None, None, :]

    # Turn the keep-mask (1 = attend) into an additive bias (0 = attend, -10000 = ignore).
    return (1.0 - keep) * -10000.0


"""Collect inits."""


@torch.no_grad()
def _init_module(module, init_method="normal", init_std=0.02, hidden_size=768, num_layers=12):
    """Todo: refactor this insanity"""
    if "deepnorm" in init_method:  # This is a xavier init with changes in the MHA inits
        if "normal" in init_method:
            gain = init_std
        elif "subln" in init_method:
            gain = torch.as_tensor(2 * num_layers).log().sqrt()  # foundation transformer paper, use with subln
        elif "straight" in init_method:
            gain = torch.as_tensor(8 * num_layers).pow(-0.25)  # deepnorm paper, use with deepnorm
        elif "as-is" in init_method:  # use locally defined inits for each module
            gain = 1.0
        else:
            raise ValueError(f"Invalid init method {init_method} given.")

        if isinstance(module, torch.nn.Linear):
            if isinstance(module, NonDynamicallyQuantizableLinear):
                # This is handled below in the MultiheadAttention section
                pass
            else:
                if module.weight is not None:
                    torch.nn.init.xavier_normal_(module.weight, gain=gain)
                if module.bias is not None:
                    module.bias.data.zero_()
        elif isinstance(module, torch.nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0, std=module.weight.shape[1] ** -0.5)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, torch.nn.LayerNorm):
            if module.weight is not None:
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
        elif isinstance(module, torch.nn.MultiheadAttention):  # be careful with other transformer definitions!
            if "mimetic" in init_method:
                if module.in_proj_weight is not None:
                    h = module.in_proj_weight.shape[1]
                    Z1 = module.in_proj_weight.new_empty([h, h])
                    torch.nn.init.xavier_normal_(Z1, gain=gain)  # as per deepnorm prescription
                    I = torch.eye(h, device=module.in_proj_weight.device, dtype=module.in_proj_weight.dtype)
                    U1, S1, V1 = torch.linalg.svd(Z1 + I, full_matrices=False)
                    V = U1 @ torch.diag_embed(S1.sqrt())
                    O = V1 @ torch.diag_embed(S1.sqrt())

                    k = module.head_dim
                    I = torch.eye(h, device=module.in_proj_weight.device, dtype=module.in_proj_weight.dtype)
                    Qlist, Klist = [], []
                    for head in range(module.num_heads):
                        Z2 = module.in_proj_weight.new_empty([h, h])
                        torch.nn.init.xavier_normal_(Z2, gain=1.0)  # as per deepnorm prescription
                        U2, S2, V2 = torch.linalg.svd(Z2 + I, full_matrices=False)
                        Qlist.append(U2[:, :k] @ torch.diag_embed(S2[:k].sqrt()))
                        Klist.append(V2[:, :k] @ torch.diag_embed(S2[:k].sqrt()))
                    Q, K = torch.cat(Qlist, dim=-1), torch.cat(Klist, dim=-1)
                    module.in_proj_weight.data.copy_(torch.cat([Q, K, V], dim=0).contiguous())
                    if module.out_proj is not None:
                        module.out_proj.weight.data.copy_(O)
            else:
                if module.in_proj_weight is not None:
                    h = module.in_proj_weight.shape[1]
                    Q, K, V = (
                        module.in_proj_weight.new_empty([h, h]),
                        module.in_proj_weight.new_empty([h, h]),
                        module.in_proj_weight.new_empty([h, h]),
                    )
                    torch.nn.init.xavier_normal_(Q, gain=1.0)  # as per deepnorm prescription
                    torch.nn.init.xavier_normal_(K, gain=1.0)
                    torch.nn.init.xavier_normal_(V, gain=gain)
                    module.in_proj_weight.data.copy_(torch.cat([Q, K, V], dim=0).contiguous())
                # init outproj:
                if module.out_proj is not None:
                    torch.nn.init.xavier_normal_(module.out_proj.weight, gain=gain)
                    if module.out_proj.bias is not None:
                        module.out_proj.bias.data.zero_()
            if module.in_proj_bias is not None:
                module.in_proj_bias.data.zero_()
            if module.bias_k is not None:
                module.bias_k.data.zero_()
            if module.bias_v is not None:
                module.bias_v.data.zero_()
            if module.out_proj is not None and module.out_proj.bias is not None:
                module.out_proj.bias.data.zero_()
    else:
        if "normal" in init_method:
            std = init_std
        elif init_method == "small" in init_method:
            # Transformers without Tears: Improving
            # the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010)
            std = torch.as_tensor(2 / (5 * hidden_size)).sqrt()
        elif "megatron" in init_method:
            std = torch.as_tensor(1 / (3 * hidden_size)).sqrt()
            # Megatron init is near-equal to normal if hidden=768, but otherwise smaller
        elif "wang" in init_method:
            std = 2 / num_layers / torch.as_tensor(hidden_size).sqrt()
        elif "as-is" in init_method:  # use locally defined inits for each module
            return
        else:
            raise ValueError(f"Invalid init method {init_method} given.")
        if isinstance(module, torch.nn.Linear):
            if isinstance(module, NonDynamicallyQuantizableLinear):
                # This is handled below in the MultiheadAttention section
                pass
            else:
                # Slightly different from the TF version which uses truncated_normal for initialization
                # cf https://github.com/pytorch/pytorch/pull/5617
                if module.weight is not None:
                    module.weight.data.normal_(mean=0.0, std=std)
                if module.bias is not None:
                    module.bias.data.zero_()
        elif isinstance(module, torch.nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, torch.nn.LayerNorm):
            if module.weight is not None:
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
        elif isinstance(module, torch.nn.MultiheadAttention):  # be careful with other transformer definitions!
            if "mimetic" in init_method:
                if module.in_proj_weight is not None:
                    h = module.in_proj_weight.shape[1]
                    Z1 = module.in_proj_weight.new_empty([h, h]).normal_() / h
                    I = torch.eye(h, device=module.in_proj_weight.device, dtype=module.in_proj_weight.dtype)
                    U1, S1, V1 = torch.linalg.svd(0.2 * Z1 + 0.2 * I, full_matrices=False)
                    V = U1 @ torch.diag_embed(S1.sqrt())
                    O = V1 @ torch.diag_embed(S1.sqrt())

                    k = module.head_dim
                    I = torch.eye(h, device=module.in_proj_weight.device, dtype=module.in_proj_weight.dtype)
                    Qlist, Klist = [], []
                    for head in range(module.num_heads):
                        # Z2 = module.in_proj_weight.new_empty([h, h]).normal_() / h
                        U2, S2, V2 = torch.linalg.svd(0 + 0.5 * I, full_matrices=False)  # alpha1 =0 from Trockman
                        Qlist.append(U2[:, :k] @ torch.diag_embed(S2[:k].sqrt()))  # this is a bit pointless, ...
                        Klist.append(V2[:, :k] @ torch.diag_embed(S2[:k].sqrt()))  # ... I've left it here for alpha1 not zero
                    Q, K = torch.cat(Qlist, dim=-1), torch.cat(Klist, dim=-1)
                    module.in_proj_weight.data.copy_(torch.cat([Q, K, V], dim=0).contiguous())
                    if module.out_proj is not None:
                        module.out_proj.weight.data.copy_(O)
            else:
                if module.in_proj_weight is not None:
                    module.in_proj_weight.data.normal_(mean=0.0, std=std)
                if module.out_proj is not None:
                    module.out_proj.weight.data.normal_(mean=0.0, std=std)
            if module.in_proj_bias is not None:
                module.in_proj_bias.data.zero_()
            if module.bias_k is not None:
                module.bias_k.data.zero_()
            if module.bias_v is not None:
                module.bias_v.data.zero_()
            # init outproj:
            if module.out_proj is not None and module.out_proj.bias is not None:
                module.out_proj.bias.data.zero_()


================================================
FILE: cramming/architectures/construction.py
================================================
"""Interface to construct models."""

from .huggingface_interface import construct_huggingface_model
from .sanity_check import SanityCheckforPreTraining
from .crammed_transformer import construct_crammed_transformer
from .crammed_depthrecurrent import construct_crammed_recurrent

import logging
from ..utils import is_main_process

log = logging.getLogger(__name__)


def construct_model(cfg_arch, tokenizer):
    """Construct a model from an architecture config.

    Local architectures are tried first, selected by substring match on
    ``cfg_arch.model_type``. If none matches, or the config carries no ``model_type``, the
    config is handed to the Hugging Face constructor.

    Args:
        cfg_arch: Architecture config supporting ``in`` and attribute access.
        tokenizer: Tokenizer providing ``vocab_size`` (and ``vocab`` for the depth-recurrent
            model, which needs the id of the "=" token).

    Returns:
        The constructed model.

    Raises:
        ValueError: If neither a local nor a Hugging Face model can be built; the underlying
            exception is attached as the cause.
    """
    model = None
    if "model_type" in cfg_arch:
        # attempt to solve locally
        if "SanityCheckLM" in cfg_arch.model_type:
            model = SanityCheckforPreTraining(cfg_arch.width, tokenizer.vocab_size)
        elif "ScriptableCrammedTransformer" in cfg_arch.model_type:
            model = construct_crammed_transformer(cfg_arch, tokenizer.vocab_size)
        elif "ScriptableCrammedDepthRecurrent" in cfg_arch.model_type:
            equals_token = tokenizer.vocab["="]
            model = construct_crammed_recurrent(cfg_arch, tokenizer.vocab_size, equals_token)

    if model is not None:  # Return local model arch
        num_params = sum(p.numel() for p in model.parameters())
        if is_main_process():
            log.info(f"Model with architecture {cfg_arch.model_type} loaded with {num_params:,} parameters.")
        return model

    try:  # else try on HF
        model = construct_huggingface_model(cfg_arch, tokenizer.vocab_size)
        num_params = sum(p.numel() for p in model.parameters())
        if is_main_process():
            log.info(f"Model with config {cfg_arch} loaded with {num_params:,} parameters.")
        return model
    except Exception as e:
        # Reading cfg_arch.model_type unconditionally could itself fail when the key is
        # absent, hiding the real error behind an unrelated one.
        model_type = cfg_arch.model_type if "model_type" in cfg_arch else "<unspecified>"
        raise ValueError(f"Invalid model architecture {model_type} given. Error: {e}") from e


================================================
FILE: cramming/architectures/crammed_depthrecurrent.py
================================================
"""Variant for modifications of the transformer architecture that are depth-recurrent"""
import torch
from transformers import PretrainedConfig, PreTrainedModel
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from typing import Optional
from omegaconf import OmegaConf

from .components import (
    _get_norm_fn,
    _get_nonlin_fn,
    EmbeddingComponent,
    GLU,
    get_causal_attention_mask,
    _init_module,
    NormalizedResidualConnection,
)
from .attention import get_attention_mechanism


class crammedDepthRecurrentConfig(PretrainedConfig):
    """Hugging Face config wrapper that carries the architecture settings as a plain dict.

    The settings are stored on ``self.arch``.
    """

    model_type = "crammedDepthRecurrent"

    def __init__(self, cfg_arch_container: Optional[dict] = None, **kwargs):
        """
        Args:
            cfg_arch_container: Architecture settings as a plain container; defaults to an
                empty dict.
            **kwargs: Forwarded to ``PretrainedConfig``.
        """
        # A fresh dict per instance: a shared mutable default would leak edits made through
        # one config's ``arch`` into every other config created without arguments.
        self.arch = {} if cfg_arch_container is None else cfg_arch_container
        super().__init__(**kwargs)


def construct_crammed_recurrent(cfg_arch, vocab_size, equals_token):
    """Build a depth-recurrent model from an architecture config.

    The vocabulary size is written into ``cfg_arch.embedding`` (the passed config is mutated)
    before the config is resolved into a plain container and wrapped in a
    ``crammedDepthRecurrentConfig``.

    Args:
        cfg_arch: OmegaConf architecture config. See the config file for the available options.
        vocab_size: Size of the tokenizer vocabulary.
        equals_token: Token id of "="; only handed to the BPTT variant.

    Returns:
        ``ScriptableRecurrentLMForPreTraining`` for the "fixed"/"albert" layouts,
        ``ScriptableRecurrentLMBPTT`` for the "TBPTT"/"deepthinking" layouts.

    Raises:
        ValueError: If ``objective_layout`` is none of the values above.
    """
    cfg_arch.embedding.vocab_size = vocab_size
    arch_container = OmegaConf.to_container(cfg_arch, resolve=True)
    config = crammedDepthRecurrentConfig(arch_container)

    layout = config.arch["objective_layout"]
    if layout in ("fixed", "albert"):
        return ScriptableRecurrentLMForPreTraining(config)
    if layout in ("TBPTT", "deepthinking"):
        return ScriptableRecurrentLMBPTT(config, equals_token)
    raise ValueError(f"Invalid layout {config.arch['objective_layout']} of training objective given.")


class FFNComponent(torch.nn.Module):
    """Feed-forward sub-layer: linear -> nonlinearity -> (optional norm) -> linear.

    Note: ``intermed_size`` is not auto-scaled for GLU-type activations. With a ``GLU``
    nonlinearity the second linear layer is built for ``intermed_size // 2`` inputs, so pick
    an ``intermed_size`` that divides nicely.

    The neox suggestion for approx. equal parameter count is int(4 * 2 / 3 * hidden_size) * 2 [this is ~5.33]

    Args:
        hidden_size: Input width.
        intermed_size: Output width of the first linear layer.
        cfg_arch: Architecture config; reads ``use_bias``, ``nonlin``, ``sub_normalization``,
            ``norm`` and ``norm_eps``.
        output_size: Output width; falls back to ``hidden_size`` when ``None``.
    """

    def __init__(self, hidden_size, intermed_size, cfg_arch, output_size=None):
        super().__init__()
        with_bias = cfg_arch.use_bias
        self.dense_in = torch.nn.Linear(hidden_size, intermed_size, bias=with_bias)
        self.nonlin = _get_nonlin_fn(cfg_arch.nonlin)()

        # Width seen by everything after the activation.
        inner_width = intermed_size // 2 if isinstance(self.nonlin, GLU) else intermed_size

        if not cfg_arch.sub_normalization:
            self.norm = torch.nn.Identity()
        else:
            self.norm = _get_norm_fn(cfg_arch.norm)(inner_width, eps=cfg_arch.norm_eps)

        if output_size is None:
            output_size = hidden_size
        self.dense_out = torch.nn.Linear(inner_width, output_size, bias=with_bias)

    def forward(self, hidden_states):
        """Apply the feed-forward stack to ``hidden_states`` and return the result."""
        activated = self.nonlin(self.dense_in(hidden_states))
        return self.dense_out(self.norm(activated))


class TransformerLayer(torch.nn.Module):
    """One transformer layer: attention followed by a feed-forward network.

    Both sub-layers are applied through a ``NormalizedResidualConnection``.

    Args:
        idx: Layer index, forwarded to ``get_attention_mechanism``.
        cfg_arch: Architecture config (``hidden_size``, ``intermed_size``, ``norm``,
            ``norm_eps`` and the ``attention`` sub-config are read here).
    """

    def __init__(self, idx, cfg_arch):
        super().__init__()
        width = cfg_arch.hidden_size
        self.residual1 = NormalizedResidualConnection(width, cfg_arch)
        self.residual2 = NormalizedResidualConnection(width, cfg_arch)

        def make_sub_norm():
            # Factory handed to the attention mechanism when sub-normalization is requested.
            return _get_norm_fn(cfg_arch.norm)(cfg_arch.hidden_size, eps=cfg_arch.norm_eps)

        sub_norm_fn = make_sub_norm if cfg_arch.attention.sub_normalization else torch.nn.Identity
        self.attn = get_attention_mechanism(idx, width, cfg_arch.attention, sub_norm_fn)
        self.ffn = FFNComponent(width, cfg_arch.intermed_size, cfg_arch)
        # Tensor layout is dictated by the attention implementation.
        self.LAYOUT = self.attn.LAYOUT

    def forward(self, states, attention_mask: Optional[torch.Tensor] = None):
        """Run attention, then the FFN, each wrapped in its residual connection."""
        attended = self.residual1(states, self.attn, states, attention_mask)
        return self.residual2(attended, self.ffn, attended)


class TransformerBlock(torch.nn.Module):
    """A stack of transformer layers (no weight sharing) with optional input injection.

    Before the layers run, ``injected_state`` is merged into ``states`` according to
    ``cfg_arch.input_injection_type``:

    * ``"none"`` - states are used as they are,
    * ``"add"`` - element-wise sum (noted in the original code as the config default),
    * ``"linear"`` - concatenate on the last dim, then a bias-free linear map back to ``hidden_size``,
    * ``"ffn"`` - concatenate on the last dim, then an ``FFNComponent`` back to ``hidden_size``.

    Any other value leaves ``states`` untouched, exactly like ``"none"``.
    """

    def __init__(self, layers, cfg_arch):
        super().__init__()
        self.layers = torch.nn.ModuleList(layers)
        if len(self.layers) > 0:
            # The layout of the first layer decides whether tensors are sequence-first.
            self.seq_first = self.layers[0].LAYOUT == "[S B H]"
        else:
            self.seq_first = False

        self.injection_type = cfg_arch.input_injection_type
        doubled_width = cfg_arch.hidden_size * 2
        if self.injection_type == "linear":
            self.adapter = torch.nn.Linear(doubled_width, cfg_arch.hidden_size, bias=False)
        elif self.injection_type == "ffn":
            self.ffn = FFNComponent(doubled_width, cfg_arch.intermed_size, cfg_arch, cfg_arch.hidden_size)

    def forward(self, states, injected_state, attention_mask: Optional[torch.Tensor] = None):
        """Merge ``injected_state`` into ``states`` and pass the result through every layer."""
        mode = self.injection_type
        if mode == "add":  # this is the default in the config
            states = states + injected_state
        elif mode in ("linear", "ffn"):
            merged = torch.cat([states, injected_state], dim=-1)
            states = self.adapter(merged) if mode == "linear" else self.ffn(merged)
        # "none" (and anything unrecognised) falls through with states unchanged.

        for layer in self.layers:
            states = layer(states, attention_mask)
        return states


class TransposedAdapter(torch.nn.Linear):  # steal init
    """Bias-free linear map that applies the *transpose* of an adapter weight.

    The weight is stored with shape ``[hidden_size, embedding_dim]`` (the shape of a
    ``Linear(embedding_dim, hidden_size)`` weight) and applied transposed, so the module maps
    ``hidden_size`` features back to ``embedding_dim`` features.

    Args:
        embedding_dim: Output width of this module.
        hidden_size: Input width of this module.
        original_adapter: Module whose ``weight`` is reused when ``tie_weights`` is true.
        tie_weights: Share ``original_adapter.weight`` instead of creating a new parameter.
    """

    def __init__(self, embedding_dim, hidden_size, original_adapter, tie_weights=True):
        # Deliberately skip Linear.__init__ so no weight is allocated that would be replaced anyway.
        torch.nn.Module.__init__(self)
        # Linear.__init__ normally records these; Linear.extra_repr reads them, so without them
        # repr()/print() of any model containing this module raises AttributeError.
        self.in_features = hidden_size
        self.out_features = embedding_dim
        # self.adapter.weight = self.encoder.adapter.weight.T # this would be nice but cannot assign like this
        if tie_weights:
            self.weight = original_adapter.weight
        else:
            self.adapter_active = False
            self.weight = torch.nn.Parameter(torch.randn([hidden_size, embedding_dim]))  # transposed
        self.register_parameter("bias", None)
        # NOTE: with tied weights this re-initialises the tensor shared with original_adapter.
        self.reset_parameters()

    def forward(self, inputs):
        """Return ``inputs`` projected with the transposed weight (no bias)."""
        return torch.nn.functional.linear(inputs, self.weight.T)


class ScriptableRecurrentLM(PreTrainedModel):
    """Depth-recurrent model body. Trying to include most reasonable variations of this concept.

    Embedded tokens are passed through an adapter to the hidden width, then one shared
    ``TransformerBlock`` is applied repeatedly; the adapted embedding is handed to the block
    again at every repetition as ``injected_state``. The result goes through a head and an
    optional final norm. No vocabulary projection happens here.
    """

    config_class = crammedDepthRecurrentConfig

    def __init__(self, config):
        super().__init__(config)
        self.cfg = OmegaConf.create(config.arch)
        cfg = self.cfg

        self.embedding = EmbeddingComponent(cfg.embedding, cfg.norm, cfg.norm_eps)
        # A projection is only needed when embedding and hidden widths differ.
        if cfg.embedding.embedding_dim == cfg.hidden_size:
            self.adapter = torch.nn.Identity()
        else:
            self.adapter = torch.nn.Linear(cfg.embedding.embedding_dim, cfg.hidden_size, bias=False)
        self.state_init = cfg.state_init

        shared_layers = [TransformerLayer(idx, cfg) for idx in range(cfg.layers_in_recurrent_block)]
        self.recurrent_block = torch.compile(
            TransformerBlock(shared_layers, cfg),
            mode="default",
            disable=not cfg.local_compilation,
        )
        self.seq_first = self.recurrent_block.seq_first

        head_kind = cfg.head
        if head_kind == "ffn":
            self.head = FFNComponent(cfg.hidden_size, cfg.intermed_size, cfg)
        elif head_kind == "linear":
            self.head = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=cfg.use_bias)
        elif head_kind == "identity":
            self.head = torch.nn.Identity()
        else:
            raise ValueError(f"Invalid head layout {self.cfg.head} given.")

        if not cfg.final_norm:
            self.final_norm = torch.nn.Identity()
        else:
            self.final_norm = _get_norm_fn(cfg.norm)(cfg.hidden_size, eps=cfg.norm_eps)
        # Empty placeholder; the real mask is built lazily in forward().
        self.register_buffer("attention_mask", torch.ones([0, 0, 0, 0], dtype=torch.bool), persistent=False)

    def forward(self, input_ids: torch.Tensor, num_steps_no_grad: int = None, num_steps_with_grad: int = None):
        """Run the recurrence and return the final (head + norm) hidden states.

        Args:
            input_ids: Token ids; dimension 1 is compared against the cached attention mask.
            num_steps_no_grad: Repetitions executed under ``torch.no_grad()`` first
                (``None`` means 0).
            num_steps_with_grad: Repetitions executed afterwards with gradients
                (``None`` means ``cfg.maximal_recurrence``).
        """
        # Rebuild the causal mask whenever dimension 1 of the input changes.
        if input_ids.shape[1] != self.attention_mask.shape[1]:
            self.attention_mask = get_causal_attention_mask(input_ids)

        hidden_states = self.adapter(self.embedding(input_ids))
        if self.seq_first:
            hidden_states = hidden_states.transpose(0, 1).contiguous()
        injected_state = hidden_states.clone()

        warmup_steps = num_steps_no_grad if num_steps_no_grad is not None else 0
        tracked_steps = num_steps_with_grad if num_steps_with_grad is not None else self.cfg.maximal_recurrence

        hidden_states = self.initialize_state(hidden_states)

        # Recur without gradients
        with torch.no_grad():
            for _ in range(warmup_steps):
                hidden_states = self.recurrent_block(hidden_states, injected_state, self.attention_mask).clone()

        # Recur with gradients
        for _ in range(tracked_steps):
            hidden_states = self.recurrent_block(hidden_states, injected_state, self.attention_mask).clone()

        return self.final_norm(self.head(hidden_states))

    def initialize_state(self, hidden_states):
        """Return the starting state of the recurrence.

        Without ``cfg.initial_hidden_randomized`` the input is returned unchanged. Otherwise a
        tensor of the same shape is produced according to ``self.state_init``; an unknown
        ``state_init`` value also returns the input unchanged.
        """
        if not self.cfg.initial_hidden_randomized:
            return hidden_states

        mode = self.state_init
        if mode == "zero":
            return torch.zeros_like(hidden_states)
        if mode == "normal":
            return torch.randn_like(hidden_states)
        if mode == "embed":  # initialized like a BERT embedding
            return torch.randn_like(hidden_states).mul(0.02)
        if mode == "unit":
            # Gaussian noise standardised along the last dimension.
            noise = torch.randn_like(hidden_states)
            std, mean = torch.std_mean(noise, dim=-1, keepdim=True)
            return (noise - mean) / std
        return hidden_states


class ScriptableRecurrentLMReplicaConcat(PreTrainedModel):
    """Depth-recurrent model with skips inside the block.

    This is nearly the same as ScriptableRecurrentLM, but instead of one block holding all
    layers it keeps ``cfg.layers_in_recurrent_block`` single-layer blocks, and every one of
    them receives the injected input state.
    """

    config_class = crammedDepthRecurrentConfig

    def __init__(self, config):
        super().__init__(config)
        self.cfg = OmegaConf.create(config.arch)
        cfg = self.cfg

        self.embedding = EmbeddingComponent(cfg.embedding, cfg.norm, cfg.norm_eps)
        # A projection is only needed when embedding and hidden widths differ.
        if cfg.embedding.embedding_dim == cfg.hidden_size:
            self.adapter = torch.nn.Identity()
        else:
            self.adapter = torch.nn.Linear(cfg.embedding.embedding_dim, cfg.hidden_size, bias=False)
        self.state_init = cfg.state_init

        self.max_recurs = cfg.layers_in_recurrent_block
        print("Initializing feedforward blocks with recall connections")
        # One independently-parameterised single-layer block per position.
        single_layer_blocks = [
            torch.compile(
                TransformerBlock([TransformerLayer(1, cfg)], cfg),
                mode="default",
                disable=not cfg.local_compilation,
            )
            for _ in range(self.max_recurs)
        ]
        self.recurrent_blocks = torch.nn.ModuleList(single_layer_blocks)
        print(f"Initialized feedforward blocks with recall connections. "
              f"It has the depth of {self.max_recurs}")

        self.seq_first = self.recurrent_blocks[0].seq_first

        head_kind = cfg.head
        if head_kind == "ffn":
            self.head = FFNComponent(cfg.hidden_size, cfg.intermed_size, cfg)
        elif head_kind == "linear":
            self.head = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=cfg.use_bias)
        elif head_kind == "identity":
            self.head = torch.nn.Identity()
        else:
            raise ValueError(f"Invalid head layout {self.cfg.head} given.")

        if not cfg.final_norm:
            self.final_norm = torch.nn.Identity()
        else:
            self.final_norm = _get_norm_fn(cfg.norm)(cfg.hidden_size, eps=cfg.norm_eps)
        # Empty placeholder; the real mask is built lazily in forward().
        self.register_buffer("attention_mask", torch.ones([0, 0, 0, 0], dtype=torch.bool), persistent=False)

    def apply_recurrent_block(self, hidden_states, injected_state, attention_mask):
        """Apply all blocks once, in order, each with the same ``injected_state``."""
        state = hidden_states
        for block in self.recurrent_blocks:
            state = block(state, injected_state, attention_mask)
        return state

    def forward(self, input_ids: torch.Tensor, num_steps_no_grad: int = None, num_steps_with_grad: int = None):
        """Run the recurrence and return the final (head + norm) hidden states.

        Args:
            input_ids: Token ids; dimension 1 is compared against the cached attention mask.
            num_steps_no_grad: Passes over all blocks executed under ``torch.no_grad()`` first
                (``None`` means 0).
            num_steps_with_grad: Passes executed afterwards with gradients
                (``None`` means ``cfg.maximal_recurrence``).
        """
        # Rebuild the causal mask whenever dimension 1 of the input changes.
        if input_ids.shape[1] != self.attention_mask.shape[1]:
            self.attention_mask = get_causal_attention_mask(input_ids)

        hidden_states = self.adapter(self.embedding(input_ids))
        if self.seq_first:
            hidden_states = hidden_states.transpose(0, 1).contiguous()
        injected_state = hidden_states.clone()

        warmup_steps = num_steps_no_grad if num_steps_no_grad is not None else 0
        tracked_steps = num_steps_with_grad if num_steps_with_grad is not None else self.cfg.maximal_recurrence

        hidden_states = self.initialize_state(hidden_states)

        # Recur without gradients
        with torch.no_grad():
            for _ in range(warmup_steps):
                hidden_states = self.apply_recurrent_block(hidden_states, injected_state, self.attention_mask).clone()

        # Recur with gradients
        for _ in range(tracked_steps):
            hidden_states = self.apply_recurrent_block(hidden_states, injected_state, self.attention_mask).clone()

        return self.final_norm(self.head(hidden_states))

    def initialize_state(self, hidden_states):
        """Return the starting state of the recurrence.

        Without ``cfg.initial_hidden_randomized`` the input is returned unchanged. Otherwise a
        tensor of the same shape is produced according to ``self.state_init``; an unknown
        ``state_init`` value also returns the input unchanged.
        """
        if not self.cfg.initial_hidden_randomized:
            return hidden_states

        mode = self.state_init
        if mode == "zero":
            return torch.zeros_like(hidden_states)
        if mode == "normal":
            return torch.randn_like(hidden_states)
        if mode == "embed":  # initialized like a BERT embedding
            return torch.randn_like(hidden_states).mul(0.02)
        if mode == "unit":
            # Gaussian noise standardised along the last dimension.
            noise = torch.randn_like(hidden_states)
            std, mean = torch.std_mean(noise, dim=-1, keepdim=True)
            return (noise - mean) / std
        return hidden_states


"""Generator fn for these models."""
def _last_position_logits(model, hidden_states):
    """Map recurrent hidden states to vocabulary logits for the last sequence position."""
    output_states = model.encoder.final_norm(model.encoder.head(hidden_states.clone()))
    logits = model.decoder(model.adapter(output_states))
    return logits[-1, :, :] if model.encoder.seq_first else logits[:, -1, :]


def _select_token(logits, temperature, greedy, return_probs=False):
    """Pick the next token from ``logits``; optionally also return the probabilities.

    Greedy decoding returns the argmax with a leading singleton dimension; sampling returns
    the ``torch.multinomial`` draw with a trailing singleton dimension.
    """
    if greedy:
        token = torch.argmax(logits, dim=1).unsqueeze(dim=0)
        probs = torch.softmax(logits, dim=-1) if return_probs else None
    else:
        probs = torch.softmax(logits * temperature, dim=-1)
        token = torch.multinomial(probs, 1)
    return token, probs


@torch.no_grad()
def _generate(self, input_ids, token_limit=100, temperature=1.0, steps_at_generation_time=None, track_steps=False, greedy=False, quick=False, **kwargs):
    """Generate token_limit many tokens from input_ids prompt.

    Every new token is produced by re-running the full recurrence on the prompt plus all
    tokens generated so far.

    Args:
        self: Model wrapper exposing ``cfg``, ``encoder``, ``adapter`` and ``decoder``.
        input_ids: Prompt token ids.
        token_limit: Number of tokens to generate.
        temperature: Factor the logits are multiplied with before the softmax when sampling.
            NOTE(review): this multiplies rather than divides, so values below 1 flatten the
            distribution - confirm this convention is intended.
        steps_at_generation_time: Recurrence steps per token; ``None`` uses
            ``cfg.maximal_recurrence_in_eval``.
        track_steps: Also decode a token after every recurrence step (for making thinking
            plots). The per-step probabilities are written into a
            ``[token_limit, num_steps, vocab_size]`` tensor, one row per step.
        greedy: Use argmax instead of sampling.
        quick: Transpose the predicted token before appending it to ``input_ids`` and stack
            (instead of concatenate) the predictions at the end.
            # presumably meant for batched greedy decoding - verify against callers
        **kwargs: Accepted and ignored.

    Returns:
        ``generated_ids``, or ``(generated_ids, tracking, logit_tensor)`` when ``track_steps``
        is set. ``tracking`` is a [num generated tokens, num recurrences] list of lists of
        tensors of which each tensor is a token id.
    """
    encoder = self.encoder
    num_steps = self.cfg.maximal_recurrence_in_eval if steps_at_generation_time is None else steps_at_generation_time

    predicted_ids = []
    tracking = []
    # Only needed (and only returned) when intermediate steps are tracked.
    logit_tensor = torch.zeros(token_limit, num_steps, self.cfg.embedding.vocab_size) if track_steps else None

    if hasattr(encoder, "recurrent_blocks"):
        blocks = list(encoder.recurrent_blocks)
    else:
        # Use the module underneath the torch.compile wrapper. When compilation is disabled
        # there may be no wrapper (and no ``_orig_mod``), so fall back to the block itself.
        blocks = [getattr(encoder.recurrent_block, "_orig_mod", encoder.recurrent_block)]

    # The whole function already runs under the @torch.no_grad() decorator.
    for gen_idx in range(token_limit):
        if input_ids.shape[1] != encoder.attention_mask.shape[1]:
            encoder.attention_mask = get_causal_attention_mask(input_ids)
        hidden_states = encoder.adapter(encoder.embedding(input_ids))
        if encoder.seq_first:
            hidden_states = hidden_states.transpose(0, 1).contiguous()
        injected_state = hidden_states
        hidden_states = encoder.initialize_state(hidden_states)

        step = []
        for repeat in range(num_steps):
            for block in blocks:
                hidden_states = block(hidden_states, injected_state, encoder.attention_mask)
            if track_steps:
                # keep track of the intermediate probs
                logits = _last_position_logits(self, hidden_states)
                step_token, probs = _select_token(logits, temperature, greedy, return_probs=True)
                logit_tensor[gen_idx, repeat, :] = probs
                step.append(step_token)

        if track_steps:
            # The prediction after the final recurrence step is the generated token.
            predicted_token = step[-1]
        else:
            # calculate the probs if we haven't already
            logits = _last_position_logits(self, hidden_states)
            predicted_token, _ = _select_token(logits, temperature, greedy)

        if quick:
            input_ids = torch.cat((input_ids, torch.transpose(predicted_token, 0, 1)), dim=1)
        else:
            input_ids = torch.cat([input_ids, predicted_token], dim=-1)
        predicted_ids.append(predicted_token)
        tracking.append(step)

    if quick:
        generated_ids = torch.stack(predicted_ids, dim=1).squeeze()
    else:
        generated_ids = torch.cat(predicted_ids, dim=-1)

    if track_steps:
        return generated_ids, tracking, logit_tensor
    return generated_ids


class ScriptableRecurrentLMForPreTraining(PreTrainedModel):
    """Pretraining version: recurrent encoder plus vocabulary decoder, trained on next-token loss.

    Every forward pass runs ``cfg.maximal_recurrence`` recurrence steps, all with gradients.
    """

    config_class = crammedDepthRecurrentConfig

    def __init__(self, config):
        super().__init__(config)
        self.cfg = OmegaConf.create(config.arch)
        embed_cfg = self.cfg.embedding

        self.encoder = ScriptableRecurrentLM(config)
        # Map hidden states back to the embedding width only when the two widths differ.
        if embed_cfg.embedding_dim == self.cfg.hidden_size:
            self.adapter = torch.nn.Identity()
        else:
            self.adapter = TransposedAdapter(
                embed_cfg.embedding_dim, self.cfg.hidden_size, self.encoder.adapter, self.cfg.tie_weights
            )
        self.decoder = torch.nn.Linear(embed_cfg.embedding_dim, embed_cfg.vocab_size, bias=self.cfg.decoder_bias)
        if self.cfg.tie_weights:
            # Share the output projection with the input word embedding.
            self.decoder.weight = self.encoder.embedding.word_embedding.weight

        # Labels equal to -100 are skipped; the default reduction averages over the remaining ones.
        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)

        self._init_weights()

    def _init_weights(self, module=None):
        """Initialise ``module``, or every submodule of this model when ``module`` is ``None``."""
        targets = [module] if module is not None else self.modules()
        effective_depth = self.cfg.layers_in_recurrent_block * self.cfg.maximal_recurrence
        for target in targets:
            _init_module(
                target,
                self.cfg.init.type,
                self.cfg.init.std,
                self.cfg.hidden_size,
                effective_depth,
            )

    def forward(self, input_ids: torch.Tensor, *args, **kwargs):
        """Compute the next-token loss for ``input_ids``.

        Returns:
            dict with ``loss``, ``logits`` (detached outputs at the last sequence position)
            and ``log_perplexity`` (a detached copy of the loss).
        """
        encoded = self.encoder(input_ids, num_steps_no_grad=0, num_steps_with_grad=self.cfg.maximal_recurrence)
        outputs = self.decoder(self.adapter(encoded))

        # Position t predicts token t + 1: drop the last prediction and the first label.
        if not self.encoder.seq_first:
            shifted_outputs = outputs[..., :-1, :].contiguous()
            shifted_labels = input_ids[..., 1:].contiguous()
            outputs = outputs.detach()
        else:
            shifted_outputs = outputs[:-1]
            shifted_labels = input_ids.transpose(0, 1)[1:].contiguous()
            outputs = outputs.detach().transpose(0, 1)

        # Flatten the tokens and compute loss
        vocab_dim = shifted_outputs.shape[-1]
        loss = self.loss_fn(shifted_outputs.view(-1, vocab_dim), shifted_labels.view(-1))

        return {"loss": loss, "logits": outputs[:, -1, :], "log_perplexity": loss.clone().detach()}

    def _generate(self, input_ids, token_limit=100, temperature=0.7, steps_at_generation_time=None):
        """Delegate to the module-level ``_generate`` with this model as ``self``."""
        return _generate(self, input_ids, token_limit, temperature, steps_at_generation_time)


class ScriptableRecurrentLMBPTT(PreTrainedModel):
    """Pretraining version with stochastic depth / trunc. BPTT.

    During training the loss is a mix, weighted by ``cfg.alpha``, of

    * a "max iters" pass using ``max_recurrences_for_training`` steps, of which the last
      ``max_backprop`` carry gradients, and
    * a "progressive" pass with a random number of no-grad steps ``n`` followed by a random
      number of gradient steps ``k``.

    In eval mode ``cfg.maximal_recurrence_in_eval`` steps are run, all without gradients.

    Args:
        config: ``crammedDepthRecurrentConfig`` carrying the architecture container.
        equals_token: Token id of "="; used for loss masking when ``cfg.mask_before_equals`` is set.
    """

    config_class = crammedDepthRecurrentConfig

    def __init__(self, config, equals_token):
        super().__init__(config)
        self.cfg = OmegaConf.create(config.arch)
        self.equals_token = equals_token

        self.max_recurrences_for_training = self.cfg.maximal_recurrence
        self.max_backprop = max(self.cfg.maximal_recurrence // 2 if self.cfg.max_backprop is None else self.cfg.max_backprop, 1)

        # The key may be absent from the config, so read it with a default instead of wrapping
        # the whole construction in a bare ``except`` that would also hide genuine errors
        # raised while building the encoder.
        self.forward_only_model_with_skip = self.cfg.get("forward_only_model_with_skip", False)
        if self.forward_only_model_with_skip:
            print("Using forward only model with skip")
            self.encoder = ScriptableRecurrentLMReplicaConcat(config)
        else:
            self.encoder = ScriptableRecurrentLM(config)

        self.adapter = TransposedAdapter(self.cfg.embedding.embedding_dim, self.cfg.hidden_size, self.encoder.adapter, self.cfg.tie_weights)
        self.decoder = torch.nn.Linear(self.cfg.embedding.embedding_dim, self.cfg.embedding.vocab_size, bias=self.cfg.decoder_bias)
        if self.cfg.tie_weights:
            # Share the output projection with the input word embedding.
            self.decoder.weight = self.encoder.embedding.word_embedding.weight

        self.throttle = self.cfg.throttle
        self.alpha = self.cfg.alpha
        # Labels equal to -100 are skipped by the loss.
        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction=self.cfg.loss_reduction)
        self._init_weights()

        self.mask_before_equals = self.cfg.mask_before_equals
        self.model_call = self.prog_model_call_with_masking  # moved the logic for masking before equals into this function

    def _init_weights(self, module=None):
        """Initialise ``module``, or every submodule of this model when ``module`` is ``None``."""
        modules = self.modules() if module is None else [module]
        for module in modules:
            _init_module(
                module,
                self.cfg.init.type,
                self.cfg.init.std,
                self.cfg.hidden_size,
                self.cfg.layers_in_recurrent_block * self.cfg.maximal_recurrence,
            )

    def set_max_recurrences_for_training(self, new_max):
        """Can play around with recurrences during training.

        Also recomputes ``max_backprop``: ``cfg.max_backprop`` when set, otherwise half of
        ``new_max``, and never less than 1.
        """
        self.max_recurrences_for_training = new_max
        self.max_backprop = max(self.max_recurrences_for_training // 2 if self.cfg.max_backprop is None else self.cfg.max_backprop, 1)

    def forward(self, input_ids: torch.Tensor, *args, **kwargs):
        """Compute the loss for ``input_ids``.

        WARNING: max iters outputs is used for logits and entropy calcs

        Returns:
            dict with ``loss``, ``logits`` (detached outputs at the last sequence position)
            and ``log_perplexity`` (a detached copy of the loss).
        """
        if self.training:
            loss, outputs = self.forward_progressive(input_ids)
            if self.throttle:
                # Rescale the loss by Ek / max_backprop.
                Ek = 1 + min(self.max_recurrences_for_training / 4, self.max_backprop / 2)
                loss = loss * (Ek / self.max_backprop)
        else:
            loss, outputs = self.model_call(input_ids, n=self.cfg.maximal_recurrence_in_eval, k=0)

        return {"loss": loss, "logits": outputs[:, -1, :], "log_perplexity": loss.clone().detach()}

    def forward_progressive(self, input_ids):
        """Implements progressive loss.

        Returns:
            ``(loss, outputs)`` with ``loss = (1 - alpha) * loss_max_iters + alpha * loss_progressive``.
            ``outputs`` come from the max-iters pass, or from the progressive pass when
            ``alpha == 1`` (the max-iters pass is skipped in that case).
        """
        if self.alpha != 1:
            # max iters forward pass
            n = self.max_recurrences_for_training - self.max_backprop
            k = self.max_backprop  # i.e. maximise the number of layers we back prop through
            loss_max_iters, outputs_max_iters = self.model_call(input_ids, n=n, k=k)
        else:
            # ``input_ids.device`` is valid on CPU too; ``get_device()`` returns -1 there,
            # which ``.to()`` rejects.
            loss_max_iters = torch.zeros(1, dtype=torch.float32, device=input_ids.device)

        if self.alpha != 0:
            # stochastic forward pass; the draws are converted to plain ints for range()
            n = int(torch.randint(low=0, high=self.max_recurrences_for_training, size=(1,)))
            k = int(torch.randint(low=1, high=1 + min(self.max_recurrences_for_training - n, self.max_backprop), size=(1,)))
            loss_progressive, outputs_progressive = self.model_call(input_ids, n=n, k=k)
            if self.alpha == 1:
                outputs_max_iters = outputs_progressive
        else:
            loss_progressive = torch.zeros(1, dtype=torch.float32, device=input_ids.device)

        loss = (1 - self.alpha) * loss_max_iters + self.alpha * loss_progressive
        # Returning outputs max_iters to be used for logits, could try outputs_progressive
        return loss, outputs_max_iters

    def prog_model_call_with_masking(self, input_ids, n, k):
        """Run ``n`` no-grad and ``k`` gradient recurrence steps and compute the masked loss.

        With ``mask_before_equals`` only positions strictly after the "=" token contribute to
        the loss; otherwise only tokens with id 0 are masked. In both cases every label that
        ends up as 0 is replaced by -100 and therefore ignored.

        Returns:
            ``(loss, outputs)`` where ``outputs`` are the detached decoder outputs.
        """
        if self.mask_before_equals:  # mask before equals
            # Column index of each "=" found.
            # NOTE(review): the broadcast below assumes exactly one "=" per row - confirm.
            indices_of_equals = (input_ids == self.equals_token).nonzero()[:, 1]
            max_indices = torch.arange(input_ids.size(1), device=input_ids.device)  # tensor for mask
            masks = max_indices.unsqueeze(0) > indices_of_equals.unsqueeze(1)  # True strictly after the = sign of each row
        else:  # mask only the random padding
            masks = input_ids != 0

        outputs = self.decoder(self.adapter(self.encoder(input_ids, num_steps_no_grad=n, num_steps_with_grad=k)))

        # Position t predicts token t + 1: drop the last prediction and the first label.
        if self.encoder.seq_first:
            shifted_outputs = outputs[:-1]
            shifted_labels = input_ids.transpose(0, 1)[1:].contiguous()
            outputs = outputs.detach().transpose(0, 1)
            masked = torch.mul(shifted_labels, masks[..., 1:].transpose(0, 1))
        else:
            shifted_outputs = outputs[..., :-1, :].contiguous()
            shifted_labels = input_ids[..., 1:].contiguous()
            outputs = outputs.detach()
            masked = torch.mul(shifted_labels, masks[..., 1:])
        masked[masked == 0] = -100  # mask all 0's in loss

        shifted_outputs_shape = shifted_outputs.shape

        loss = self.loss_fn(shifted_outputs.view(-1, shifted_outputs.shape[-1]), masked.view(-1))  # CE_Loss(Input, Target)
        if self.cfg.loss_reduction == 'none':  # giving all output samples equal weighting
            # Un-flatten, average over dim 1, then over the remaining dim.
            loss = loss.view(shifted_outputs_shape[0], shifted_outputs_shape[1])
            loss = torch.mean(loss, dim=1)
            loss = torch.mean(loss)
        return loss, outputs

    def _generate(self, input_ids, token_limit=100, temperature=1.0, steps_at_generation_time=None, track_steps=False, greedy=False, quick=False):
        """Delegate to the module-level ``_generate`` with this model as ``self``."""
        return _generate(self, input_ids, token_limit, temperature, steps_at_generation_time, track_steps, greedy=greedy, quick=quick)


# ###### HF registry here? ############### #
# Register the config and model classes of this module with the Hugging Face Auto* factories
# under the "crammedDepthRecurrent" model type. This runs as a side effect of importing the module.
# Note that ScriptableRecurrentLMBPTT is not registered here.

AutoConfig.register("crammedDepthRecurrent", crammedDepthRecurrentConfig)
AutoModel.register(crammedDepthRecurrentConfig, ScriptableRecurrentLM)
AutoModelForCausalLM.register(crammedDepthRecurrentConfig, ScriptableRecurrentLMForPreTraining)


================================================
FILE: cramming/architectures/crammed_transformer.py
================================================
"""Base file for modifications of the transformer architecture"""
import torch
from transformers import PretrainedConfig, PreTrainedModel
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from typing import Optional
from omegaconf import OmegaConf

from .components import (
    _get_norm_fn,
    _get_nonlin_fn,
    NormalizedResidualConnection,
    EmbeddingComponent,
    GLU,
    get_causal_attention_mask,
    _init_module,
)
from .attention import get_attention_mechanism


class crammedTransformerConfig(PretrainedConfig):
    """Hugging Face config wrapper for the crammed transformer architecture.

    The whole architecture description is stored as a plain container under ``self.arch``;
    the model classes in this module rebuild an OmegaConf object from it.

    Args:
        cfg_arch_container: Architecture settings as a plain ``dict``. ``None`` (the default)
            yields a fresh empty dict.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "crammedTransformer"

    def __init__(self, cfg_arch_container: Optional[dict] = None, **kwargs):
        # A literal ``{}`` default would be a single dict object shared by every config that
        # is built without arguments, so mutations would leak between instances.
        self.arch = {} if cfg_arch_container is None else cfg_arch_container
        super().__init__(**kwargs)


def construct_crammed_transformer(cfg_arch, vocab_size):
    """Build a ``ScriptableLMForPreTraining`` from an architecture config.

    The vocabulary size is written into ``cfg_arch.embedding`` (the passed config is mutated)
    before the config is resolved into a plain container. See the config file for details on
    what is possible.

    Args:
        cfg_arch: OmegaConf architecture config.
        vocab_size: Size of the tokenizer vocabulary.

    Returns:
        The freshly constructed model.
    """
    cfg_arch.embedding.vocab_size = vocab_size
    arch_container = OmegaConf.to_container(cfg_arch, resolve=True)
    return ScriptableLMForPreTraining(crammedTransformerConfig(arch_container))


class FFNComponent(torch.nn.Module):
    """Position-wise feed-forward network: ``dense_in`` -> activation -> norm -> ``dense_out``.

    Note: The FF layer is not auto-scaled when using a GLU type activation. For a ``GLU``
    activation ``dense_out`` is sized for ``intermed_size // 2`` inputs, so better choose a
    sensible ``intermed_size`` manually that is nicely divisible.

    The neox suggestion for approx. equal parameter count is int(4 * 2 / 3 * hidden_size) * 2 [this is ~5.33]

    Args:
        hidden_size: Width of the incoming hidden states.
        intermed_size: Width produced by ``dense_in``.
        cfg_arch: Architecture config supplying ``use_bias``, ``nonlin``,
            ``sub_normalization``, ``norm`` and ``norm_eps``.
        output_size: Width produced by ``dense_out``; ``None`` means ``hidden_size``.
    """

    def __init__(self, hidden_size, intermed_size, cfg_arch, output_size=None):
        super().__init__()
        self.dense_in = torch.nn.Linear(hidden_size, intermed_size, bias=cfg_arch.use_bias)
        self.nonlin = _get_nonlin_fn(cfg_arch.nonlin)()

        post_activation_size = intermed_size
        if isinstance(self.nonlin, GLU):
            post_activation_size = intermed_size // 2

        self.norm = (
            _get_norm_fn(cfg_arch.norm)(post_activation_size, eps=cfg_arch.norm_eps)
            if cfg_arch.sub_normalization
            else torch.nn.Identity()
        )

        final_size = output_size if output_size is not None else hidden_size
        self.dense_out = torch.nn.Linear(post_activation_size, final_size, bias=cfg_arch.use_bias)

    def forward(self, hidden_states):
        """Return the feed-forward transformation of ``hidden_states``."""
        expanded = self.dense_in(hidden_states)
        gated = self.nonlin(expanded)
        normalized = self.norm(gated)
        return self.dense_out(normalized)


class TransformerLayer(torch.nn.Module):
    """A transformer structure based on the components from above.

    Attention and feed-forward sub-layers are each applied through a
    ``NormalizedResidualConnection``.

    Args:
        idx: Layer index, forwarded to ``get_attention_mechanism``.
        cfg_arch: Architecture config (``hidden_size``, ``intermed_size``, ``norm``,
            ``norm_eps`` and the ``attention`` sub-config are read here).
    """

    def __init__(self, idx, cfg_arch):
        super().__init__()
        self.residual1 = NormalizedResidualConnection(cfg_arch.hidden_size, cfg_arch)
        self.residual2 = NormalizedResidualConnection(cfg_arch.hidden_size, cfg_arch)
        if cfg_arch.attention.sub_normalization:
            # Must be ``_get_norm_fn`` (the name this module imports); the former ``get_norm_fn``
            # is undefined and raised NameError as soon as the factory was called.
            sub_norm_fn = lambda: _get_norm_fn(cfg_arch.norm)(cfg_arch.hidden_size, eps=cfg_arch.norm_eps)  # noqa
        else:
            sub_norm_fn = torch.nn.Identity
        self.attn = get_attention_mechanism(idx, cfg_arch.hidden_size, cfg_arch.attention, sub_norm_fn)
        self.ffn = FFNComponent(cfg_arch.hidden_size, cfg_arch.intermed_size, cfg_arch)
        # Tensor layout is dictated by the attention implementation.
        self.LAYOUT = self.attn.LAYOUT

    def forward(self, states, attention_mask: Optional[torch.Tensor] = None):
        """Run attention, then the FFN, each wrapped in its residual connection."""
        states = self.residual1(states, self.attn, states, attention_mask)
        states = self.residual2(states, self.ffn, states)
        return states


class ScriptableLM(PreTrainedModel):
    """Simplified transformer wrapper: embedding, a stack of layers and an optional final norm."""

    config_class = crammedTransformerConfig

    def __init__(self, config):
        super().__init__(config)
        self.cfg = OmegaConf.create(config.arch)
        cfg = self.cfg

        self.embedding = EmbeddingComponent(cfg.embedding, cfg.norm, cfg.norm_eps)
        stack = [TransformerLayer(idx, cfg) for idx in range(cfg.num_transformer_layers)]
        self.layers = torch.nn.ModuleList(stack)
        if len(self.layers) > 0:
            # The layout of the first layer decides whether tensors are sequence-first.
            self.seq_first = self.layers[0].LAYOUT == "[S B H]"
        else:
            self.seq_first = False

        if not cfg.final_norm:
            self.final_norm = torch.nn.Identity()
        else:
            self.final_norm = _get_norm_fn(cfg.norm)(cfg.hidden_size, eps=cfg.norm_eps)

        # Empty placeholder; the real mask is built lazily in forward().
        self.register_buffer("attention_mask", torch.ones([0, 0, 0, 0], dtype=torch.bool), persistent=False)

    def forward(self, input_ids: torch.Tensor):
        """Embed ``input_ids``, run all layers and return the (optionally normed) hidden states.

        When ``seq_first`` is set the result stays in sequence-first order; it is not
        transposed back here, so that is left to the caller if necessary.
        """
        # Rebuild the causal mask whenever dimension 1 of the input changes.
        if input_ids.shape[1] != self.attention_mask.shape[1]:
            self.attention_mask = get_causal_attention_mask(input_ids)

        hidden_states = self.embedding(input_ids)
        if self.seq_first:
            hidden_states = hidden_states.transpose(0, 1).contiguous()

        for layer_module in self.layers:
            hidden_states = layer_module(hidden_states, self.attention_mask)

        return self.final_norm(hidden_states)


class ScriptableLMForPreTraining(PreTrainedModel):
    """Causal language-model wrapper: encoder stack plus a vocabulary decoder and next-token loss."""

    config_class = crammedTransformerConfig

    def __init__(self, config):
        super().__init__(config)
        self.cfg = OmegaConf.create(config.arch)

        self.encoder = ScriptableLM(config)

        embedding_cfg = self.cfg.embedding
        self.decoder = torch.nn.Linear(embedding_cfg.embedding_dim, embedding_cfg.vocab_size, bias=self.cfg.decoder_bias)
        # Tie the output projection to the input word embedding.
        self.decoder.weight = self.encoder.embedding.word_embedding.weight

        self.loss_fn = torch.nn.CrossEntropyLoss()
        self._init_weights()

    def _init_weights(self, module=None):
        """Initialise ``module``, or every sub-module of this model when ``module`` is None."""
        targets = [module] if module is not None else self.modules()
        init_cfg = self.cfg.init
        for target in targets:
            _init_module(target, init_cfg.type, init_cfg.std, self.cfg.hidden_size, self.cfg.num_transformer_layers)

    def forward(self, input_ids: torch.Tensor, *args, **kwargs):
        """Compute the next-token cross-entropy loss for ``input_ids``.

        Extra positional and keyword arguments are accepted and ignored.

        Returns:
            dict with ``loss``, the detached ``logits`` of the last position
            (indexed batch-first) and a detached copy of the loss as
            ``log_perplexity``.
        """
        logits = self.decoder(self.encoder(input_ids))

        if not self.encoder.seq_first:
            shifted_outputs = logits[..., :-1, :].contiguous()
            shifted_labels = input_ids[..., 1:].contiguous()
            batch_first_logits = logits.detach()
        else:
            # Encoder output is sequence-first: shift along dim 0 and transpose only the reported logits.
            shifted_outputs = logits[:-1]
            shifted_labels = input_ids.transpose(0, 1)[1:].contiguous()
            batch_first_logits = logits.detach().transpose(0, 1)

        # Flatten the tokens and compute loss
        vocab_size = shifted_outputs.shape[-1]
        loss = self.loss_fn(shifted_outputs.view(-1, vocab_size), shifted_labels.view(-1))

        return {"loss": loss, "logits": batch_first_logits[:, -1, :], "log_perplexity": loss.clone().detach()}


# ###### Hugging Face registry ###### #

# Module-level side effect: register the custom config under the "crammedTransformer" key and
# associate the two model classes with that config through the Auto* classes' register hooks.
AutoConfig.register("crammedTransformer", crammedTransformerConfig)
AutoModel.register(crammedTransformerConfig, ScriptableLM)
AutoModelForCausalLM.register(crammedTransformerConfig, ScriptableLMForPreTraining)


================================================
FILE: cramming/architectures/embeddings.py
================================================
"""Non-standard embedding implementations."""

import torch
import math

from typing import Tuple
from einops import repeat
import random


class PositionalEmbedding(torch.nn.Module):
    """Sinusoidal embedding of explicitly supplied (possibly non-contiguous) positions.

    Adapted from Transformer-XL:
    https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py#L15C1-L31C37
    """

    def __init__(self, demb):
        """
        Args:
            demb: size of the produced embedding; half of it is sine features,
                the other half cosine features.
        """
        super().__init__()

        self.demb = demb

        inv_freq = (1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))).float()
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, pos_seq, bsz=None):
        """Embed every entry of ``pos_seq``.

        Args:
            pos_seq: tensor of positions, shape ``(a, b)`` (or ``(a, b, 1)``).
            bsz: unused, kept for interface compatibility.

        Returns:
            Tensor of shape ``(a, b, 2 * len(inv_freq))``: sines followed by cosines.
        """
        positions = pos_seq.float().unsqueeze(2)
        frequencies = self.inv_freq.unsqueeze(0).unsqueeze(1)

        # Batched outer product of positions and inverse frequencies.
        sinusoid_inp = torch.matmul(positions, frequencies)

        # Only inputs with a trailing singleton axis leave an extra axis behind.
        # Squeezing unconditionally would drop the feature axis whenever
        # inv_freq holds a single frequency (demb == 2).
        if sinusoid_inp.dim() > 3:
            sinusoid_inp = sinusoid_inp.squeeze(2)

        return torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)


class RandomNoise(torch.nn.Module):
    """Positional "embedding" made of fresh Gaussian noise (mean 0, std 0.1) on every call."""

    def __init__(self, embedding_dim, max_seq_length=5000):
        super().__init__()
        # max_seq_length is accepted for signature parity but never used.
        self.embedding_dim = embedding_dim

    def forward(self, input_ids):
        """Return noise of shape ``(input_ids.size(0), input_ids.size(1), embedding_dim)``.

        The noise is sampled without a device argument and then moved to
        ``input_ids.device``.
        """
        batch_size, seq_len = input_ids.size(0), input_ids.size(1)
        noise = torch.normal(0, 0.1, size=(batch_size, seq_len, self.embedding_dim))
        return noise.to(input_ids.device)


class RPE(torch.nn.Module):
    """Causal multi-head self-attention with learned relative position embeddings.

    Follows https://jaketae.github.io/study/relative-positional-encoding/
    """

    def __init__(self, d_model, num_heads, max_len=1024, dropout=0.1):
        """
        Args:
            d_model: model width; must be divisible by ``num_heads``.
            num_heads: number of attention heads.
            max_len: longest supported sequence.
            dropout: dropout probability applied to the attention output.

        Raises:
            ValueError: if ``d_model`` is not a multiple of ``num_heads``.
        """
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError("incompatible `d_model` and `num_heads`")
        head_dim = d_model // num_heads

        self.max_len = max_len
        self.d_model = d_model
        self.num_heads = num_heads
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)
        self.query = torch.nn.Linear(d_model, d_model)
        self.dropout = torch.nn.Dropout(dropout)
        # One learned embedding per relative offset, shared across heads.
        self.Er = torch.nn.Parameter(torch.randn(max_len, head_dim))
        # Lower-triangular causal mask of shape (1, 1, max_len, max_len).
        causal = torch.tril(torch.ones(max_len, max_len))
        self.register_buffer("mask", causal.unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        """Attend over ``x`` of shape ``(batch_size, seq_len, d_model)``; output has the same shape.

        Raises:
            ValueError: if ``seq_len`` exceeds ``max_len``.
        """
        batch_size, seq_len, _ = x.shape
        if seq_len > self.max_len:
            raise ValueError("sequence length exceeds model capacity")

        def split_heads(projected):
            # (batch_size, seq_len, d_model) -> (batch_size, seq_len, num_heads, d_head)
            return projected.reshape(batch_size, seq_len, self.num_heads, -1)

        keys_t = split_heads(self.key(x)).permute(0, 2, 3, 1)  # (batch, heads, d_head, seq)
        values = split_heads(self.value(x)).transpose(1, 2)  # (batch, heads, seq, d_head)
        queries = split_heads(self.query(x)).transpose(1, 2)  # (batch, heads, seq, d_head)

        # Use the last seq_len relative embeddings, transposed to (d_head, seq_len).
        rel_emb_t = self.Er[self.max_len - seq_len :, :].transpose(0, 1)
        rel_scores = self.skew(torch.matmul(queries, rel_emb_t))
        content_scores = torch.matmul(queries, keys_t)

        scores = (content_scores + rel_scores) / math.sqrt(queries.size(-1))
        visible = self.mask[:, :, :seq_len, :seq_len]
        scores = scores.masked_fill(visible == 0, float("-inf"))
        weights = torch.nn.functional.softmax(scores, dim=-1)

        # (batch, heads, seq, d_head) -> (batch, seq, heads, d_head) -> (batch, seq, d_model)
        context = torch.matmul(weights, values).transpose(1, 2).reshape(batch_size, seq_len, -1)
        return self.dropout(context)

    def skew(self, QEr):
        """Pad one zero column on the left, reinterpret the buffer with rows and
        columns swapped, and drop the first row.

        Input and output both have shape ``(batch_size, num_heads, seq_len, seq_len)``.
        """
        padded = torch.nn.functional.pad(QEr, (1, 0))
        batch_size, num_heads, rows, cols = padded.shape
        return padded.reshape(batch_size, num_heads, cols, rows)[:, :, 1:, :]


# module partially stolen from pytorch examples:
class SinusoidalPositional(torch.nn.Module):
    r"""Fixed sine/cosine positional encodings.

    The encodings have ``embedding_dim`` features so they can be summed with the
    token embeddings. Even feature indices hold sines and odd feature indices
    hold cosines of the position scaled by geometrically spaced frequencies.
    """

    def __init__(self, embedding_dim, max_seq_length=5000):
        """
        Args:
            embedding_dim: number of features per position (odd values are supported).
            max_seq_length: number of positions to precompute.
        """
        super().__init__()

        pe = torch.zeros(max_seq_length, embedding_dim)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd embedding_dim there is one cosine column fewer than sine columns,
        # so trim the angle table to the number of odd-indexed features.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

        pe = pe.unsqueeze(0)
        # Non-persistent: the table is recomputed at construction and not stored in checkpoints.
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, input_ids):
        r"""Return the encodings for the first ``input_ids.shape[1]`` positions.

        Args:
            input_ids: tensor whose second dimension is the sequence length;
                its values are not used.
        Shape:
            output: [1, sequence length, embed dim] (broadcasts over the batch)
        Examples:
            >>> output = pos_encoder(input_ids)
        """
        return self.pe[:, : input_ids.shape[1], :]


class ScaledSinosoidal(SinusoidalPositional):
    """Sinusoidal encodings multiplied by a single learnable scale (see FLASH paper)."""

    def __init__(self, embedding_dim, max_seq_length):
        super().__init__(embedding_dim, max_seq_length)
        # The scale starts at 1 / sqrt(embedding_dim) and is trained with the model.
        initial_scale = 1.0 / embedding_dim**0.5
        self.scale_factor = torch.nn.Parameter(torch.tensor([initial_scale]))

    def forward(self, input_ids):
        r"""Return the scaled encodings for the first ``input_ids.shape[1]`` positions.

        Args:
            input_ids: tensor whose second dimension is the sequence length;
                its values are not used.
        Shape:
            output: [1, sequence length, embed dim]
        """
        seq_len = input_ids.shape[1]
        positional = self.pe[:, :seq_len, :]
        return self.scale_factor * positional


class LearnablePositional(torch.nn.Module):
    """Learned absolute positional embedding: one trainable vector per position index."""

    def __init__(self, embedding_dim, max_seq_length=1024):
        super().__init__()
        self.embedding = torch.nn.Embedding(max_seq_length, embedding_dim)
        # Row vector 0..max_seq_length-1, stored as a (persistent) buffer of shape (1, max_seq_length).
        all_positions = torch.arange(max_seq_length).expand((1, -1))
        self.register_buffer("position_ids", all_positions)

    def forward(self, input_ids):
        """Batch-first lookup: return embeddings of positions 0..seq_len-1 with shape (1, seq_len, embedding_dim)."""
        seq_len = input_ids.shape[1]
        return self.embedding(self.position_ids[:, :seq_len])


class LearnablePositionalRand(torch.nn.Module):
    """Learnable positional embedding looked up at randomly drawn, sorted positions.

    On every call ``seq_length`` distinct positions are sampled from
    ``range(max_seq_length)`` and sorted, so the order of tokens is kept while
    the absolute indices vary between calls.
    """

    def __init__(self, embedding_dim, max_seq_length=1024):
        super().__init__()
        self.max_length = max_seq_length
        self.embedding = torch.nn.Embedding(max_seq_length, embedding_dim)
        # Not read by forward(); kept so existing checkpoints containing this buffer still load.
        self.register_buffer("position_ids", torch.arange(max_seq_length).expand((1, -1)))

    def forward(self, input_ids):
        """Return embeddings of shape ``(seq_length, embedding_dim)`` (no batch axis).

        Args:
            input_ids: tensor whose second dimension is the sequence length;
                its values are not used.

        Raises:
            IndexError: if the sequence is longer than the embedding table. The
                table cannot grow, so longer sequences can never be embedded.
        """
        seq_length = input_ids.shape[1]
        if seq_length > self.max_length:
            raise IndexError(f"sequence length {seq_length} exceeds max_seq_length {self.max_length} of the positional embedding")

        shuffled = torch.randperm(self.max_length, dtype=torch.long, device=input_ids.device)
        position_ids = torch.sort(shuffled[:seq_length]).values
        return self.embedding(position_ids)

# Code stolen from GPT-X:
class Rotary(torch.nn.Module):
    """Rotary position embedding with a cos/sin cache that is rebuilt when the sequence length changes.

    ``seq_dim`` selects which axis of the query tensor is the sequence axis
    (0 or anything else, treated as axis 1); the cached tables are shaped to
    broadcast over 4-D inputs accordingly.
    """

    def __init__(self, dim, base=10000, def_seq_length=128, seq_dim: int = 0):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (base**exponents), persistent=True)
        self.seq_len_cached = def_seq_length
        self.seq_dim = seq_dim
        initial_cos, initial_sin = self._get_cos_sin()
        self.register_buffer("cos_cached", initial_cos, persistent=False)
        self.register_buffer("sin_cached", initial_sin, persistent=False)

        # Force fusions on batched version
        def rotate_half(x: torch.Tensor):
            half = x.shape[-1] // 2
            return torch.cat((-x[..., half:], x[..., :half]), dim=-1)

        def rope_fn(cos: torch.Tensor, sin: torch.Tensor, query_layer: torch.Tensor, key_layer: torch.Tensor):
            # Rotate queries and keys in one pass by stacking them along dim 1, then split them again.
            # NOTE(review): stacking on dim 1 presumably assumes dim 1 is not the sequence axis - confirm for seq_dim=1.
            stacked = torch.cat([query_layer, key_layer], dim=1)
            length = stacked.shape[0]
            rotated = stacked * cos[:length] + rotate_half(stacked) * sin[:length]
            return torch.split(rotated, query_layer.shape[1], dim=1)

        self.rope_fn = rope_fn  # handle fusion on module level

    @torch.no_grad()
    def get_cos_sin_cache(self, x: torch.Tensor):
        """Return the cached tables, rebuilding them first if ``x`` has a different sequence length."""
        current_len = x.shape[self.seq_dim]
        if current_len == self.seq_len_cached:
            return self.cos_cached, self.sin_cached

        self.seq_len_cached = current_len
        fresh_cos, fresh_sin = self._get_cos_sin()
        self.cos_cached = fresh_cos.to(x.device)
        self.sin_cached = fresh_sin.to(x.device)
        return self.cos_cached, self.sin_cached

    def _get_cos_sin(self):
        """Build cos/sin tables for ``seq_len_cached`` positions, shaped for broadcasting."""
        positions = torch.arange(self.seq_len_cached).type_as(self.inv_freq)
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Each frequency is used twice: once for each half of the feature axis.
        table = torch.cat((angles, angles), dim=-1)
        cos, sin = table.cos(), table.sin()
        if self.seq_dim == 0:
            return cos[:, None, None, :].detach(), sin[:, None, None, :].detach()
        return cos[None, :, None, :].detach(), sin[None, :, None, :].detach()

    def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor):
        """Rotate queries and keys; returns the pair produced by ``torch.split``."""
        cos, sin = self.get_cos_sin_cache(query_layer)
        return self.rope_fn(cos, sin, query_layer, key_layer)

    @torch.jit.export
    def single_forward(self, inputs: torch.Tensor):
        """For cases where shapes of Q and K do not match.

        Uses the currently cached tables without refreshing them.
        """
        length = inputs.shape[0]
        cos, sin = self.cos_cached[:length], self.sin_cached[:length]
        return inputs * cos + self.rotate_half(inputs) * sin

    def rotate_half(self, x: torch.Tensor):
        """Map the two halves ``(x1, x2)`` of the last axis to ``(-x2, x1)``."""
        half = x.shape[-1] // 2
        return torch.cat((-x[..., half:], x[..., :half]), dim=-1)

class RotarySanityCheck(torch.nn.Module):
    """Plain rotary embedding that rotates queries and keys separately, for cross-checking ``Rotary``.

    ``forward`` applies whatever tables are currently cached; it does not
    refresh them for the incoming sequence length.
    """

    def __init__(self, dim, base=10000, def_seq_length=128, seq_dim: int = 0):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (base**exponents), persistent=True)
        self.seq_len_cached = def_seq_length
        self.seq_dim = seq_dim
        initial_cos, initial_sin = self._get_cos_sin()
        self.register_buffer("cos_cached", initial_cos, persistent=False)
        self.register_buffer("sin_cached", initial_sin, persistent=False)

    @torch.no_grad()
    def get_cos_sin_cache(self, x: torch.Tensor):
        """Return the cached tables, rebuilding them first if ``x`` has a different sequence length."""
        current_len = x.shape[self.seq_dim]
        if current_len == self.seq_len_cached:
            return self.cos_cached, self.sin_cached

        self.seq_len_cached = current_len
        fresh_cos, fresh_sin = self._get_cos_sin()
        self.cos_cached = fresh_cos.to(x.device)
        self.sin_cached = fresh_sin.to(x.device)
        return self.cos_cached, self.sin_cached

    def _get_cos_sin(self):
        """Build cos/sin tables for ``seq_len_cached`` positions, shaped for broadcasting."""
        positions = torch.arange(self.seq_len_cached).type_as(self.inv_freq)
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        table = torch.cat((angles, angles), dim=-1)
        cos, sin = table.cos(), table.sin()
        if self.seq_dim == 0:
            return cos[:, None, None, :].detach(), sin[:, None, None, :].detach()
        return cos[None, :, None, :].detach(), sin[None, :, None, :].detach()

    def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor):
        """Rotate ``query_layer`` and ``key_layer`` independently with the cached tables."""
        # The cache refresh / offset slicing is intentionally left disabled here, as in the original.
        cos, sin = self.cos_cached, self.sin_cached
        rotated_query = (query_layer * cos) + (self.rotate_half(query_layer) * sin)
        rotated_key = (key_layer * cos) + (self.rotate_half(key_layer) * sin)
        return rotated_query, rotated_key

    def rotate_half(self, x: torch.Tensor):
        """Map the two halves ``(x1, x2)`` of the last axis to ``(-x2, x1)``."""
        half = x.shape[-1] // 2
        return torch.cat((-x[..., half:], x[..., :half]), dim=-1)

    @torch.jit.export
    def single_forward(self, inputs: torch.Tensor):
        """For cases where shapes of Q and K do not match."""
        length = inputs.shape[0]
        cos, sin = self.cos_cached[:length], self.sin_cached[:length]
        return inputs * cos + self.rotate_half(inputs) * sin


# Adapted from https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/rotary.py, which in turn was
# adapted from https://github.com/facebookresearch/xformers/blob/main/xformers/components/positional_embedding/rotary.py
class RotaryEleutherAI(torch.nn.Module):
    """
    The rotary position embeddings from RoFormer_ (Su et. al).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.
    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration
    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox

    This variant rotates interleaved feature pairs (0, 1), (2, 3), ... and
    precomputes its tables once, for 128 positions, at construction.
    """

    _seq_len_cached: int

    def __init__(self, dim_model: int, *_, **__):
        super().__init__()
        # Generate and save the inverse frequency buffer (non trainable)
        exponents = torch.arange(0, dim_model, 2).float() / dim_model
        self.register_buffer("inv_freq", 1.0 / (10000**exponents))

        # The dummy tensor only supplies a sequence length (128), a dtype and a device.
        cos_table, sin_table = self._update_cos_sin_tables(torch.randn(1, 128, 1), seq_dimension=-2)
        self.register_buffer("_cos_cached", cos_table, persistent=False)
        self.register_buffer("_sin_cached", sin_table, persistent=False)

    @torch.jit.ignore
    def _update_cos_sin_tables(self, x: torch.Tensor, seq_dimension: int = -2) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute cos/sin tables for the sequence length of ``x``, in the dtype and on the device of ``x``.

        Also records that length in ``_seq_len_cached``. The tables are always
        recomputed; there is no reuse check.
        """
        seq_len = x.shape[seq_dimension]
        self._seq_len_cached = seq_len

        positions = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
        # Don't do einsum, it converts fp32 to fp16
        angles = torch.outer(positions, self.inv_freq)
        # Duplicate every frequency so neighbouring feature pairs share an angle.
        cos_cached = repeat(torch.cos(angles).to(x.dtype), "... d -> ... (d 2)")
        sin_cached = repeat(torch.sin(angles).to(x.dtype), "... d -> ... (d 2)")

        return cos_cached, sin_cached

    def forward(self, q: torch.Tensor, k: torch.Tensor, seq_dimension: int = -2) -> Tuple[torch.Tensor, torch.Tensor]:
        """Rotate ``q`` and ``k`` with the tables cached at construction (no refresh happens here)."""
        rotated_q = self.apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached, seq_dimension)
        rotated_k = self.apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached, seq_dimension)
        return rotated_q, rotated_k

    def rotate_half(self, x: torch.Tensor):
        """Map every adjacent pair ``(a, b)`` on the last axis to ``(-b, a)``."""
        pairs = x.unflatten(dim=-1, sizes=(-1, 2))
        first, second = pairs.unbind(dim=-1)
        return torch.stack((-second, first), dim=-1).flatten(start_dim=-2)

    def apply_rotary_pos_emb(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, seq_dimension: int = -2):
        """Apply the rotation to ``x`` using the first ``x.shape[seq_dimension]`` rows of the tables."""
        # NOTE: This could probably be moved to Triton

        # Handle a possible sequence length mismatch in between q and k
        length = x.shape[seq_dimension]
        cos, sin = cos[:length, :], sin[:length, :]
        if seq_dimension == -3:
            # Insert a head axis so the tables broadcast over (..., seq, heads, dim).
            cos, sin = cos[:, None, :], sin[:, None, :]
        return (x * cos) + (self.rotate_half(x) * sin)


class RotaryLLAMA(torch.nn.Module):
    """Rotary embeddings in complex-number form (Facebook/LLaMA style).

    Adjacent feature pairs are viewed as complex numbers and multiplied by
    precomputed unit-magnitude complex factors.
    """

    def __init__(self, hidden_per_head, base=10000, max_seq_length=512, seq_dim: int = 0):
        super().__init__()
        self.seq_dim: int = seq_dim
        # Factors are precomputed for twice the nominal maximum sequence length.
        table = self.precompute_freqs_cis(dim=hidden_per_head, end=max_seq_length * 2, theta=base)
        self.register_buffer("freqs_cis", table)

    def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor):
        """Rotate queries and keys; returns ``(rotated_query, rotated_key)``."""
        return self.apply_rotary_emb(query_layer, key_layer, freqs_cis=self.freqs_cis)

    def apply_rotary_emb(self, xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Multiply the complex views of ``xq`` / ``xk`` by ``freqs_cis`` and cast back to the input dtypes.

        The computation runs in float32; ``flatten(3)`` merges the (pair, real/imag)
        axes back into the last axis of a 4-D input.
        """

        def as_complex(tensor):
            return torch.view_as_complex(tensor.float().reshape(*tensor.shape[:-1], -1, 2))

        query_complex = as_complex(xq)
        key_complex = as_complex(xk)
        factors = self.reshape_for_broadcast(freqs_cis, query_complex)

        rotated_query = torch.view_as_real(query_complex * factors).flatten(3)
        rotated_key = torch.view_as_real(key_complex * factors).flatten(3)
        return rotated_query.type_as(xq), rotated_key.type_as(xk)

    def reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor):
        """Trim ``freqs_cis`` to the sequence length of ``x`` and view it so it broadcasts against ``x``.

        Only the sequence axis and the last axis keep their size; all others become 1.
        """
        trimmed = freqs_cis[: x.shape[self.seq_dim]]
        last_axis = x.ndim - 1
        target_shape = [size if axis in (self.seq_dim, last_axis) else 1 for axis, size in enumerate(x.shape)]
        return trimmed.view(*target_shape)

    @staticmethod
    def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
        """Return the complex rotation factors ``exp(i * position * frequency)``, shape ``(end, dim // 2)``."""
        exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
        inv_freq = 1.0 / (theta**exponents)
        positions = torch.arange(end, device=inv_freq.device)  # type: ignore
        angles = torch.outer(positions, inv_freq).float()  # type: ignore
        return torch.polar(torch.ones_like(angles), angles)  # complex64

class FIRE(torch.nn.Module):
    def __init__(self, num_heads=12, mlp_width=32, init_c=0.1, init_L=512.0, eps=1e-6, max_length=0):
        """
        FIRE attention bias module (https://arxiv.org/abs/2310.04418).

        Args:
            num_heads: number of attention heads.
            mlp_width: Width of MLP.
            init_c: initial value of log transformation parameter
            init_L: initial value of thresholding parameter
            eps: small constant for numerical stability
            max_length: size of the range that random positions are drawn from
                while training; with the default of 0 positions are always
                0..seq_length-1.
        """
        super().__init__()
        self.max_length = max_length  # using random PE

        # MLP mapping one normalized distance to one bias value per head
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(1, mlp_width),
            torch.nn.ReLU(),
            torch.nn.Linear(mlp_width, num_heads),
        )

        # c: learnable log transformation parameter
        self.c = torch.nn.Parameter(torch.tensor(init_c))

        # L: frozen threshold, scaled by a learnable multiplier
        self.init_L = torch.nn.Parameter(torch.tensor(init_L), requires_grad=False)
        self.L_multiplier = torch.nn.Parameter(torch.tensor(1.0))

        self.eps = eps

    def forward(self, seq_length, device):
        """
        Compute FIRE attention bias (https://arxiv.org/abs/2310.04418).

        Args:
            seq_length: number of positions to produce biases for.
            device: device on which the positions are created.

        Returns:
            attention bias of shape [1, num_heads, seq_length, seq_length]
        """
        # Random positions are only drawn from the larger range while training and
        # only if that range can hold the sequence; otherwise positions are 0..seq_length-1.
        if self.training and seq_length <= self.max_length:
            sample_range = self.max_length
        else:
            sample_range = seq_length

        # Take the first seq_length entries of a random permutation and sort them,
        # which yields increasing positions sampled without replacement.
        shuffled = torch.randperm(sample_range, dtype=torch.float, device=device)
        positions = torch.sort(shuffled[:seq_length]).values
        relative_distances = positions[:, None] - positions[None, :]

        # Thresholding the normalizer for short sequence modeling
        threshold = torch.abs(self.L_multiplier * self.init_L)
        position_normalizer = torch.max(positions, threshold)[:, None]

        # Amplifying differences among local positions with log transform
        log_distances = torch.log(torch.abs(self.c * relative_distances) + 1)
        log_normalizer = torch.log(torch.abs(self.c * position_normalizer) + 1)

        # Progressive interpolation
        normalized_distances = log_distances / (log_normalizer + self.eps)
        per_head = self.mlp(normalized_distances.unsqueeze(-1)).unsqueeze(0)  # [1, seq, seq, heads]
        return per_head.permute(0, 3, 1, 2)

class Abacus(torch.nn.Module):
    """Abacus Embeddings: learned embeddings reused for each digit position within a number."""

    def __init__(self, embedding_dim, max_seq_length=1024, max_k=99):
        """
        Args:
            embedding_dim: size of each position embedding.
            max_seq_length: number of rows of the embedding table.
            max_k: largest random offset added to digit positions while training.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(max_seq_length, embedding_dim)
        # Not read by forward(); kept so existing checkpoints containing this buffer still load.
        self.register_buffer("position_ids", torch.arange(max_seq_length).expand((1, -1)))
        self.max_k = max_k  # the max_k here by default is 99 as we add it on after instead of generating with it

    def helper(self, mask, device):
        """Number the True entries of every run in ``mask`` 1, 2, 3, ...; False entries get 0.

        Args:
            mask: boolean tensor of shape (batch, seq_len) marking digit tokens.
            device: device of ``mask``.

        Returns:
            Integer tensor of the same shape holding each token's 1-based index
            inside its run of consecutive True values.
        """
        batch_size, seq_len = mask.shape

        # A run starts wherever the mask is True and its left neighbour is False.
        shifted_mask = torch.cat([torch.zeros((batch_size, 1), device=device, dtype=mask.dtype), mask[:, :-1]], dim=1)
        starts = (shifted_mask != mask) & mask

        # Running count of run starts: 0 before the first run, then 1, 2, ...
        segment_ids = torch.cumsum(starts, dim=1)

        # Column index of every token, repeated for each row.
        index = torch.arange(seq_len, device=device).repeat(batch_size, 1)

        # Record the start column of every run, keyed by its segment id. Segment ids
        # reach the number of runs, so one extra column is needed: with seq_len == 1
        # a single digit has id 1, which a buffer of width seq_len cannot hold.
        segment_start = torch.zeros((batch_size, seq_len + 1), dtype=torch.long, device=device)
        segment_start = segment_start.scatter_add(1, segment_ids, index * starts.long())

        # Offset inside the run, counted from 1.
        positions = index - segment_start.gather(1, segment_ids) + 1

        # Ensure only values within runs are non-zero
        return positions * mask

    def forward(self, input_ids):
        """Return the Abacus embedding of every token (batch-first implementation).

        Designed to work with this project's tokenizers, where the digits map to
        '0': 4, '1': 5, '2': 6, '3': 7, '4': 8, '5': 9, '6': 10, '7': 11, '8': 12, '9': 13
        and '[PAD]': 0, '[UNK]': 1, '[BOS]': 2, '[EOS]': 3. For a more versatile
        implementation, look at the abacus.py file.

        Tokens with ids 4..13 get the embedding of their position inside the
        number; all other tokens get the embedding at index 0. While training, one
        random offset ``k`` in ``[0, max_k]`` is added to all digit positions.
        """
        mask = (input_ids >= 4) & (input_ids <= 13)
        output = self.helper(mask, input_ids.device)

        if self.training:
            k = random.randint(0, self.max_k)
            output[output > 0] += k  # as we already have ones in the tensor, the tensor values will be k+1

        return self.embedding(output)

================================================
FILE: cramming/architectures/huggingface_interface.py
================================================
"""HF model variations based on reconfiguring their huggingface implementations."""

import transformers


def construct_huggingface_model(cfg_arch, vocab_size):
    """Build a Hugging Face pretraining model from ``cfg_arch`` and patch its forward pass.

    Only works if this arch exists on the hub.

    Args:
        cfg_arch: either a ready ``transformers.PretrainedConfig`` or a mapping with a
            ``"model_type"`` entry; in the latter case the config is fetched with
            ``AutoConfig.from_pretrained`` and the mapping's entries are passed as overrides.
        vocab_size: vocabulary size written into the configuration.

    Returns:
        The model, with ``vocab_size`` attached and ``forward`` replaced by a wrapper
        that always uses ``input_ids`` as labels.
    """
    if not isinstance(cfg_arch, transformers.PretrainedConfig):
        hub_name = cfg_arch["model_type"]
        hf_config = transformers.AutoConfig.from_pretrained(pretrained_model_name_or_path=hub_name, **cfg_arch)
    else:
        hf_config = cfg_arch
    hf_config.vocab_size = vocab_size

    model = transformers.AutoModelForPreTraining.from_config(hf_config)
    model.vocab_size = model.config.vocab_size

    original_forward = model.forward

    def modified_forward(input_ids, attention_mask=None, **kwargs):
        # Extra keyword arguments are accepted but deliberately not forwarded.
        return original_forward(input_ids=input_ids, labels=input_ids, attention_mask=attention_mask)

    model.forward = modified_forward

    return model


================================================
FILE: cramming/architectures/losses.py
================================================
import torch
import math


class CosineLoss(torch.nn.Module):
    """One minus the mean cosine similarity between two tensors along ``dim``."""

    __constants__ = ["reduction"]
    reduction: str

    def __init__(self, reduction: str = "mean", dim=-1, eps=1e-8) -> None:
        """
        Args:
            reduction: only ``"mean"`` is supported (checked with an assertion).
            dim: axis along which the similarity is computed.
            eps: epsilon forwarded to ``cosine_similarity``.
        """
        super().__init__()
        self.reduction = reduction
        assert self.reduction == "mean"
        self.dim, self.eps = dim, eps

    def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
        similarity = torch.nn.functional.cosine_similarity(x1, x2, self.dim, self.eps)
        return 1 - similarity.mean()


class CrossEntropyWithZLoss(torch.nn.Module):
    """Cross Entropy plus logit regularization via z_loss."""

    __constants__ = ["ignore_index", "z_loss_factor"]
    ignore_index: int
    z_loss_factor: float

    def __init__(self, ignore_index=-100, z_loss_factor=1e-4):
        """
        Args:
            ignore_index: label value excluded from the cross-entropy term.
            z_loss_factor: weight of the regularization term.
        """
        super().__init__()
        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index)
        self.z_loss_factor = z_loss_factor
        self.ignore_index = ignore_index

    def forward(self, inputs, labels):
        """Return cross entropy plus ``z_loss_factor * sum(log2(Z))``.

        ``Z`` is ``sum(exp(logits))`` over the last axis. The regularizer is summed
        over all rows of ``inputs``, including rows whose label is ``ignore_index``.

        NOTE(review): the PaLM paper appears to define z-loss as
        ``1e-4 * log(Z) ** 2`` (squared natural log), not ``log2(Z)`` - confirm
        which one is intended before relying on this term.
        """
        # log2(sum(exp(x))) == logsumexp(x) / ln(2). Going through logsumexp avoids the
        # overflow to inf that exp() produces for large logits.
        log2_z = torch.logsumexp(inputs, dim=-1) / math.log(2.0)
        z_reg = log2_z.sum() * self.z_loss_factor
        return self.loss_fn(inputs, labels) + z_reg


class MSELoss(torch.nn.Module):
    """MSE Loss as a drop-in replacement for Cross Entropy Loss.

    The squared error against scaled one-hot targets is summed over all valid
    entries and multiplied by ``1 / (2 * M * num_classes)`` with ``M = sqrt(num_classes)``.
    """

    def __init__(self, ignore_index=-100):
        """Parameters as in Hui&Belkin, 2021, but k=1, and M=sqrt(C) (so maybe not really Hui&Belkin?)"""
        super().__init__()
        self.ignore_index = ignore_index

    def forward(self, inputs, labels):
        """Compute the loss; entries whose label equals ``ignore_index`` are dropped first."""
        num_classes = inputs.shape[-1]
        scale = math.sqrt(num_classes)
        keep = labels != self.ignore_index

        targets = self._label_to_onehot(labels[keep], scale, num_classes=num_classes)
        squared_error = (inputs[keep] - targets).pow(2).sum()
        return 1 / (2 * scale * num_classes) * squared_error

    @staticmethod
    @torch.jit.script
    def _label_to_onehot(target, M: float = 1.0, num_classes: int = 100):
        # One row per label: M in the label's column, 0 everywhere else.
        encoded = torch.zeros(target.shape[0], num_classes, device=target.device)
        encoded.scatter_(1, target.view(-1, 1), M)
        return encoded


class MSELossFast(torch.nn.Module):
    """MSE Loss as a drop-in replacement for Cross Entropy Loss. Only for 2dim inputs and 1dim labels

    Evaluates the same sum of squared errors against ``M``-scaled one-hot targets
    as an expanded square, so the one-hot matrix is never materialized. The sum
    is multiplied by ``1 / (2 * M * num_classes)`` with ``M = sqrt(num_classes)``.
    """

    def __init__(self, ignore_index=-100):
        """Parameters as in Hui&Belkin, 2021, but k=1, and M=sqrt(C) (so maybe not really Hui&Belkin?)"""
        super().__init__()
        self.ignore_index = ignore_index

    def forward(self, inputs, labels):
        """Compute the loss; entries whose label equals ``ignore_index`` are dropped first."""
        _, num_classes = inputs.shape  # also enforces 2-dim inputs
        scale = math.sqrt(num_classes)

        keep = labels != self.ignore_index
        inputs = inputs[keep]
        labels = labels[keep]
        num_valid = labels.shape[-1]

        # sum((x - M * onehot)^2) = sum(x^2) - 2 * M * sum(x[label]) + N * M^2
        sum_of_squares = inputs.pow(2).sum()
        label_logit_sum = inputs[torch.arange(num_valid), labels].sum()
        return 1 / (2 * scale * num_classes) * (sum_of_squares - 2 * scale * label_logit_sum + num_valid * scale**2)


class L1Loss(torch.nn.Module):
    """Absolute-error loss usable in place of Cross Entropy Loss. Only for 2dim inputs and 1dim labels.

    The summed absolute error of the kept rows is divided by the total number of input rows
    and by sqrt(num_classes).
    """

    def __init__(self, ignore_index=-100):
        """Remember which label value marks rows to be excluded.

        Args:
            ignore_index: Label value; rows carrying it are excluded from the loss.
        """
        super().__init__()
        self.ignore_index = ignore_index

    def forward(self, inputs, labels):
        """Compare inputs against one-hot targets whose hot entry equals num_classes.

        Optimal scaling is less clear for L1.
        """
        keep = labels != self.ignore_index
        num_classes = inputs.shape[-1]
        M = math.sqrt(num_classes)
        # Unlike the normalisation below, the hot value is num_classes itself, not its square root.
        targets = self._label_to_onehot(labels[keep], float(num_classes), num_classes=num_classes)
        absolute_error = (inputs[keep] - targets).abs().sum()
        return 1 / inputs.shape[0] / M * absolute_error

    # Dense (len(target), num_classes) target matrix: zeros everywhere, M at each row's label column.
    @staticmethod
    @torch.jit.script
    def _label_to_onehot(target, M: float = 1.0, num_classes: int = 100):
        dense = torch.zeros(target.shape[0], num_classes, device=target.device)
        dense.scatter_(1, target.view(-1, 1), M)
        return dense


class SzegedyLoss(torch.nn.Module):
    """Regression directly back to input embedding. Remove the decoding layer if using this loss.

    As mentioned at https://twitter.com/ChrSzegedy/status/1533322132368728064?t=xz00T1YT3-WiE0id-h3MEA&s=19
    """

    def __init__(self, embedding_layer, ignore_index=-100, overrelaxation=2.0):
        """Keep a handle on the embedding that provides the regression targets.

        Args:
            embedding_layer: Callable module mapping label ids to embedding vectors.
            ignore_index: Label value; rows carrying it are excluded from the loss.
            overrelaxation: Factor applied to the embedded labels. This parameter is quite speculative.
        """
        super().__init__()
        self.embedding = embedding_layer
        self.ignore_index = ignore_index
        self.overrelaxation = overrelaxation

    def forward(self, inputs, labels):
        """This really just does L2(DNN(embed(x[:,:-1]), 2.0 * stop_gradient(embed(x[:,1:]))) as quoted above.

        Args:
            inputs: 2-dim tensor; its last dimension must match the embedding width.
            labels: Label ids, one per input row (assumed 1-dim, like the other losses in this file).

        Returns:
            Summed squared error over the kept rows, divided by the total number of labels
            (ignored ones included) and by the size of the last input dimension.
        """
        _, num_classes = inputs.shape  # unpacking also insists on exactly two input dimensions
        valid_mask = labels != self.ignore_index

        predictions = inputs[valid_mask]
        with torch.no_grad():
            # Mask *before* the lookup: ignore_index (e.g. -100) is not a valid embedding index,
            # so embedding the unmasked labels would raise as soon as one ignored label is present.
            embedded_labels = self.overrelaxation * self.embedding(labels[valid_mask])

        return (predictions - embedded_labels).pow(2).sum() / labels.shape[-1] / num_classes


"""Focal Loss from https://github.com/clcarwin/focal_loss_pytorch (minimally modernized into pytorch 1.12)"""

"""
MIT License

Copyright (c) 2017 carwin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""


class FocalLoss(torch.nn.Module):
    """Focal loss, ``-(1 - p_t)^gamma * log(p_t)``, over 2-dim logits and 1-dim class targets.

    Args:
        gamma: Focusing exponent; ``gamma=0`` reduces the loss to plain cross entropy.
        size_average: Return the mean over kept rows if True, otherwise their sum.
        ignore_index: Target value; rows carrying it are excluded from the loss.
    """

    def __init__(self, gamma: float = 5.0, size_average: bool = True, ignore_index: int = -100):
        super().__init__()
        # Non-persistent buffer: follows the module across devices but is not part of the state dict.
        self.register_buffer("gamma", torch.as_tensor(gamma, dtype=torch.float), persistent=False)
        self.size_average = size_average
        self.ignore_index = ignore_index

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Compute the focal loss of ``input`` logits against ``target`` class ids."""
        valid_mask = target != self.ignore_index

        all_log_probs = torch.nn.functional.log_softmax(input[valid_mask], dim=-1)
        # The gather index must be a column (n, 1) so that row i selects its own target class.
        # A (1, n) index would instead read every target class from row 0 only.
        log_probs = all_log_probs.gather(1, target[valid_mask].unsqueeze(1))
        loss = -1 * (1 - log_probs.exp()) ** self.gamma * log_probs
        if self.size_average:
            return loss.mean()
        else:
            return loss.sum()


class IncorrectCrossEntropyLoss(torch.nn.CrossEntropyLoss):
    """Cross entropy restricted to the rows whose argmax prediction differs from the target.

    NOTE(review): when no row is selected the inner cross entropy receives an empty batch;
    what it returns in that case is decided by torch, not by this class - confirm it is acceptable.
    """

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Filter out correctly classified rows, then defer to the functional cross entropy."""
        # Row selection is bookkeeping only and must not be tracked by autograd.
        with torch.no_grad():
            misclassified = torch.ne(input.argmax(dim=-1), target)

        hard_logits = input[misclassified]
        hard_targets = target[misclassified]
        return torch.nn.functional.cross_entropy(
            hard_logits,
            hard_targets,
            weight=self.weight,
            ignore_index=self.ignore_index,
            reduction=self.reduction,
            label_smoothing=self.label_smoothing,
        )


================================================
FILE: cramming/architectures/sanity_check.py
================================================
"""Sanity Check architecture."""
import torch
from typing import Optional


class SanityCheckforPreTraining(torch.nn.Module):
    """Minimal stand-in model: one embedding lookup followed by one bias-free linear map."""

    def __init__(self, width, vocab_size):
        """Create the embedding table (padding id 0) and the square linear transform.

        Args:
            width: Embedding dimension, also input and output size of the linear layer.
            vocab_size: Number of rows in the embedding table.
        """
        super().__init__()
        self.word_embedding = torch.nn.Embedding(vocab_size, width, padding_idx=0)
        self.transform = torch.nn.Linear(width, width, bias=False)

    def forward(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
    ) -> dict[str, torch.Tensor]:
        """Embed and transform ``input_ids``; the "loss" is simply the mean of the outputs.

        ``attention_mask``, ``labels`` and ``token_type_ids`` are accepted but never read.
        """
        logits = self.transform(self.word_embedding(input_ids))
        return {"logits": logits, "loss": logits.mean()}


================================================
FILE: cramming/backend/__init__.py
================================================
"""This module implements interfaces to the various backends."""

from .prepare_backend import load_backend
from .utils import load_model_checkpoint, get_model_engine_tokenizer_dataloaders

__all__ = [
    "load_backend",
    "load_model_checkpoint",
    "get_model_engine_tokenizer_dataloaders",
]


================================================
FILE: cramming/backend/optimizers/__init__.py
================================================
from .progressive_batching import ProgressiveBatching
from .optimizer_modifiers import SAM, LARS
from .schedulers import get_schedule_fn


================================================
FILE: cramming/backend/optimizers/optimizer_modifiers.py
================================================
"""This is the apex LARS implementation, from the apex repository.

It implements LARS + optional clipping

https://github.com/NVIDIA/apex/blob/d74fda260c403f775817470d87f810f816f3d615/apex/parallel/LARC.py


I did rename it to "LARS".
"""

import torch


class MetaOptimizer(torch.optim.Optimizer):
    """Base class for a meta optimizer that wraps and modifies an existing pytorch optimizer.

    ``torch.optim.Optimizer.__init__`` is not invoked here. The wrapper shares ``param_groups``
    with the wrapped optimizer and forwards every attribute it does not define itself to it.
    """

    def __init__(self, optimizer):
        """Wrap ``optimizer``, sharing (not copying) its parameter groups."""
        self.param_groups = optimizer.param_groups
        self.optim = optimizer

    def __getstate__(self):
        """Delegate state extraction to the wrapped optimizer."""
        return self.optim.__getstate__()

    def __setstate__(self, state):
        """Delegate state restoration to the wrapped optimizer."""
        self.optim.__setstate__(state)

    def __repr__(self):
        return self.__class__.__name__ + self.optim.__repr__()

    def __getattr__(self, name):
        """Forward lookups that failed on the wrapper to the wrapped optimizer.

        Python calls this only after normal attribute lookup has failed.

        Raises:
            AttributeError: If no optimizer has been attached yet. Reading ``self.optim`` directly
                in that situation would re-enter this method and end in a ``RecursionError``,
                which breaks ``hasattr`` and similar attribute probes on such instances.
        """
        try:
            wrapped = self.__dict__["optim"]
        except KeyError:
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}") from None
        return getattr(wrapped, name)

    @torch.no_grad()
    def step(self, closure=None):
        """Run one step of the wrapped optimizer and return whatever it returns."""
        return self.optim.step(closure)


class LARS(MetaOptimizer):
    """
    :class:`LARS` [LARC in apex] is a pytorch implementation of both the scaling and clipping variants of LARS,
    in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive
    local learning rate for each individual parameter. The algorithm is designed to improve
    convergence of large batch training.

    See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate.

    In practice it modifies the gradients of parameters as a proxy for modifying the learning rate
    of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer.

    ```
    model = ...
    optim = torch.optim.Adam(model.parameters(), lr=...)
    optim = LARS(optim)
    ```

    Args:
        optimizer: Pytorch optimizer to wrap and modify learning rate for.
        trust_coefficient: Trust coefficient for calculating the lr. See https://arxiv.org/abs/1708.03888
        clip: Decides between clipping or scaling mode of LARC [LARS + clip].
              If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter.
              If `clip=False` the learning rate is set to `local_lr*optimizer_lr`.
        eps: epsilon kludge to help with numerical stability while calculating adaptive_lr
    """

    def __init__(self, optimizer, trust_coefficient=0.02, clip=False, eps=1e-8):
        # The base class stores `optim` and the shared `param_groups`.
        super().__init__(optimizer)
        self.trust_coefficient = trust_coefficient
        self.eps = eps
        self.clip = clip

    def step(self, closure=None):
        """Rescale every gradient by its layer-wise adaptive rate, then step the wrapped optimizer.

        Weight decay is folded into the gradients here, so it is switched off on the wrapped
        optimizer for the duration of the step and handed back afterwards.

        Args:
            closure: Optional closure passed through to the wrapped optimizer's ``step``.

        Returns:
            Whatever the wrapped optimizer's ``step`` returns.
        """
        weight_decays = []
        try:
            with torch.no_grad():
                for group in self.optim.param_groups:
                    # absorb weight decay control from optimizer
                    weight_decay = group["weight_decay"] if "weight_decay" in group else 0
                    weight_decays.append(weight_decay)
                    group["weight_decay"] = 0
                    for p in group["params"]:
                        if p.grad is None:
                            continue
                        param_norm = torch.norm(p.data)
                        grad_norm = torch.norm(p.grad.data)

                        if param_norm != 0 and grad_norm != 0:
                            # calculate adaptive lr + weight decay
                            adaptive_lr = self.trust_coefficient * (param_norm) / (grad_norm + param_norm * weight_decay + self.eps)

                            # clip learning rate for LARC
                            if self.clip:
                                # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)`
                                adaptive_lr = min(adaptive_lr / group["lr"], 1)

                            p.grad.data += weight_decay * p.data
                            p.grad.data *= adaptive_lr

            loss = self.optim.step(closure)
        finally:
            # Return weight decay control to the optimizer even if the step above raised;
            # otherwise the groups touched so far would silently keep weight_decay == 0.
            for group, weight_decay in zip(self.optim.param_groups, weight_decays):
                group["weight_decay"] = weight_decay

        return loss


"""This is the SAM pytorch implementation from https://github.com/davda54/sam
with a minor modification."""

"""
MIT License
Copyright (c) 2021 David Samuel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""


class SAM(MetaOptimizer):
    """Sharpness-aware minimization wrapper: perturb the weights along the gradient, take the
    gradient at the perturbed point, undo the perturbation and let the wrapped optimizer step."""

    def __init__(self, base_optimizer_instance, rho=0.05):
        """Wrap ``base_optimizer_instance``; ``rho`` scales the size of the weight perturbation."""
        assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}"
        self.param_groups = base_optimizer_instance.param_groups
        self.optim = base_optimizer_instance
        self.rho = rho

    @torch.no_grad()
    def first_step(self, zero_grad=False):
        """Move every parameter that has a gradient to "w + e(w)" and remember e(w) in its state."""
        # The global gradient norm, and hence the scale, is identical for all groups.
        scale = self.rho / (self._grad_norm() + 1e-12)
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is not None:
                    perturbation = p.grad * scale.to(p)
                    p.add_(perturbation)  # climb to the local maximum "w + e(w)"
                    self.state[p]["e_w"] = perturbation

        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def second_step(self, zero_grad=False):
        """Undo the stored perturbation, then perform the wrapped optimizer's update."""
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is not None:
                    p.sub_(self.state[p]["e_w"])  # get back to "w" from "w + e(w)"

        self.optim.step()  # do the actual "sharpness-aware" update

        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def step(self, closure=None):
        """Run both SAM phases; ``closure`` is called twice and the second result is returned."""
        assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided"
        # the closure should do a full forward-backward pass, so gradients must be enabled inside it
        grad_closure = torch.enable_grad()(closure)

        grad_closure()
        self.first_step(zero_grad=True)
        loss = grad_closure()
        self.second_step()
        return loss

    def _grad_norm(self):
        """Return the 2-norm over the per-parameter gradient 2-norms."""
        # put everything on the same device, in case of model parallelism
        shared_device = self.param_groups[0]["params"][0].device
        per_param_norms = []
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is not None:
                    per_param_norms.append(p.grad.norm(p=2).to(shared_device))
        return torch.norm(torch.stack(per_param_norms), p=2)


================================================
FILE: cramming/backend/optimizers/progressive_batching.py
================================================
"""Implementation of a progressive batching meta optimizer.
The optimizer may defer an optimization step until gradient variance is small enough
"""

import torch

from collections import defaultdict
from .optimizer_modifiers import MetaOptimizer


import logging

log = logging.getLogger(__name__)
DEBUG = False


class ProgressiveBatching(MetaOptimizer):
    """Meta optimizer that accumulates gradients until a statistical test allows a real step.

    Every call to ``step`` folds the current ``p.grad`` into a running mean / running variance
    (Welford accumulation). The wrapped optimizer is stepped, using the running mean as gradient,
    only once the selected ``progress_rule`` passes or ``max_sample_guard`` is exceeded.

    NOTE(review): the statistics read ``p.grad`` of every parameter in every group, so this
    presumably assumes that all parameters receive a gradient - confirm against callers.
    """

    def __init__(self, optimizer, progress_rule="norm-based", theta=0.9, monotone=False, min_sample_guard=2, max_sample_guard=128):
        """Wrap ``optimizer`` and allocate the per-parameter statistics.

        Args:
            optimizer: Optimizer to wrap.
            progress_rule: One of "norm-based", "inner-product", "cov", "cosine".
            theta: Threshold used by the selected test.
            monotone: If True, the number of accumulated samples at a real step becomes the new
                lower guard.
            min_sample_guard: Below this many accumulated calls no test is run and no step is taken.
            max_sample_guard: Above this many accumulated calls a step is taken unconditionally.
        """
        super().__init__(optimizer)

        self.progress_rule = progress_rule
        self.theta = theta
        self.monotone = monotone

        self.min_sample_guard = min_sample_guard
        self.max_sample_guard = max_sample_guard

        self.progress_state = defaultdict(dict)
        # Must exist before reset_sample_statistics(), which reads it.
        self.accumulated_steps = 0
        self.reset_sample_statistics()

    @torch.no_grad()
    def step(self):
        """(Maybe) performs a single optimization step.

        Raises:
            ValueError: If ``progress_rule`` is unknown; only raised once a test is actually due.
        """
        self.update_sample_statistics()
        if self.accumulated_steps < self.min_sample_guard:
            # Too few samples so far: keep accumulating.
            rule_check = False
        else:
            if self.accumulated_steps > self.max_sample_guard:
                # Upper guard exceeded: step regardless of the test outcome.
                rule_check = True
            else:
                if self.progress_rule == "norm-based":
                    rule_check = self.norm_test()
                elif self.progress_rule == "inner-product":
                    rule_check = self.inner_product_test()
                elif self.progress_rule == "cov":
                    rule_check = self.coefficient_of_variation()
                elif self.progress_rule == "cosine":
                    rule_check = self.cosine_test()
                else:
                    raise ValueError(f"Invalid progress rules {self.progress_rule} given.")

        if rule_check:
            self.copy_mean_grad()  # reference running mean in p.grad attributes
            if self.monotone:
                self.min_sample_guard = self.accumulated_steps  # raise lower limit if forcing monotone batch sizes
            self.reset_sample_statistics()  # reset running mean
            super().step()
        else:
            # otherwise defer the step and accumulate more gradients
            pass

    def inner_product_test(self):
        """Inner product similar to description in Bollapragada,Byrd,Nocedal, "Adaptive Sampling Strategies for Stochastic Optimization".

        This is only a zero-memory inner product test.
        """

        global_inner_product, global_variance = 0, 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                ndivn1 = self.accumulated_steps / (self.accumulated_steps - 1)
                # Mean of the accumulated gradients with the current p.grad taken out again.
                corrected_mean = (state["running_mean"] - p.grad / self.accumulated_steps) * ndivn1
                global_inner_product += (p.grad * corrected_mean).sum()
                global_variance += corrected_mean.pow(2).sum()
        final_v = (global_inner_product - global_variance).pow(2)

        if DEBUG:
            inequality_repr = f"{final_v / (self.accumulated_steps - 1):10.2f} < {self.theta * global_variance**2:10.2f}"
            log.info(f"{self.accumulated_steps} - {inequality_repr}")

        return final_v / (self.accumulated_steps - 1) < self.theta * global_variance**2

    def norm_test(self):
        """Sohams version.

        Passes when the Bessel-corrected variance summed over all parameters, divided by the
        number of accumulated samples, is below ``theta`` times the squared norm of the running mean.
        """

        sample_var, mean_norm = 0, 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                sample_var += state["running_variance"].sum() / (self.accumulated_steps - 1)  # bessel-corrected variance
                mean_norm += state["running_mean"].pow(2).sum()

        if DEBUG:
            log.info(f"{self.accumulated_steps} -  {sample_var / self.accumulated_steps:10.2f} < {self.theta * mean_norm:10.2f}")

        return sample_var / self.accumulated_steps < self.theta * mean_norm  # divide by |B| as in bigbatch, original version is theta=1

    def cosine_test(self):
        """Experimental.

        Averages, over parameter tensors, the cosine between the current gradient and the mean
        of the previously accumulated gradients, and compares that average against ``theta``.
        """

        total_angles, num_params = 0, 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                ndivn1 = self.accumulated_steps / (self.accumulated_steps - 1)
                # Mean of the accumulated gradients with the current p.grad taken out again.
                corrected_mean = (state["running_mean"] - p.grad / self.accumulated_steps) * ndivn1
                total_angles += (p.grad * corrected_mean).sum() / corrected_mean.norm() / p.grad.norm()
                num_params += 1

        average_angle = total_angles / num_params  # rather the average cosine, this not (yet) the angle

        if DEBUG:
            log.info(f"{self.accumulated_steps} -  {average_angle:10.2f} > {self.theta:10.2f}")

        return average_angle > self.theta

    def coefficient_of_variation(self):
        """unbiased cov test.

        ``mean_norm`` is accumulated below but does not enter the returned comparison.
        """
        cov, mean_norm, num_params = 0, 0, 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                cov += (state["running_variance"].sum() / (self.accumulated_steps - 1)).sqrt() / (state["running_mean"].pow(2).sum() + 1e-6)
                mean_norm += state["running_mean"].pow(2).sum()
                num_params += 1

        unbiased_avg_cov = (1 + 1 / (4 * self.accumulated_steps)) * cov / num_params / self.accumulated_steps

        if DEBUG:
            log.info(f"{self.accumulated_steps} -  {unbiased_avg_cov:10.2f} < {self.theta * 100:10.2f}")

        return unbiased_avg_cov < self.theta * 100

    def update_sample_statistics(self):
        """Update sample statistics based on welford accumulation. At any step variance can be finalized via running_variance / count"""
        self.accumulated_steps += 1
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                # Welford update: the delta is taken once before and once after the mean moves.
                current_delta = p.grad - state["running_mean"]
                state["running_mean"] += current_delta / self.accumulated_steps
                corrected_delta = p.grad - state["running_mean"]
                state["running_variance"] += current_delta * corrected_delta

    def reset_sample_statistics(self):
        """Allocate new tensors, old references are still required for the optimizer step."""
        # Recorded before the counter is cleared. NOTE(review): purpose of the "+ 1" is not
        # visible in this class - confirm against the code that reads this attribute.
        self.last_full_step_accumulation = self.accumulated_steps + 1
        self.accumulated_steps = 0
        for group in self.param_groups:
            for p in group["params"]:
                state = self.progress_state[p]
                state["running_mean"] = torch.zeros_like(p, memory_format=torch.preserve_format)
                state["running_variance"] = torch.zeros_like(p, memory_format=torch.preserve_format)

    def copy_mean_grad(self):
        """Point every ``p.grad`` at that parameter's running-mean tensor (by reference, no copy)."""
        for group in self.param_groups:
            for p in group["params"]:
                p.grad = self.progress_state[p]["running_mean"]


================================================
FILE: cramming/backend/optimizers/schedulers.py
================================================
"""Misc. optimizer implementations."""
import transformers
import math

from torch.optim.lr_scheduler import LambdaLR
import time
from functools import partial


def get_schedule_fn(cfg_train, elapsed_time: float=0.0, true_budget: float = -1):
    """Returns a callable scheduler_fn(optimizer).

    The scheduler is selected by ``cfg_train.scheduler``. Step-based schedulers are configured
    from ``cfg_train.steps``, ``cfg_train.warmup_steps`` and ``cfg_train.cooldown_steps``;
    the "budget-*" variants additionally receive an hour budget and the already elapsed time.

    Side effect: ``cfg_train.warmup_steps`` and ``cfg_train.cooldown_steps`` are overwritten
    in place when they are given as a fraction strictly between 0 and 1.

    Args:
        cfg_train: Training config; read attributes are ``scheduler``, ``steps``, ``warmup_steps``,
            ``cooldown_steps`` and ``budget``.
        elapsed_time: Forwarded as ``elapsed_time`` to the budget schedulers.
        true_budget: Hour budget for the budget schedulers; values <= 0 fall back to ``cfg_train.budget``.

    Returns:
        A ``functools.partial`` / function / class that is called with the optimizer.

    Raises:
        ValueError: If ``cfg_train.scheduler`` matches none of the known names.

    Todo: Sanitize and unify these schedulers...
    """
    if true_budget <= 0:
        true_budget = cfg_train.budget
    if (cfg_train.warmup_steps) > 0 and (cfg_train.warmup_steps < 1):
        # warmup could be a percentage in which case this line converts to steps again
        cfg_train.warmup_steps = int(cfg_train.warmup_steps * cfg_train.steps)

    if (cfg_train.cooldown_steps) > 0 and (cfg_train.cooldown_steps < 1):
        # cooldown could be a percentage in which case this line converts to steps again
        cfg_train.cooldown_steps = int(cfg_train.cooldown_steps * cfg_train.steps)

    # Load huggingface schedulers based on total steps
    if cfg_train.scheduler == "polynomial-decay":
        scheduler_fn = partial(
            transformers.get_polynomial_decay_schedule_with_warmup,
            num_warmup_steps=cfg_train.warmup_steps,
            num_training_steps=cfg_train.steps,
            lr_end=1e-7,
            power=1.0,
        )
    elif cfg_train.scheduler == "cosine-decay":
        scheduler_fn = partial(
            transformers.get_cosine_schedule_with_warmup,
            num_warmup_steps=cfg_train.warmup_steps,
            num_training_steps=cfg_train.steps,
            num_cycles=0.5,
        )
    elif cfg_train.scheduler == "inverse-sqrt":
        scheduler_fn = partial(
            get_inverse_sqrt_scheduler,
            num_warmup_steps=cfg_train.warmup_steps,
            num_cooldown_steps=cfg_train.cooldown_steps,
            num_training_steps=cfg_train.steps,
        )
    elif cfg_train.scheduler == "one-cycle":  # this is a simplified one-cycle
        scheduler_fn = partial(
            get_one_cycle,
            num_training_steps=cfg_train.steps,
        )
    elif cfg_train.scheduler == "ramp":  # linear ramp-up followed by a linear cooldown
        scheduler_fn = partial(
            get_ramp,
            num_cooldown_steps=cfg_train.cooldown_steps,
            num_training_steps=cfg_train.steps,
        )
        # The bare string below is a no-op statement that only serves as a section marker.
        """Budget Schedulers from here: """
    elif cfg_train.scheduler == "budget-inverse-sqrt":
        scheduler_fn = partial(
            get_budget_inv_sqrt_scheduler,
            hour_budget=true_budget,
            num_warmup_steps=cfg_train.warmup_steps,
            num_cooldown_steps=cfg_train.cooldown_steps,
            num_training_steps=cfg_train.steps,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-constant":
        scheduler_fn = partial(
            get_budget_constant_scheduler,
            hour_budget=true_budget,
            num_warmup_steps=cfg_train.warmup_steps,
            num_cooldown_steps=cfg_train.cooldown_steps,
            num_training_steps=cfg_train.steps,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-cosine-decay":
        scheduler_fn = partial(
            get_budget_cosine_schedule_with_warmup,
            hour_budget=true_budget,
            num_warmup_steps=cfg_train.warmup_steps,
            num_training_steps=cfg_train.steps,
            num_cycles=0.5,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-cosine-annealing":
        scheduler_fn = partial(
            get_budget_cosine_half_cycles_with_warmup,
            hour_budget=true_budget,
            num_warmup_steps=cfg_train.warmup_steps,
            num_training_steps=cfg_train.steps,
            num_cycles=4,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-linear":
        scheduler_fn = partial(
            get_budget_linear_schedule_with_warmup,
            hour_budget=true_budget,
            num_warmup_steps=cfg_train.warmup_steps,
            num_training_steps=cfg_train.steps,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-polynomial":
        scheduler_fn = partial(
            get_budget_polynomial_decay_with_warmup,
            hour_budget=true_budget,
            num_warmup_steps=cfg_train.warmup_steps,
            num_training_steps=cfg_train.steps,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-one-cycle":  # this is a simplified one-cycle
        scheduler_fn = partial(
            get_budget_one_cycle,
            hour_budget=true_budget,
            num_training_steps=cfg_train.steps,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-multi-cycle":
        scheduler_fn = partial(
            get_budget_multi_cycle,
            hour_budget=true_budget,
            num_training_steps=cfg_train.steps,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-ramp":
        scheduler_fn = partial(
            get_budget_ramp,
            hour_budget=true_budget,
            num_cooldown_steps=cfg_train.cooldown_steps,
            num_training_steps=cfg_train.steps,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-inv-cosine":
        scheduler_fn = partial(
            get_budget_inv_cosine_schedule,
            hour_budget=true_budget,
            num_cooldown_steps=cfg_train.cooldown_steps,
            num_training_steps=cfg_train.steps,
            elapsed_time=elapsed_time,
        )
    # The three "budget-dive*" names share one factory and differ only in `falloff`.
    elif cfg_train.scheduler == "budget-dive":
        scheduler_fn = partial(
            get_budget_dive,
            hour_budget=true_budget,
            num_training_steps=cfg_train.steps,
            num_warmup_steps=cfg_train.warmup_steps,
            falloff=0.5,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-dive-slow":
        scheduler_fn = partial(
            get_budget_dive,
            hour_budget=true_budget,
            num_training_steps=cfg_train.steps,
            num_warmup_steps=cfg_train.warmup_steps,
            falloff=0.75,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-dive-fast":
        scheduler_fn = partial(
            get_budget_dive,
            hour_budget=true_budget,
            num_training_steps=cfg_train.steps,
            num_warmup_steps=cfg_train.warmup_steps,
            falloff=0.25,
            elapsed_time=elapsed_time,
        )
    # The two "budget-triangle*" names share one factory and differ only in `base_percentage`.
    elif cfg_train.scheduler == "budget-triangle1":
        scheduler_fn = partial(
            get_budget_triangle,
            hour_budget=true_budget,
            num_training_steps=cfg_train.steps,
            falloff=0.25,
            base_percentage=0.5,
            elapsed_time=elapsed_time,
        )
    elif cfg_train.scheduler == "budget-triangle2":
        scheduler_fn = partial(
            get_budget_triangle,
            hour_budget=true_budget,
            num_training_steps=cfg_train.steps,
            falloff=0.25,
            base_percentage=0.25,
            elapsed_time=elapsed_time,
        )
    # Names handed through verbatim to transformers.get_scheduler.
    # NOTE(review): the two "get_*" entries look like function names rather than scheduler
    # names - confirm that transformers.get_scheduler accepts them.
    elif cfg_train.scheduler in [
        "linear",
        "cosine",
        "cosine_with_restarts",
        "polynomial",
        "constant",
        "constant_with_warmup",
        "get_cosine_with_hard_restarts_schedule_with_warmup",
        "get_polynomial_decay_schedule_with_warmup",
    ]:

        def scheduler_fn(optimizer):
            return transformers.get_scheduler(
                name=cfg_train.scheduler,
                optimizer=optimizer,
                num_warmup_steps=cfg_train.warmup_steps,
                num_training_steps=cfg_train.steps,
            )

    elif cfg_train.scheduler == "none" or cfg_train.scheduler is None:
        # No scheduling requested: hand back the placeholder scheduler class itself.
        scheduler_fn = DumbScheduler
    else:
        raise ValueError(f"Invalid schedule {cfg_train.scheduler} given.")
    return scheduler_fn


class DumbScheduler:
    """Placeholder scheduler that never changes the learning rate.

    It only counts calls to ``step`` and reports NaN wherever a learning rate is asked for.
    """

    def __init__(self, *args, **kwargs):
        """Accept (and otherwise ignore) any scheduler arguments.

        The optimizer, if supplied as first positional argument or as ``optimizer=...``, is kept
        so that ``_initial_step`` has something to reset; without it that method would fail with
        an ``AttributeError`` on the never-assigned ``self.optimizer``.
        """
        self.optimizer = args[0] if args else kwargs.get("optimizer")
        self._step_count = 0

    def step(self, *args, **kwargs):
        """Count one scheduler step; arguments are ignored."""
        self._step_count += 1

    def _initial_step(self):
        """Reset the step counters (also on the optimizer, when one is attached) and take one step."""
        if self.optimizer is not None:
            self.optimizer._step_count = 0
        self._step_count = 0
        self.step()

    def state_dict(self):
        """Return an empty state: there is nothing worth checkpointing."""
        return {}

    def load_state_dict(self, state_dict):
        """Copy every entry of ``state_dict`` onto this instance as an attribute."""
        self.__dict__.update(state_dict)

    def get_last_lr(self):
        """Return last computed learning rate by current scheduler."""
        return float("NaN")

    def get_lr(self):
        """Return NaN, as no learning rate is ever computed."""
        return float("NaN")

    def print_lr(self, is_verbose, group, lr, epoch=None):
        """Print NaN; all arguments are ignored."""
        print(float("NaN"))


"""FairSeq-like inverse-square-root scheduler:"""


def get_inverse_sqrt_scheduler(optimizer, num_warmup_steps, num_cooldown_steps, num_training_steps):
    """Build a LambdaLR with linear warmup, inverse-square-root decay and a linear cooldown.

    The returned multiplier of the optimizer's base learning rate is

      step / num_warmup_steps                      while step < num_warmup_steps,
      sqrt(num_warmup_steps) / sqrt(step)          up to step == num_training_steps - num_cooldown_steps,
      a straight line from the decayed value at the start of the cooldown to zero
      at num_training_steps (never below zero) afterwards.

    This follows the FairSeq inverse-sqrt schedule, where decay_factor = lr * sqrt(warmup_updates).
    """
    warmup_increment = 1 / num_warmup_steps
    decay_factor = num_warmup_steps**0.5
    cooldown_start = num_training_steps - num_cooldown_steps
    # Multiplier reached at the moment the cooldown begins.
    decayed_lr = decay_factor * cooldown_start**-0.5

    def lr_lambda(current_step: int):
        if current_step < num_warmup_steps:
            return float(current_step * warmup_increment)
        if current_step > cooldown_start:
            remaining_steps = num_training_steps - current_step
            return max(0.0, float(decayed_lr * remaining_steps / num_cooldown_steps))
        return float(decay_factor * current_step**-0.5)

    return LambdaLR(optimizer, lr_lambda, last_epoch=-1)


def get_one_cycle(optimizer, num_training_steps):
    """Simple single-cycle scheduler. Not including paper/fastai three-phase things or asymmetry.

    The learning-rate multiplier rises linearly from 0 to 1 over the first half of
    ``num_training_steps`` and falls linearly back to 0 over the second half.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_training_steps: Length of the full cycle in scheduler steps.

    Returns:
        A ``LambdaLR`` implementing the cycle.
    """
    half_cycle = num_training_steps / 2

    def lr_lambda(current_step):
        if current_step < half_cycle:
            return float(current_step / half_cycle)
        # Clamp at zero: past num_training_steps the falling line would otherwise produce a
        # negative multiplier, i.e. a negative learning rate.
        return max(0.0, float(2 - current_step / half_cycle))

    return LambdaLR(optimizer, lr_lambda, -1)


def get_ramp(optimizer, num_cooldown_steps, num_training_steps):
    """Linearly increasing schedule with a final linear cooldown.

    The multiplier is ``step / num_training_steps`` until the cooldown begins; over the last
    ``num_cooldown_steps`` steps it falls linearly from the value reached at that point to zero.
    """
    cooldown_start = num_training_steps - num_cooldown_steps
    peak = cooldown_start / num_training_steps

    def lr_lambda(current_step):
        if current_step <= cooldown_start:
            return float(current_step / num_training_steps)
        steps_left = num_training_steps - current_step
        return max(0.0, float(peak * steps_left / num_cooldown_steps))

    return LambdaLR(optimizer, lr_lambda, -1)


"""Wallclock time schedulers."""
def _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, prev_elapsed_time: float = 0.0):
    """Map elapsed wallclock time onto the planned step schedule.

    The elapsed time is ``time.time() - initial_time`` plus ``prev_elapsed_time`` (both in seconds);
    the result is the fraction of ``hour_budget`` (hours) used so far, scaled to ``num_training_steps``
    and truncated to an int. Step 0 always maps to 0.
    """
    hours_used = (time.time() - initial_time + prev_elapsed_time) / 60 / 60
    # Warning: the used fraction of the budget exceeds 1 once the original budget has been passed,
    # so be careful when resuming from checkpoints.
    return 0 if current_step == 0 else int(hours_used / hour_budget * num_training_steps)


def get_budget_inv_sqrt_scheduler(optimizer, hour_budget, num_warmup_steps, num_cooldown_steps, num_training_steps, elapsed_time: float = 0.0):
    """Wallclock-budgeted scheduler in the style of Izsak et al., combined with inverse-sqrt decay.

    ``num_warmup_steps``, ``num_cooldown_steps`` and ``num_training_steps`` describe the planned
    schedule as usual, but the position on that schedule is derived from measured wallclock time
    (see ``_get_fake_step``), so the whole plan is squeezed into ``hour_budget`` hours.
    ``elapsed_time`` is forwarded to ``_get_fake_step`` as time already spent before this scheduler
    was created.

    Reference: https://github.com/IntelLabs/academic-budget-bert/blob/main/pretraining/schedules.py
    """
    sqrt_scale = num_warmup_steps**0.5
    cooldown_start = num_training_steps - num_cooldown_steps
    cooldown_peak = sqrt_scale * cooldown_start**-0.5
    initial_time = time.time()

    def lr_lambda(current_step: int):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        if fake_step < num_warmup_steps:
            # linear warmup
            return float(fake_step / num_warmup_steps)
        if fake_step <= cooldown_start:
            # inverse square-root decay
            return float(sqrt_scale * fake_step**-0.5)
        # linear cooldown to zero
        return max(0.0, float(cooldown_peak * (num_training_steps - fake_step) / num_cooldown_steps))

    return LambdaLR(optimizer, lr_lambda, last_epoch=-1)


def get_budget_constant_scheduler(optimizer, hour_budget, num_warmup_steps, num_cooldown_steps, num_training_steps, elapsed_time: float = 0.0):
    """Wallclock-budgeted constant schedule with optional linear warmup and cooldown (a trapezoid)."""
    initial_time = time.time()
    cooldown_start = num_training_steps - num_cooldown_steps

    def lr_lambda(current_step: int):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        if fake_step < num_warmup_steps:
            return float(fake_step / num_warmup_steps)
        if fake_step <= cooldown_start:
            return 1.0
        return max(0.0, float((num_training_steps - fake_step) / num_cooldown_steps))

    return LambdaLR(optimizer, lr_lambda, last_epoch=-1)


def get_budget_linear_schedule_with_warmup(optimizer, hour_budget, num_warmup_steps, num_training_steps, num_cycles=0.5, elapsed_time: float = 0.0):
    """Wallclock-budgeted variant of the huggingface transformers scheduler of the same name.

    Linear warmup from 0 to 1 followed by linear decay to 0, evaluated at the budget-derived step.
    ``num_cycles`` is accepted for signature parity with the cosine variants and is not used.
    """
    initial_time = time.time()

    def lr_lambda(current_step):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        if fake_step < num_warmup_steps:
            warmup_span = float(max(1, num_warmup_steps))
            return float(fake_step) / warmup_span
        decay_span = float(max(1, num_training_steps - num_warmup_steps))
        steps_left = float(num_training_steps - fake_step)
        return max(0.0, steps_left / decay_span)

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_cosine_schedule_with_warmup(optimizer, hour_budget, num_warmup_steps, num_training_steps, num_cycles=0.5, elapsed_time: float = 0.0):
    """Wallclock-budgeted variant of the huggingface transformers scheduler of the same name.

    Linear warmup, then ``0.5 * (1 + cos(2 * pi * num_cycles * progress))`` where ``progress`` is the
    fraction of the post-warmup schedule completed, evaluated at the budget-derived step.
    """
    initial_time = time.time()

    def lr_lambda(current_step):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        if fake_step < num_warmup_steps:
            return float(fake_step) / float(max(1, num_warmup_steps))
        decay_span = float(max(1, num_training_steps - num_warmup_steps))
        progress = float(fake_step - num_warmup_steps) / decay_span
        angle = math.pi * float(num_cycles) * 2.0 * progress
        return max(0.0, 0.5 * (1.0 + math.cos(angle)))

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_cosine_half_cycles_with_warmup(optimizer, hour_budget, num_warmup_steps, num_training_steps, num_cycles=0.5, elapsed_time: float = 0.0):
    """Wallclock-budgeted cosine schedule built from repeated half-cosine segments.

    After a linear warmup the multiplier is ``0.5 * (1 + cos(pi * phase))`` with
    ``phase = (num_cycles * progress) mod 1``, so the curve restarts from 1 each time
    ``num_cycles * progress`` crosses an integer. ``progress`` is the fraction of the post-warmup
    schedule completed at the budget-derived step.
    """
    initial_time = time.time()

    def lr_lambda(current_step):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        if fake_step < num_warmup_steps:
            return float(fake_step) / float(max(1, num_warmup_steps))
        decay_span = float(max(1, num_training_steps - num_warmup_steps))
        progress = float(fake_step - num_warmup_steps) / decay_span
        phase = (float(num_cycles) * progress) % 1.0
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * phase)))

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_one_cycle(optimizer, hour_budget, num_training_steps, elapsed_time: float = 0.0):
    """Wallclock-budgeted symmetric triangle: linear rise to 1 over the first half of the planned
    schedule, linear fall over the second half (no fastai-style extra phases or asymmetry)."""
    initial_time = time.time()
    half_way = num_training_steps / 2

    def lr_lambda(current_step):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        fraction = fake_step / half_way
        return float(fraction) if fake_step < half_way else float(2 - fraction)

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_multi_cycle(optimizer, hour_budget, num_training_steps, num_cycles=8, elapsed_time: float = 0.0):
    """Wallclock-budgeted multi-cycle triangle schedule (no fastai-style extra phases or asymmetry).

    The planned schedule of ``num_training_steps`` steps is split into ``num_cycles`` cycles of
    ``int(num_training_steps / num_cycles)`` steps each. Within every cycle the multiplier rises
    linearly from 0 to 1 over the first half and falls back to 0 over the second half. The position
    on the schedule comes from ``_get_fake_step``, i.e. from measured wallclock time relative to
    ``hour_budget``; ``elapsed_time`` is the time already spent before this scheduler was created.
    """
    initial_time = time.time()
    cycle_length = int(num_training_steps / num_cycles)

    def lr_lambda(current_step):
        # Budget-derived step, wrapped into the current cycle.
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time) % cycle_length
        if fake_step < cycle_length / 2:
            return float(fake_step / (cycle_length / 2))
        else:
            return float(2 - fake_step / (cycle_length / 2))

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_ramp(optimizer, hour_budget, num_cooldown_steps, num_training_steps, elapsed_time: float = 0.0):
    """Wallclock-budgeted linear ramp with a final linear cooldown.

    The multiplier grows as ``fake_step / num_training_steps`` until the cooldown begins, then
    falls linearly from the value reached at that point to zero over ``num_cooldown_steps`` steps.
    """
    initial_time = time.time()
    cooldown_start = num_training_steps - num_cooldown_steps
    peak = cooldown_start / num_training_steps

    def lr_lambda(current_step):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        if fake_step <= cooldown_start:
            return float(fake_step / num_training_steps)
        steps_left = num_training_steps - fake_step
        return max(0.0, float(peak * steps_left / num_cooldown_steps))

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_inv_cosine_schedule(optimizer, hour_budget, num_cooldown_steps, num_training_steps, num_cycles=0.5, elapsed_time: float = 0.0):
    """Wallclock-budgeted inverse cosine schedule with a linear cooldown.

    Before the cooldown the multiplier is ``0.5 * (1 + cos(2 * pi * num_cycles * (1 - step / total)))``,
    evaluated at the budget-derived step. Over the last ``num_cooldown_steps`` steps it falls linearly
    from the value that curve takes at the start of the cooldown down to zero.
    """
    initial_time = time.time()

    def inverse_cosine(step):
        # Cosine curve evaluated on the *remaining* fraction of the schedule.
        progress = 1 - step / float(max(1, num_training_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

    cooldown_start = num_training_steps - num_cooldown_steps
    peak = inverse_cosine(cooldown_start)

    def lr_lambda(current_step):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        if fake_step <= cooldown_start:
            return inverse_cosine(fake_step)
        return max(0.0, float(peak * (num_training_steps - fake_step) / num_cooldown_steps))

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_triangle(optimizer, hour_budget, num_training_steps, base_percentage=0.5, falloff=0.5, elapsed_time: float = 0.0):
    """Wallclock-budgeted triangle: linear increase starting at ``base_percentage`` of the base
    learning rate, followed by a linear decay; the multiplier is the minimum of the two lines.

    plot min(0.5 + x * (1 - 0.5)/(1-0.25) / 1000, 1/0.25 - x / (1000 * 0.25)) from 0 to 1000 in the plot range 0 to 1
    """
    initial_time = time.time()

    def lr_lambda(current_step):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        rising_line = base_percentage + fake_step * (1 - base_percentage) / (1 - falloff) / num_training_steps
        falling_line = float(1 / falloff - fake_step / (num_training_steps * falloff))
        return min(rising_line, falling_line)

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_dive(optimizer, hour_budget, num_training_steps, num_warmup_steps=0, falloff=0.5, elapsed_time: float = 0.0):
    """Wallclock-budgeted schedule: optional linear warmup, constant, then linear decay.

    plot min(1, 1/0.5 - x / (1000 * 0.5)) from 0 to 1000 in the plot range 0 to 1

    The position on the schedule is the budget-derived step from ``_get_fake_step``. While that
    step is below ``num_warmup_steps`` the multiplier rises linearly from 0 to 1; afterwards it is
    ``min(1, 1 / falloff - fake_step / (num_training_steps * falloff))``, i.e. constant at 1 until
    the falling line drops below 1. ``elapsed_time`` is the time already spent before this
    scheduler was created.
    """
    initial_time = time.time()

    def lr_lambda(current_step):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)
        # The warmup test must use the same budget-derived step as the warmup value; testing the raw
        # step counter instead lets the ratio below exceed 1 whenever wallclock time runs ahead.
        if fake_step < num_warmup_steps:
            return float(fake_step) / float(max(1, num_warmup_steps))
        else:
            return min(1.0, float(1 / falloff - fake_step / (num_training_steps * falloff)))

    return LambdaLR(optimizer, lr_lambda, -1)


def get_budget_polynomial_decay_with_warmup(optimizer, hour_budget, num_warmup_steps, num_training_steps, lr_end=0.0, power=1.0, elapsed_time: float = 0.0):
    """Wallclock-budgeted variant of the huggingface transformers scheduler of the same name.

    Linear warmup from 0 to the optimizer's default learning rate, then polynomial decay (exponent
    ``power``) from that rate down to ``lr_end``. All phases are evaluated at the budget-derived
    step from ``_get_fake_step``; ``elapsed_time`` is the time already spent before this scheduler
    was created. The optimizer's default ``lr`` must be non-zero, as multipliers are expressed
    relative to it.
    """
    initial_time = time.time()
    lr_init = optimizer.defaults["lr"]

    def lr_lambda(current_step: int):
        fake_step = _get_fake_step(current_step, initial_time, hour_budget, num_training_steps, elapsed_time)

        if fake_step < num_warmup_steps:
            return float(fake_step) / float(max(1, num_warmup_steps))
        elif fake_step > num_training_steps:
            return lr_end / lr_init  # as LambdaLR multiplies by lr_init
        else:
            lr_range = lr_init - lr_end
            decay_steps = num_training_steps - num_warmup_steps
            # Decay progress follows the budget-derived step, like the branch conditions above; the raw
            # step counter is unrelated to the wallclock budget and could drive pct_remaining negative.
            pct_remaining = 1 - (fake_step - num_warmup_steps) / decay_steps
            decay = lr_range * pct_remaining**power + lr_end
            return decay / lr_init  # as LambdaLR multiplies by lr_init

    return LambdaLR(optimizer, lr_lambda, -1)


================================================
FILE: cramming/backend/prepare_backend.py
================================================
"""Instantiate backend objects in a congruent format."""
import torch

from .torch_default import initialize_torch

_default_setup = dict(device=torch.device("cpu"), dtype=torch.float)


def load_backend(model, tokenizer, cfg_train, cfg_impl, setup=_default_setup, init_compile_and_distribute=True):
    """Instantiate the training engine selected by ``cfg_impl.name``.

    Only the "torch-default" backend is available; any other name raises a ValueError.
    """
    if cfg_impl.name != "torch-default":
        raise ValueError(f"Invalid backend {cfg_impl.name} given.")
    return initialize_torch(model, tokenizer, cfg_train, cfg_impl, setup=setup, init_compile_and_distribute=init_compile_and_distribute)


================================================
FILE: cramming/backend/torch_default.py
================================================
"""Basic training backend engine for pytorch training with all bells and whistles.

Interface set up to be compliant with the deepspeed engine interface.


There are two versions here, the TorchEngineMinimal, which is the default, and TorchEngineFull which contains a few training variations
that were tested but ultimately discarded, so read that part only if you're interested.

"""

import json
import logging
import os
import time
from contextlib import nullcontext
from functools import partial
from typing import Any, Dict, Union

import torch
import torch._inductor.utils
import transformers
from omegaconf import OmegaConf
from safetensors.torch import save_file
from torch.distributed.optim import ZeroRedundancyOptimizer
from transformers.utils.generic import working_or_temp_dir

from .optimizers import LARS, SAM, ProgressiveBatching
from .optimizers.schedulers import get_schedule_fn

# from .utils import group_parameters, prepare_pretraining_dataloader, prepare_validation_dataloader
from .utils import group_parameters, load_model_checkpoint

log = logging.getLogger(__name__)
_default_setup = dict(device=torch.device("cpu"), dtype=torch.float)
import warnings
from ..utils import flatten

warnings.filterwarnings("ignore", "Detected call of ", UserWarning)  # schedulers are deliberately used differently


def initialize_torch(model, tokenizer, cfg_train, cfg_impl, setup=_default_setup, init_compile_and_distribute=True):
    """Wrap ``model`` in a TorchEngine and return the engine switched to training mode.

    The engine's sequence length is taken from ``tokenizer.model_max_length``.
    """
    engine_options = dict(
        setup=setup,
        seq_length=tokenizer.model_max_length,
        init_compile_and_distribute=init_compile_and_distribute,
    )
    model_engine = TorchEngine(model, cfg_train, cfg_impl, **engine_options)
    model_engine.train()
    return model_engine


class TorchEngine(torch.nn.Module):
    """This class mirrors deepspeed functionality and hides variable batch sizes, microbatching, AMP details and compilation"""

    def __init__(self, model, cfg_train, cfg_impl, setup=_default_setup, seq_length=128, init_compile_and_distribute=True):
        """Set up the engine: AMP settings, device placement, compilation, optional DDP wrapping,
        microbatch bookkeeping, and the optimizer/scheduler pair.

        Args:
            model: The model to train. It is moved with ``model.to(**setup)`` and kept, uncompiled,
                as ``self._original_model``.
            cfg_train: Training config; read here for ``batch_size``, ``batch_size_ramp`` and, via
                ``get_true_budget``, ``budget`` / ``overall_budget``.
            cfg_impl: Implementation config; read here for microbatch size, mixed-precision options
                and the ``torch.compile`` options.
            setup: Dict of keyword arguments for ``model.to``; its ``"device"`` entry also decides
                whether AMP and gradient scaling are enabled.
            seq_length: Stored as ``self.current_seq_length``.
            init_compile_and_distribute: When False and torch.distributed is initialized, the model
                is NOT wrapped for distributed training here, because a checkpoint load is expected
                to redo that later.

        Raises:
            ValueError: If the microbatch size exceeds the batch size.
        """

        super().__init__()

        self.cfg_train = cfg_train
        self.cfg_impl = cfg_impl
        # An unset microbatch size means "no accumulation": one microbatch is the full batch.
        if self.cfg_impl.microbatch_size is None:
            self.cfg_impl.microbatch_size = self.cfg_train.batch_size
        if self.cfg_impl.microbatch_size > self.cfg_train.batch_size:
            raise ValueError(f"MBS is {self.cfg_impl.microbatch_size}, but BS is only {self.cfg_train.batch_size}.")
        self.current_seq_length = seq_length

        # Mixed Precision:
        enabled = self.cfg_impl.mixed_precision if setup["device"].type != "cpu" else False
        # Modules like LN are unsupported on CPU amp, so mixed precision args are disregarded on CPU
        # See https://pytorch.org/docs/stable/amp.html#cpu-op-specific-behavior and check for layer_norm
        enable_scaling = self.cfg_impl.grad_scaling and self.cfg_impl.mixed_precision and setup["device"].type != "cpu"
        self.scaler = torch.cuda.amp.GradScaler(enabled=enable_scaling)
        # The configured target dtype is only honoured off-CPU; on CPU the dtype is fixed to bfloat16.
        amp_dtype = getattr(torch, self.cfg_impl.mixed_precision_target_dtype) if setup["device"].type != "cpu" else torch.bfloat16
        # Keyword arguments for every torch.autocast(...) context opened by this engine.
        self.amp_settings = dict(device_type=setup["device"].type, enabled=enabled, dtype=amp_dtype)

        # Choose setup and move model
        self.setup = setup
        model.to(**self.setup)
        # Keep a handle on the uncompiled model; generation and checkpoint loading go through it.
        self._original_model = model
        log.info("Compiling model, in the Constructor of TorchEngine")
        model = torch.compile(
            model,
            mode=self.cfg_impl.mode,
            dynamic=self.cfg_impl.dynamic,
            fullgraph=self.cfg_impl.fullgraph,
            backend=self.cfg_impl.backend,
            disable=not cfg_impl.compile_torch,
            # detailed options; cannot be given at the same time as mode:
            options=flatten(cfg_impl._inductor_vars, parent_key="", sep=".") if cfg_impl._inductor_vars is not None else None,
        )

        if torch.distributed.is_initialized():
            if init_compile_and_distribute:
                log.info("Distributing model, in the Constructor of TorchEngine")
                self.model = self._init_distributed(model)
            else:
                log.info(
                    "<WARNING> NOT Distirbuting model in the Constructor of TorchEngine, we will attempt to do this later as we are loading in a checkpoint"
                )
                self.model = model
            self.num_machines = torch.distributed.get_world_size()
        else:
            self.model = model
            # Give the non-distributed model a no-op `no_sync` so forward/backward can use it uniformly.
            self.model.no_sync = nullcontext
            self.num_machines = 1

        # Microbatch accumulation settings and counters
        self.effective_mbs = self.cfg_impl.microbatch_size * self.num_machines  # across machines
        # With a batch size ramp configured, training starts at a single (effective) microbatch.
        self.current_batch_size = self.cfg_train.batch_size if self.cfg_train.batch_size_ramp == 0 else self.effective_mbs
        self.accumulation_steps_expected = self.current_batch_size // self.effective_mbs
        self.accumulated_samples = 0  # Record the number of samples seen, reset after triggering gradient update
        self.steps = 0  # Record the number of times "step" has been triggered
        self.steps_since_reset = 0  # Record the number of times "step" has been triggered

        self.initial_time = time.time()
        self.previous_elapsed_time = 0.0  # overwritten by load_metadata when resuming from a checkpoint
        # NOTE: the optimizer is built from the compiled (but not distributed-wrapped) `model`.
        self.optimizer, self.scheduler = _load_optimizer(model, cfg_train, cfg_impl, self.previous_elapsed_time, self.get_true_budget())

    def get_true_budget(self):
        """Return the total budget in hours for this run, counting time spent in earlier sessions.

        The current session may use ``cfg_train.budget`` hours, but no more than what is left of
        ``cfg_train.overall_budget`` after the previously elapsed time; the previously elapsed time
        (stored in seconds) is then added back on.
        """
        hours_already_spent = self.previous_elapsed_time / 3600
        session_budget = min(self.cfg_train.budget, self.cfg_train.overall_budget - hours_already_spent)
        return session_budget + hours_already_spent

    def step(self, batch: dict[str, torch.Tensor]):
        """Run forward, backward and optimizer_step on one microbatch; return the detached loss."""
        model_output = self.forward(**batch)
        loss = model_output["loss"]
        self.backward(loss)
        self.optimizer_step()
        return loss.detach()

    def to_device(self, batch: dict[str, torch.Tensor], keys: tuple[str, ...] = ("input_ids",)):
        """Move the selected entries of a batch into device memory.

        Args:
            batch: Mapping from field name to tensor.
            keys: Names of the entries to transfer; all other entries are dropped. Any container
                supporting ``in`` works (lists passed by existing callers remain valid). The default
                is an immutable tuple so it cannot be modified across calls.

        Returns:
            A new dict holding only the requested entries on ``self.setup["device"]``. ``input_ids``
            is additionally cast to ``torch.long``; other entries keep their dtype. Transfers are
            issued with ``non_blocking=True``.
        """
        device_batch = {
            k: v.to(device=self.setup["device"], dtype=torch.long if k == "input_ids" else None, non_blocking=True)
            for k, v in batch.items()
            if k in keys  # Add more keywords here if needed
        }
        return device_batch

    def forward(self, *inputs, **kwargs):
        """Run the model on one microbatch under autocast and count its samples.

        While the accumulated sample count is still below the current batch size, the call runs
        inside ``self.model.no_sync``; otherwise inside a no-op context.
        """
        self.accumulated_samples += self.effective_mbs
        still_accumulating = self.accumulated_samples < self.current_batch_size
        sync_context = self.model.no_sync if still_accumulating else nullcontext
        with sync_context(), torch.autocast(**self.amp_settings):
            return self.model(*inputs, **kwargs)

    def backward(self, loss):
        """Backpropagate ``loss`` divided by the expected number of accumulation steps, after
        passing it through the gradient scaler.

        Uses the same ``no_sync`` / no-op context selection as ``forward``.
        """
        still_accumulating = self.accumulated_samples < self.current_batch_size
        sync_context = self.model.no_sync if still_accumulating else nullcontext
        with sync_context():
            averaged_loss = loss / self.accumulation_steps_expected
            return self.scaler.scale(averaged_loss).backward()

    @torch.no_grad()
    @torch._dynamo.disable()
    def forward_inference(self, *inputs, **kwargs):
        """Run the model under autocast without gradients.

        Returns:
            A tuple ``(logits, predictions)`` where ``logits`` is the ``"logits"`` entry of the model
            output and ``predictions`` is its argmax over the last dimension.
        """
        with torch.autocast(**self.amp_settings):
            model_output = self.model(*inputs, **kwargs)
            logits = model_output["logits"]
        return logits, logits.argmax(dim=-1)

    @torch._dynamo.disable()
    @torch.inference_mode()
    def dynamic_generation(self, *inputs, temperature=0.7, token_limit=100, **kwargs):
        """Generate tokens with the original (uncompiled) model under autocast.

        Tries, in order: the model's own ``_generate`` method, then a huggingface-style ``generate``
        (sampling, single beam). If neither exists, or if the chosen method raises any exception,
        falls back to a naive loop that samples ``token_limit`` tokens one at a time.

        Args:
            *inputs: Positional model inputs; in the fallback path ``inputs[0]`` is the sequence
                that gets extended with each sampled token.
            temperature: Sampling temperature forwarded to the generation method.
            token_limit: Number of new tokens to generate.
            **kwargs: Forwarded to the generation method / model call.

        Returns:
            Whatever the generation method returns; in the fallback path, only the newly sampled
            tokens concatenated along the last dimension (the prompt is not included).
        """
        with torch.autocast(**self.amp_settings):
            try:
                if hasattr(self._original_model, "_generate"):  # my signature
                    outputs = self._original_model._generate(*inputs, temperature=temperature, token_limit=token_limit, **kwargs)
                elif hasattr(self._original_model, "generate"):  # hf signature
                    outputs = self._original_model.generate(
                        *inputs, do_sample=True, num_beams=1, temperature=temperature, max_new_tokens=token_limit, **kwargs
                    )
                else:
                    # Deliberately caught by the broad handler below to reach the fallback loop.
                    raise NotImplementedError()
            except Exception as e:  # Fallback
                log.info(f"Falling back to default generation scheme due to error {e} in model._generate or model.generate.")
                # Generate new tokens the dumb way as a fall-back
                # need to implement the improved way for transformers eventually
                device_inputs = inputs[0]
                predicted_ids = []
                for gen_idx in range(token_limit):
                    # The full, growing sequence is re-run through the model for every new token.
                    logits = self._original_model(device_inputs, *inputs[1:], **kwargs)["logits"]
                    # NOTE(review): logits are multiplied by the temperature here, whereas temperature
                    # scaling is conventionally a division — confirm which is intended.
                    # NOTE(review): torch.multinomial expects a 1-D or 2-D input, so this presumably
                    # assumes logits of shape (batch, vocab) — TODO confirm against the model output.
                    predicted_token = torch.multinomial(torch.softmax(logits * temperature, dim=-1), 1)
                    device_inputs = torch.cat([device_inputs, predicted_token], dim=-1)
                    predicted_ids += [predicted_token]
                outputs = torch.cat(predicted_ids, dim=-1)
        return outputs

    def optimizer_step(self):
        """Advance the step counters and, once a full batch has been accumulated, apply the update.

        A full update optionally clips gradients (after unscaling), steps the optimizer through the
        gradient scaler, clears gradients and re-evaluates the batch size schedule. The scheduler is
        stepped on every call, so it must be iteration-based rather than epoch-based.
        """
        self.steps += 1
        self.steps_since_reset += 1

        batch_complete = self.accumulated_samples >= self.current_batch_size
        if batch_complete:
            self.accumulated_samples = 0

            clip_value = self.cfg_train.gradient_clipping
            if clip_value is not None:
                # Gradients have to be unscaled before their norm can be clipped.
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_value, norm_type=2.0)
            self.scaler.step(self.optimizer)
            self.scaler.update()
            self.optimizer.zero_grad()
            self.schedule_batch_size()

        # Stepping the scheduler per microbatch keeps it independent of gradient accumulation.
        self.scheduler.step()

    def set_train_batch_size(self, batch_size):
        """Change the batch size on the fly and recompute the expected accumulation steps
        (batch size floor-divided by the effective microbatch size)."""
        self.current_batch_size = batch_size
        self.accumulation_steps_expected = batch_size // self.effective_mbs

    def schedule_batch_size(self):
        """Apply the optional linear batch size ramp-up.

        ``cfg_train.batch_size_ramp`` strictly between 0 and 1 is read as a fraction of the total
        budget, with progress measured in wallclock time. Otherwise it is read as a number of steps,
        and the ramp is active while ``self.steps`` is below it. In both ramps the batch size is a
        multiple of the effective microbatch size; outside the ramp the full batch size is used.
        """
        mbs = self.effective_mbs
        ramp = self.cfg_train.batch_size_ramp
        full_batch_size = self.cfg_train.batch_size

        if 0 < ramp < 1:
            # Ramp expressed as a fraction of the budget: locate ourselves on the planned schedule by time.
            seconds_used = (time.time() - self.initial_time) + self.previous_elapsed_time
            hours_used = seconds_used / 60 / 60
            fake_step = int(hours_used / self.get_true_budget() * self.cfg_train.steps)
            # WARNING: this does not pick up correctly from a checkpoint once the elapsed time exceeds
            # the original budget.
            growth_per_step = full_batch_size / (self.cfg_train.steps * ramp)
            target = min(int(fake_step * growth_per_step // mbs + 1) * mbs, full_batch_size)
        elif self.steps < ramp:
            # Ramp expressed as a number of steps.
            growth_per_step = full_batch_size / ramp
            target = int(self.steps * growth_per_step // mbs + 1) * mbs
        else:
            target = full_batch_size

        self.set_train_batch_size(target)

    def record_batch_size(self):
        """Return the batch size to report; under the "progressive-batching" optimizer modifier the
        current batch size is multiplied by the optimizer's ``last_full_step_accumulation``."""
        uses_progressive_batching = self.cfg_train.optim_mod.name == "progressive-batching"
        if uses_progressive_batching:
            return self.optimizer.last_full_step_accumulation * self.current_batch_size
        return self.current_batch_size

    def record_tokens_per_step(self):
        """Return the token count of one microbatch step: sequence length times effective microbatch size."""
        tokens_per_step = self.current_seq_length * self.effective_mbs
        return tokens_per_step

    @torch.no_grad()
    def retrieve_model_state_dict(self):
        """Return a copy of the underlying model's state dict with contiguous, cloned tensors.

        The distributed wrapper (``.module``) is peeled off when torch.distributed is initialized,
        and the compile wrapper (``._orig_mod``) when ``cfg_impl.compile_torch`` is set.
        """
        module = self.model
        if torch.distributed.is_initialized():
            module = module.module  # strip the distributed wrapper
        if self.cfg_impl.compile_torch:
            module = module._orig_mod  # strip the torch.compile wrapper

        return {name: tensor.clone().contiguous() for name, tensor in module.state_dict().items()}

    def _init_distributed(self, model):
        """Wrap ``model`` in DistributedDataParallel using the options from ``cfg_impl``.

        Device ids and output device are only passed when the configured device is a CUDA device.
        """
        device = self.setup["device"]
        on_cuda = device.type == "cuda"
        return torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[device] if on_cuda else None,
            output_device=device if on_cuda else None,
            broadcast_buffers=self.cfg_impl.broadcast_buffers,
            bucket_cap_mb=self.cfg_impl.bucket_cap_mb,
            gradient_as_bucket_view=self.cfg_impl.gradient_as_bucket_view,
            static_graph=self.cfg_impl.static_graph,
        )

    def load_checkpoint(self, cfg_arch, file, skip_optim_state=False) -> Dict[str, Any]:
        """Load model weights (and optionally training state) from a checkpoint.

        Args:
            cfg_arch: Architecture config, passed to ``from_pretrained`` for ``hf://`` checkpoints.
            file: Either ``"hf://<name>"`` to load through ``from_pretrained`` (a name ending in
                ``"-untrained"`` loads nothing), or a local checkpoint location understood by
                ``load_model_checkpoint``; in the local case the training state is read from
                ``<file>/state_dict.pth``.
            skip_optim_state: For local checkpoints, skip restoring optimizer, scheduler, scaler
                and metadata.

        Returns:
            The checkpoint metadata, or default metadata when none was loaded.

        Raises:
            ValueError: If the training state cannot be loaded; the original exception is attached
                as the cause.
        """
        self.optimizer.zero_grad()
        # defaults
        metadata = {"epochs": 0, "steps": 0, "loss": 0, "data_idx": 0, "elapsed_time": 0.0}
        if file.startswith("hf://"):
            if file.endswith("-untrained"):
                log.info("Loading NO pretrained model as a sanity check ...")
            else:
                self.model = self.model.from_pretrained(file.split("hf://")[1], config=cfg_arch).to(**self.setup)
                # reinit optimizer:
                self.optimizer, self.scheduler = _load_optimizer(
                    self.model, self.cfg_train, self.cfg_impl, metadata.get("elapsed_time", 0.0), self.get_true_budget()
                )
        else:
            # we load back into original model as we want to redistribute the weights across ranks to be super sure!!
            model = load_model_checkpoint(self._original_model, file)
            model.to(**self.setup)
            # reinitialising the model as we are losing the compile speed otherwise
            model = torch.compile(
                model,
                mode=self.cfg_impl.mode,
                dynamic=self.cfg_impl.dynamic,
                fullgraph=self.cfg_impl.fullgraph,
                backend=self.cfg_impl.backend,
                disable=not self.cfg_impl.compile_torch,
                # detailed options; cannot be given at the same time as mode:
                options=flatten(self.cfg_impl._inductor_vars, parent_key="", sep=".") if self.cfg_impl._inductor_vars is not None else None,
            )
            if torch.distributed.is_initialized():
                self.model = self._init_distributed(model)
                log.info("Recompiled and distributed")
            else:
                self.model = model
                log.info("Recompiled")

            if not skip_optim_state:
                state_file = os.path.join(file, "state_dict.pth")
                try:
                    loaded = torch.load(state_file)
                    optim_state = loaded["optim_state"]
                    scheduler_state = loaded["scheduler_state"]
                    scaler_state = loaded["scaler_state"]
                    metadata = loaded["metadata"]
                    # Must run before the optimizer is rebuilt: it restores previous_elapsed_time,
                    # which both the scheduler and get_true_budget() depend on.
                    self.load_metadata(metadata)

                    # this is mainly so that the scheduler knows about the elapsed time
                    self.optimizer, self.scheduler = _load_optimizer(
                        self.model, self.cfg_train, self.cfg_impl, self.previous_elapsed_time, self.get_true_budget()
                    )
                    self.optimizer.load_state_dict(optim_state)
                    self.scheduler.load_state_dict(scheduler_state)

                    self.scaler.load_state_dict(scaler_state)
                    log.info(f"Successfully loaded state with metadata {metadata}")
                except Exception as e:
                    # Chain explicitly so the traceback of the underlying failure is preserved as the cause.
                    raise ValueError(f"Error loading optimizer and scheduler states from {state_file}. {e}") from e
        return metadata

    def load_metadata(self, metadata: Dict[str, Any]):
        """Restore the engine's step counter and previously elapsed time from checkpoint metadata,
        defaulting to 0 and 0.0 when the keys are missing."""
        restored_steps = metadata.get("steps", 0)
        restored_elapsed = metadata.get("elapsed_time", 0.0)
        self.steps = restored_steps
        self.previous_elapsed_time = restored_elapsed
        # restore further pieces of engine state here

    def save_training_checkpoint(self, checkpoint_directory: str, checkpoint_name: Union[str, float], metadata: Dict[str, Any]):
        """Write a resumable training checkpoint as two files under ``checkpoint_directory``.

        ``<name>_model_state.pth`` holds the model weights (written with safetensors ``save_file``);
        ``<name>_non_model.pth`` holds metadata plus optimizer, scaler and scheduler state (written
        with ``torch.save``). The directory is created if needed.
        """
        os.makedirs(checkpoint_directory, exist_ok=True)
        prefix = os.path.join(checkpoint_directory, checkpoint_name)

        optimizer_state = self.optimizer.state_dict()
        weights = self.retrieve_model_state_dict()
        scheduler_state = self.scheduler.state_dict()
        non_model_state = dict(
            metadata=metadata,
            optim_state=optimizer_state,
            scaler_state=self.scaler.state_dict(),
            scheduler_state=scheduler_state,
        )

        save_file(weights, f"{prefix}_model_state.pth")
        torch.save(non_model_state, f"{prefix}_non_model.pth")

    def save_final_model(self, base_directory, identifier, tokenizer, cfg_arch, dryrun=False):
        """Save a checkpoint for downstream use to ``base_directory/checkpoints/<identifier>``.

        ``identifier`` is formatted as a float with four decimals when possible, otherwise converted
        with ``str``. The target directory is always created; unless ``dryrun`` is set,
        ``model.safetensors`` and ``model_config.json`` are written into it. ``tokenizer`` is not
        used by this method.
        """
        try:
            identifier_str = format(identifier, "2.4f")
        except ValueError:
            identifier_str = str(identifier)

        full_path = os.path.join(base_directory, "checkpoints", identifier_str)
        os.makedirs(full_path, exist_ok=True)
        if dryrun:
            return

        # Model weights as safetensors (legacy alternative: torch.save(..., "model.pth")).
        weights_path = os.path.join(full_path, "model.safetensors")
        save_file(self.retrieve_model_state_dict(), weights_path)
        # Fully resolved architecture config next to the weights.
        config_path = os.path.join(full_path, "model_config.json")
        with open(config_path, "w") as file:
            json.dump(OmegaConf.to_container(cfg_arch, resolve=True), file)

    def save_model(
        self,
        checkpoint_directory: str,
        checkpoint_name: Union[str, float],
        cfg_arch,
        metadata: Dict[str, Any],
        tokenizer=None,
        save_safe: bool = False,
    ):
        """Save a checkpoint folder ``checkpoint_directory/checkpoint_name`` and return its path.

        Always writes ``model_config.json``. With ``save_safe`` the weights go to
        ``model.safetensors``; otherwise they are stored under ``"model_state"`` in
        ``state_dict.pth``. When ``metadata`` is not None, ``state_dict.pth`` additionally holds the
        metadata and the optimizer, scheduler and scaler states. ``state_dict.pth`` is only written
        if there is something to put in it. ``tokenizer`` is not used by this method.
        """
        full_path = os.path.join(checkpoint_directory, checkpoint_name)
        os.makedirs(full_path, exist_ok=True)

        config_path = os.path.join(full_path, "model_config.json")
        with open(config_path, "w") as file:
            json.dump(OmegaConf.to_container(cfg_arch, resolve=True), file)

        weights = self.retrieve_model_state_dict()
        if save_safe:
            # Like the final checkpoint: weights live in a safetensors file, not in state_dict.pth.
            save_file(weights, os.path.join(full_path, "model.safetensors"))
            state_dict = {}
        else:
            state_dict = {"model_state": weights}

        if metadata is not None:
            state_dict.update(
                {
                    "metadata": metadata,
                    "optim_state": self.optimizer.state_dict(),
                    "scheduler_state": self.scheduler.state_dict(),
                    "scaler_state": self.scaler.state_dict(),
                }
            )

        if state_dict:
            # With save_safe this file only carries the non-model state.
            torch.save(state_dict, os.path.join(full_path, "state_dict.pth"))

        return full_path

    def push_to_hub(self, tokenizer, cfg, dryrun=False):
        """Upload model, tokenizer and configs to the huggingface hub (counterpart of save_final_model).

        The target repository is ``cfg.impl.hf_directoy_name``. With ``dryrun`` set, nothing is
        uploaded and only a log message is emitted.
        """
        from huggingface_hub import HfApi
        from io import BytesIO

        api = HfApi()
        hub_name = cfg.impl.hf_directoy_name

        if dryrun:
            log.info(f"Skipping huggingface upload in dryrun state. Would upload to {hub_name}.")
            return

        log.info(f"Pushing model to hub repository {hub_name}.")
        final_state_dict = self.retrieve_model_state_dict()
        self.model.load_state_dict(final_state_dict)

        # Model upload with safetensors: a manual variant of model.push_to_hub,
        # which does not support safetensors yet.
        repo_id = hub_name
        if os.path.isdir(repo_id):
            working_dir = repo_id
            repo_id = repo_id.split(os.path.sep)[-1]
        else:
            working_dir = repo_id.split("/")[-1]
        repo_id = self.model._create_repo(repo_id)
        use_temp_dir = not os.path.isdir(working_dir)
        with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:
            files_timestamps = self.model._get_files_timestamps(work_dir)
            # Write every file, then upload whatever changed.
            self.model.save_pretrained(
                work_dir,
                max_shard_size="10GB",
                safe_serialization=True,
                state_dict=self.retrieve_model_state_dict(),
            )
            self.model._upload_modified_files(
                work_dir,
                repo_id,
                files_timestamps,
                commit_message=None,
                token=None,
                create_pr=None,
            )

        # Tokenizer upload
        tokenizer.push_to_hub(hub_name)

        # Config uploads, one JSON file per config group
        named_groups = (("arch", cfg.arch), ("data", cfg.data), ("train", cfg.train))
        for config_name, config_group in named_groups:
            buffer = BytesIO()
            buffer.write(json.dumps(OmegaConf.to_container(config_group, resolve=True), indent=4).encode())
            api.upload_file(
                path_or_fileobj=buffer,
                path_in_repo=f"{config_name}_budget_hours_{cfg.budget}.json",
                # the repo id is assembled from the logged-in user name; there may be a better way
                repo_id=f"{api.whoami()['name']}/{hub_name}",
                repo_type="model",
            )


def _load_optimizer(model, cfg_train, cfg_impl, elapsed_time=0.0, true_budget=-1):
    """Build the optimizer (optionally wrapped by a modifier) and its scheduler.

    Args:
        model: Model whose parameters are optimized; they are grouped through
            ``group_parameters(model, cfg_train)``.
        cfg_train: Training config. ``cfg_train.optim`` supplies ``type`` plus the
            keyword arguments for the optimizer constructor, ``cfg_train.optim_mod``
            supplies ``name`` plus the keyword arguments for the wrapper. The whole
            object is also forwarded to ``get_schedule_fn``.
        cfg_impl: Implementation config; ``foreach_optimizer`` and
            ``zero_redundancy_optimizer`` are read here.
        elapsed_time: Forwarded unchanged to ``get_schedule_fn``.
        true_budget: Forwarded unchanged to ``get_schedule_fn``.

    Returns:
        Tuple ``(optimizer, scheduler)``. When an optimizer modifier is configured,
        ``optimizer`` is the wrapper and the scheduler is attached to ``optimizer.optim``.

    Raises:
        ValueError: If ``cfg_train.optim.type`` or ``cfg_train.optim_mod.name`` is
            not one of the supported names.
    """
    # Filter some parameters
    grouped_parameters = group_parameters(model, cfg_train)

    # Select optimizer implementation
    if cfg_train.optim.type == "AdamW":
        optimizer_class = torch.optim.AdamW
    elif cfg_train.optim.type == "Adam":
        optimizer_class = torch.optim.Adam
    elif cfg_train.optim.type == "RAdam":
        optimizer_class = torch.optim.RAdam
    elif cfg_train.optim.type == "SGD":
        optimizer_class = torch.optim.SGD
    elif cfg_train.optim.type == "Adafactor":
        optimizer_class = transformers.Adafactor
    elif cfg_train.optim.type == "Shampoo":
        optimizer_class = Shampoo
    elif cfg_train.optim.type == "AdaHessian":
        optimizer_class = Adahessian
    elif cfg_train.optim.type == "AdamWScale":
        optimizer_class = AdamWScale
    elif cfg_train.optim.type == "Sophia-G":
        optimizer_class = Sophia
    elif cfg_train.optim.type == "Lion":
        # Optional dependency: only imported when this optimizer is requested.
        from lion_pytorch import Lion

        optimizer_class = Lion

    elif cfg_train.optim.type == "Adam8bit":
        # Optional dependency: only imported when this optimizer is requested.
        import bitsandbytes as bnb

        optimizer_class = bnb.optim.Adam8bit
    elif cfg_train.optim.type == "AGD":
        # AGD takes the number of parameter tensors as its ``depth`` argument.
        depth = len(list(model.parameters()))
        optimizer_class = partial(AGD, depth=depth)
    else:
        raise ValueError(f"Invalid optimizer {cfg_train.optim.type} given.")

    # Everything in the optim config except "type" is passed to the constructor.
    optimizer_args = {k: v for k, v in cfg_train.optim.items() if k != "type"}
    if cfg_impl.foreach_optimizer and cfg_train.optim.type != "Shampoo":
        optimizer_args["foreach"] = True

    if torch.distributed.is_initialized() and cfg_impl.zero_redundancy_optimizer:
        # The overlap option is a whole bucket of problems in itself for now...
        optimizer = ZeroRedundancyOptimizer(
            grouped_parameters,
            optimizer_class=optimizer_class,
            parameters_as_bucket_view=True,
            overlap_with_ddp=False,
            **optimizer_args,
        )
    else:
        optimizer = optimizer_class(grouped_parameters, **optimizer_args)

    if cfg_train.optim_mod.name == "none":
        optimizer_to_schedule = optimizer
    else:
        optim_params = {k: v for k, v in cfg_train.optim_mod.items() if k != "name"}
        if cfg_train.optim_mod.name == "LARS":
            optimizer = LARS(optimizer, **optim_params)
        elif cfg_train.optim_mod.name == "LARC":
            # LARC is built with the same LARS wrapper.
            # NOTE(review): presumably the two differ only through optim_params - confirm.
            optimizer = LARS(optimizer, **optim_params)
        elif cfg_train.optim_mod.name == "SAM":
            optimizer = SAM(optimizer, **optim_params)
        elif cfg_train.optim_mod.name == "progressive-batching":
            optimizer = ProgressiveBatching(optimizer, **optim_params)
        else:
            # Fail with a clear message instead of an unrelated AttributeError on
            # ``optimizer.optim`` (or silently scheduling an unwrapped optimizer).
            raise ValueError(f"Invalid optimizer modification {cfg_train.optim_mod.name} given.")

        # The scheduler has to drive the wrapped base optimizer, not the wrapper.
        optimizer_to_schedule = optimizer.optim

    scheduler = get_schedule_fn(cfg_train, elapsed_time=elapsed_time, true_budget=true_budget)(optimizer_to_schedule)

    return optimizer, scheduler


================================================
FILE: cramming/backend/utils.py
================================================
import logging
import os
import torch

import logging

from safetensors.torch import load_file, save_file
import cramming

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# NOTE(review): this string sits after the imports, so it is a plain expression
# statement and not the module docstring; move it to the top of the file if it
# is meant to be the module docstring.
"""Utilities common to several backends."""
def group_parameters(model, cfg_train):
    """Split the model parameters into weight-decay groups for the optimizer.

    Args:
        model: Object exposing ``named_parameters()`` yielding ``(name, param)`` pairs.
        cfg_train: Config providing ``limited_decay_keys`` (name fragments that
            exempt a parameter from weight decay) and ``optim.weight_decay``.

    Returns:
        If ``limited_decay_keys`` is empty, a flat list of all parameters.
        Otherwise a list of two param-group dicts: first the parameters whose name
        contains none of the keys (with ``optim.weight_decay``), then the parameters
        whose name contains at least one key (with weight decay ``0.0``).
    """
    named_params = list(model.named_parameters())
    exempt_keys = cfg_train.limited_decay_keys

    # Without exemption keys there is nothing to split: return the bare parameters.
    if len(exempt_keys) == 0:
        return [param for _, param in named_params]

    # Single pass: route every parameter to exactly one of the two groups.
    decayed, exempt = [], []
    for name, param in named_params:
        bucket = exempt if any(key in name for key in exempt_keys) else decayed
        bucket.append(param)

    return [
        {"params": decayed, "weight_decay": cfg_train.optim.weight_decay},
        {"params": exempt, "weight_decay": 0.0},
    ]


def get_model_engine_tokenizer_dataloaders(cfg, setup, train_eval: bool = True):
    """Build the model, the training engine (if needed), the tokenizer and the dataloaders.

    Checkpoint lookup is best-effort and happens in two steps: first the "latest"
    checkpoint under ``<cfg.model_dir>/<cfg.name>/checkpoints`` (to resume after
    preemption), then, if that fails and a checkpoint is configured,
    ``train_eval_cfg.checkpoint`` inside ``cfg.model_dir``. If neither can be
    loaded the model is built from ``cfg.arch`` and starts from scratch.

    Args:
        cfg: Full run config. Reads ``train``/``eval``, ``arch``, ``data``, ``impl``,
            ``model_dir``, ``name`` and ``data_dir``.
        setup: Passed through to ``cramming.load_backend`` as ``setup``.
        train_eval: ``True`` selects ``cfg.train`` and creates a model engine;
            ``False`` selects ``cfg.eval`` and only loads weights into the model.

    Returns:
        Tuple ``(model, model_engine, tokenizer, dataloaders, metadata)``.
        ``model_engine`` is ``None`` in eval mode; ``metadata`` is whatever
        ``model_engine.load_checkpoint`` returned, or ``{}`` if no checkpoint was
        loaded through the engine.
    """
    train_eval_cfg = cfg.train if train_eval else cfg.eval

    tokenizer_model = None
    cfg_arch = cfg.arch  # if not loading from checkpoint, need architecture config
    checkpoint_path = None
    try:
        # attempt to load latest in case of preemption
        prev_checkpoint_path = os.path.join(cfg.model_dir, cfg.name, "checkpoints")
        tokenizer_model, cfg_arch, checkpoint_path = cramming.utils.find_pretrained_checkpoint(
            "latest",
            local_checkpoint_folder=str(prev_checkpoint_path),
            arch_modifications=train_eval_cfg.arch_modifications,
        )
        log.info(f"Getting latest checkpoint at {prev_checkpoint_path}")

    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C / SystemExit
        # are not swallowed by the best-effort lookup.
        # no previous checkpoint saved.  Checking separate model directory
        if train_eval_cfg.checkpoint is not None:
            try:
                tokenizer_model, cfg_arch, checkpoint_path = cramming.utils.find_pretrained_checkpoint(
                    train_eval_cfg.checkpoint,
                    local_checkpoint_folder=cfg.model_dir,
                    arch_modifications=train_eval_cfg.arch_modifications,
                )
                log.info(f"Found checkpoint at {cfg.model_dir} or {train_eval_cfg.checkpoint}")
                # Importantly, if a checkpoint is found, its architecture is used;
                # the original author notes that modifications do not seem to work.
            except Exception:
                log.info(
                    f"Unable to load checkpoint {train_eval_cfg.checkpoint} or in directory {cfg.model_dir}."
                    "  Initializing model from scratch!"
                )

    log.info("Loading Data")
    datasets, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl, cfg.data_dir)

    real_dataset_sample_length = len(datasets["train"][0]["input_ids"])  # for arithmetic datasets

    if tokenizer_model is not None:
        # todo consider if we even need to return the tokenizer with the checkpoint (only HF?)
        tokenizer = tokenizer_model
    dataloaders = cramming.prepare_dataloaders(datasets, tokenizer, train_eval_cfg, cfg.impl)

    log.info("Constructing Model")
    model = cramming.construct_model(cfg_arch, tokenizer)

    metadata = {}

    if train_eval:
        # if in train mode, need engine
        # Only fully initialize now when no checkpoint will be loaded afterwards.
        fully_init_model_to_begin = checkpoint_path is None
        model_engine = cramming.load_backend(
            model,
            tokenizer,
            cfg.train,
            cfg.impl,
            setup=setup,
            init_compile_and_distribute=fully_init_model_to_begin,  # false if we are planning to load a checkpoint in later
        )

        if checkpoint_path is not None:
            # load checkpoint, engine handles loaded model
            metadata = model_engine.load_checkpoint(cfg_arch, checkpoint_path)
            for loader in dataloaders.values():
                try:
                    # for dataloaders with epochs (RuntimeInfiniteDataLoader) set that epoch to start here
                    loader.set_epoch(metadata.get("epoch", 0))
                except Exception:
                    # Deliberately best-effort: loaders without epoch support are left untouched.
                    pass

        model_engine.train(train_eval_cfg.pretrain_in_train_mode)
        model_engine.current_seq_length = real_dataset_sample_length  # setting the number of tokens seen correctly for arithmetic data
    else:
        if checkpoint_path is not None:
            model = load_model_checkpoint(model, checkpoint_path)
        model_engine = None
    return model, model_engine, tokenizer, dataloaders, metadata


def load_model_checkpoint(model, model_dir, forward_only_model_with_skip=False):
    """Load model weights from ``model_dir`` into ``model`` and return the model.

    ``model.safetensors`` is tried first; if it cannot be read, the function falls
    back to ``state_dict.pth`` and takes its ``"model_state"`` entry.

    Args:
        model: Module exposing ``load_state_dict``; updated in place.
        model_dir: Directory containing ``model.safetensors`` or ``state_dict.pth``.
        forward_only_model_with_skip: If true, ``"._orig_mod"`` is removed from the
            state-dict keys before loading.

    Returns:
        The same ``model`` object with the weights loaded.

    Raises:
        ValueError: If ``state_dict.pth`` has no ``"model_state"`` entry.
        SystemExit: With status 1 if ``load_state_dict`` reports a mismatch
            (``strict=True``); the mismatch is logged first.
    """
    try:
        model_file = os.path.join(model_dir, "model.safetensors")
        model_state = load_file(model_file)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C is not swallowed.
        model_file = os.path.join(model_dir, "state_dict.pth")
        # Load onto the CPU so checkpoints written on a GPU host can be read anywhere;
        # ``load_state_dict`` below copies the values into the model's own parameters.
        loaded = torch.load(model_file, map_location="cpu")
        model_state = loaded.get("model_state", None)

    if model_state is None:
        raise ValueError(f"No model found in directory {model_dir} (in '/state_dict.pth' or '/model.safetensors')")
    log.info(f"Loading Model from {model_file}")

    if "encoder.embedding.word_embedding.weight" not in model_state:
        # Hack to save space when saving the model, more clever though would be save the right one in the first place
        model_state["encoder.embedding.word_embedding.weight"] = model_state["decoder.weight"]

    prefix = "module."
    sanitized_state = {}
    for key, value in model_state.items():
        if key.startswith(prefix):
            key = key[len(prefix):]
        if forward_only_model_with_skip:
            # we load in original model to here so we can drop this
            key = key.replace("._orig_mod", "")
        sanitized_state[key] = value

    try:
        model.load_state_dict(sanitized_state, strict=True)
        log.info("finished loading state dict")
    except RuntimeError as e:
        # Show only the part after torch's standard header when it is present; fall
        # back to the full message so an unexpected RuntimeError is not masked by
        # an IndexError raised inside this handler.
        message = str(e)
        _, marker_found, detail = message.partition("Error(s) in loading state_dict for")
        log.info(f"State dict difference is {detail if marker_found else message}... Ok?")
        # ``exit()`` is the interactive helper from ``site`` and exits with status 0;
        # a failed load must terminate with a non-zero status.
        raise SystemExit(1) from e

    return model


================================================
FILE: cramming/config/__init__.py
================================================


================================================
FILE: cramming/config/arch/__init__.py
================================================


================================================
FILE: cramming/config/arch/albert.yaml
================================================
# Instantiates a (non-huggingface) scriptable decoder-based LM
# This is set up to be as close to ALBERT-large (Lan et al.) as reasonable for a decoder-based model

model_type: ScriptableCrammedDepthRecurrent

layers_in_recurrent_block: 1
maximal_recurrence: 24
max_backprop: # use half of maximal_recurrence if not given, minimal is 1 # only valid for TBPTT
maximal_recurrence_in_eval: 24

hidden_size: 1024
intermed_size: 4096
input_injection_type: none
initial_hidden_randomized: False
state_init:

norm: LayerNorm
norm_eps: 1e-12
norm_scheme: post # can be "pre", "post"
nonlin: GELU
sub_normalization: False

tie_weights: True # Tie input/output embedding
decoder_bias: True # Whether to include a bias in the decoding step
use_bias: True # Whether to learn biases on all dense layers
final_norm: False # Add a final norm layer before the end
head: identity

objective_layout: fixed

embedding:
  vocab_size: # will be populated automatically
  pos_embedding: learned
  max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
  embedding_dim: 128
  normalization: True
  stable_low_precision: False

attention:
  type: pytorch # also works with "pytorch"
  num_attention_heads: 16 # for flash
  skip_output_projection: False
  qkv_bias: True
  bias_in_proj: True

  rotary_embedding: False
  seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g. the softmax in normal attn)
  sequence_op: torch-softmax # Can be normalization
  sub_normalization: False # could be turned off separately # Is only used if type=self-attention (i.e the hand-made version)

init:
  type: normal
  std: 0.02

throttle: False # only active during TBPTT
local_compilation: True # Try to compile the static block, no matter what the global compile setting is set to


================================================
FILE: cramming/config/arch/crammed-depthrecurrent.yaml
================================================
# Instantiates a (non-huggingface) scriptable decoder-based LM
# This inherits architecture changes from the crammed-bert project

model_type: ScriptableCrammedDepthRecurrent

layers_in_recurrent_block: 4
maximal_recurrence: 4
max_backprop: # use half of maximal_recurrence if not given, minimal is 1
maximal_recurrence_in_eval: ${arch.maximal_recurrence} # could be set to think longer

hidden_size: 768
intermed_size: 3072
input_injection_type: add
initial_hidden_randomized: True
state_init: embed # initialized random like embedding


norm: LayerNorm
norm_eps: 1e-12
norm_scheme: post # can be "pre", "post"

nonlin: GELUglu
sub_normalization: False # Sub-normalization in attn and ffn blocks

tie_weights: False # Tie input/output embedding
decoder_bias: False # Whether to include a bias in the decoding step
use_bias: False # Whether to learn biases on all dense layers
final_norm: True # Add a final norm layer before the end
head: ffn

objective_layout: TBPTT

embedding:
  vocab_size: # will be populated automatically
  pos_embedding: learned
  max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
  embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
  normalization: True
  stable_low_precision: False
  max_abacus_len: 100

attention:
  type: pytorch # also works with "pytorch"
  num_attention_heads: 16 # for flash
  skip_output_projection: False
  qkv_bias: False
  bias_in_proj: False
  max_length: 0 # for randomised PE's (NOT IMPLEMENTED FOR ALL)

  rotary_embedding: False
  seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g. the softmax in normal attn)
  sequence_op: torch-softmax # Can be normalization
  sub_normalization: ${arch.sub_normalization} # could be turned off separately # Is only used if type=self-attention (i.e the hand-made version)

init:
  type: deepnorm-straight
  std: 0.02 # only used if type=normal

throttle: False # only active during TBPTT
alpha: 1.0 # only active during TBPTT
mask_before_equals: False
local_compilation: True # Try to compile the static block, no matter what the global compile setting is set to
loss_reduction: mean
forward_only_model_with_skip: False # forward only model with skip

================================================
FILE: cramming/config/arch/crammed-fakeRNN.yaml
================================================
# Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline
# Modernized version of bert-c5

# These are the huggingface bert parameters
model_type: ScriptableFakeRNN

n_blocks: 5
state_size: 512
hidden_size: 512
bottle_size: 256
block_type: resnet

tie_weights: True # Tie input/output embedding
decoder_bias: False # Whether to include a bias in the decoding step

loss: cross-entropy
objective_layout: autoregressive

embedding:
  vocab_size: # will be populated automatically
  pos_embedding: None
  dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT
  pad_token_id: 0
  max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
  embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
  normalization: False
  stable_low_precision: False

init:
  type: normal
  std: 0.02

# Set dynamically:
eos_token_id:


================================================
FILE: cramming/config/arch/crammed-janus.yaml
================================================
# Instantiates a (non-huggingface) scriptable janus-type RNN, right now with all tested bells-and-whistles

# These are the huggingface bert parameters
model_type: ScriptableCrammedJanus

num_transformer_layers: 8
state_dim: 1024

norm_scheme: shaped
norm: LayerNorm
norm_eps: 1e-12

nonlin: GELUglu
sub_normalization: False # Sub-normalization in attn and ffn blocks

tie_weights: True # Tie input/output embedding
decoder_bias: False # Whether to include a bias in the decoding step
use_bias: True # Whether to learn biases on all dense layers
final_norm: True # crashes without this improvement to stability
force_normalized_state: False # last normalization learnable?

loss: cross-entropy
objective_layout: autoregressive # nothing else implemented so far

ffn_block:
  structure: joined-injection # state-branch-embedding-injection

  intermed_multiplier: 4
  hidden_dropout_prob: 0.0

  num_chunks_in_sequence: 16 # only necessary if head.structure=chunked

head:
  structure: ffn # dense-nonlin-norm
  nonlin: GELU
  norm: LayerNorm
  norm_eps: 1e-12
  use_bias: True
  include_attn_in_chunked_heads: False # only valid for chunked heads
  num_chunked_heads: 4 # only valid for chunked heads
  intermed_multiplier: 4

objective:
  historian_weight: 1.0
  predictor_weight: 1.0
  present_historian_weight: 1.0
  present_predictor_weight: 1.0
  rscale_correction: False

  antiquarian_weight: 0.0 #
  antiquarian_range: ${data.seq_length} # maximal range a previous state may be looked up with # set to -1 to encompass all previous states
  historian_loss_fn: MSE # can also be cosine

embedding:
  vocab_size: # will be populated automatically
  pos_embedding:
  embedding_dim: 512
  normalization: True
  stable_low_precision: False
  max_seq_length: ${data.seq_length} # legacy position, do not use


max_seq_length: ${data.seq_length} # max seq length during training (not always used)
position_information: learned # none learned or simple

init:
  type: megatron
  std: 0.02 # only used if type=normal

# Experimental options:
state_corruption: 0.0
state_init: unit
eos_state_reset: True

# Set dynamically:
eos_token_id:


================================================
FILE: cramming/config/arch/crammed-rnn.yaml
================================================
# Instantiates a (non-huggingface) scriptable encoder-based LM with BERT as baseline
# Modernized version of bert-c5

# These are the huggingface bert parameters
model_type: ScriptableCrammedRNN

# PyTorch LSTM settings:
input_size: 512
hidden_size: 512
num_layers: 2
bias: True
seq_first: True
dropout: 0.1
bidirectional: False
proj_size: 0

norm: LayerNorm
norm_eps: 1e-12
final_norm: True # Add a final norm layer before the end
skip_head_transform: True # This is only possible if embedding_dim=hidden_size
use_bias: False # Whether to learn biases on all dense layers

tie_weights: True # Tie input/output embedding
decoder_bias: False # Whether to include a bias in the decoding step

loss: cross-entropy
objective_layout: autoregressive

embedding:
  vocab_size: # will be populated automatically
  pos_embedding: scaled-sinusoidal
  dropout_prob: 0.1 # equal to hidden_dropout_prob in BERT
  pad_token_id: 0
  max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
  embedding_dim: ${arch.input_size} # has to be this value for crammedBERT
  normalization: True
  stable_low_precision: False

# Set dynamically:
eos_token_id:


================================================
FILE: cramming/config/arch/crammed-stack-janus.yaml
================================================
# Instantiates a (non-huggingface) scriptable janus-type RNN, right now with all tested bells-and-whistles

# These are the huggingface bert parameters
model_type: ScriptableCrammedJanus

num_transformer_layers: 8
state_dim: 3584

norm_scheme: shaped
norm: LayerNorm
norm_eps: 1e-12

nonlin: GELUglu
sub_normalization: False # Sub-normalization in attn and ffn blocks

tie_weights: True # Tie input/output embedding
decoder_bias: False # Whether to include a bias in the decoding step
use_bias: True # Whether to learn biases on all dense layers
final_norm: True # crashes without this improvement to stability
force_normalized_state: True # last normalization learnable?

loss: cross-entropy
objective_layout: autoregressive # nothing else implemented so far

ffn_block:
  structure: stack-sideways-transformer
  intermed_multiplier: 4
  hidden_dropout_prob: 0.0

  # settings only relevant for structure=state-attention:
  qkv_bias: True
  proj_bias: True
  num_chunks_in_sequence: 16
  num_read_write_heads: 8
  run_causal_heads: False
  positional_info: True
  garbage_collect_state: False
  num_blocks_to_accumulate: 0 # Can be any number of embedding chunks that will be added to the state, this is N^2 attention again :>
  gradient_checkpointing: False
  workspace: ${arch.ffn_block.num_chunks_in_sequence} # only used if block in structure, can be smaller than num_chunks_in_sequence

head:
  structure: chunked # dense-nonlin-norm
  nonlin: GELU
  norm: LayerNorm
  norm_eps: 1e-12
  use_bias: True
  include_attn_in_chunked_heads: True # only valid for chunked heads
  num_chunked_heads: 4 # only valid for chunked heads
  intermed_multiplier: 4

objective:
  historian_weight: 1.0
  predictor_weight: 1.0
  present_historian_weight: 1.0
  present_predictor_weight: 1.0
  rscale_correction: False

  antiquarian_weight: 0.0 #
  antiquarian_range: ${data.seq_length} # maximal range a previous state may be looked up with # set to -1 to encompass all previous states
  historian_loss_fn: MSE

embedding:
  vocab_size: # will be populated automatically
  pos_embedding:
  embedding_dim: 512
  normalization: True
  stable_low_precision: False
  max_seq_length: ${data.seq_length} # legacy position, do not use


max_seq_length: ${data.seq_length} # max seq length during training (not always used)
position_information: learned # none learned or simple

init:
  type: deepnorm-straight
  std: 0.02

# Experimental options:
state_corruption: 0.0
eos_state_reset: True
state_init: unit

# Set dynamically:
eos_token_id:


================================================
FILE: cramming/config/arch/crammed-tiny.yaml
================================================
# Instantiates a (non-huggingface) scriptable decoder-based LM
# This is the tiny setting, modified from bert-tiny with larger hidden and lower number of heads

model_type: ScriptableCrammedTransformer

num_transformer_layers: 4
hidden_size: 256
intermed_size: 1024

norm: LayerNorm
norm_eps: 1e-12
norm_scheme: pre # can be "pre", "post", "sandwich"
nonlin: GELUglu

tie_weights: True # Tie input/output embedding
decoder_bias: False # Whether to include a bias in the decoding step
use_bias: False # Whether to learn biases on all dense layers
final_norm: True # Add a final norm layer before the end
sub_normalization: False # Sub-normalization in attn and ffn blocks

loss: cross-entropy

embedding:
  vocab_size: # will be populated automatically
  pos_embedding: scaled-sinusoidal
  max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
  embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
  normalization: True
  stable_low_precision: False

attention:
  type: pytorch # also works with "pytorch"
  num_attention_heads: 8
  skip_output_projection: False
  qkv_bias: False
  bias_in_proj: False

  rotary_embedding: False
  seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g. the softmax in normal attn)
  sequence_op: torch-softmax # Can be normalization
  sub_normalization: ${arch.sub_normalization} # could be turned off separately # Is only used if type=self-attention (i.e the hand-made version)

init:
  type: normal
  std: 0.02


================================================
FILE: cramming/config/arch/crammed-transformer.yaml
================================================
# Instantiates a (non-huggingface) scriptable decoder-based LM
# This inherits architecture changes from the crammed-bert project
# How performant is this?

model_type: ScriptableCrammedTransformer

num_transformer_layers: 16
hidden_size: 768
intermed_size: 3072

norm: LayerNorm
norm_eps: 1e-12
norm_scheme: pre # can be "pre", "post"
nonlin: GELUglu

tie_weights: True # Tie input/output embedding
decoder_bias: False # Whether to include a bias in the decoding step
use_bias: False # Whether to learn biases on all dense layers
final_norm: True # Add a final norm layer before the end
sub_normalization: False # Sub-normalization in attn and ffn blocks

embedding:
  vocab_size: # will be populated automatically
  pos_embedding: scaled-sinusoidal
  max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
  embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
  normalization: True
  stable_low_precision: False

attention:
  type: pytorch # also works with "pytorch"
  num_attention_heads: 16 # for flash
  skip_output_projection: False
  qkv_bias: False
  bias_in_proj: False

  rotary_embedding: False
  seq_op_in_fp32: False # whether to always cast the operation over the sequence into fp32 (e.g. the softmax in normal attn)
  sequence_op: torch-softmax # Can be normalization
  sub_normalization: ${arch.sub_normalization} # could be turned off separately # Is only used if type=self-attention (i.e the hand-made version)

init:
  type: normal
  std: 0.02


================================================
FILE: cramming/config/arch/gpt2-base.yaml
================================================
# Instantiates a (non-huggingface) scriptable decoder-based LM
# This matches the gpt2 settings in the custom implementation
# (minus dropout which I did not even implement)

model_type: ScriptableCrammedTransformer

num_transformer_layers: 12
hidden_size: 768
intermed_size: 3072

norm: LayerNorm
norm_eps: 1e-05
norm_scheme: post # can be "pre", "post"
nonlin: GELU

tie_weights: True # Tie input/output embedding
decoder_bias: False # Whether to include a bias in the decoding step
use_bias: True # Whether to learn biases on all dense layers
final_norm: True # Add a final norm layer before the end
sub_normalization: False

embedding:
  vocab_size: # will be populated automatically
  pos_embedding: learned
  max_seq_length: ${data.seq_length} # max seq length that the positional embedding is instantiated for
  embedding_dim: ${arch.hidden_size} # has to be this value for crammedBERT
  normalization: True
  stable_low_precision: False

attention:
  type: pytorch # also works with "pytorch"
  num_attention_heads: 12
  skip_output_projection: False
  qkv_bias: True
  bias_in_proj: True

  rotary_embedding: False
  seq_op_in_fp32: True # whether to always cast the operation over the sequence into fp32 (e.g. the softmax in normal attn)
  sequence_op: torch-softmax # Can be normalization
  sub_normalization: False

init:
  type: normal
  std: 0.02


================================================
FILE: cramming/config/arch/hf-gpt2.yaml
================================================
# These are the huggingface GPT-2 parameters

model_type: "gpt2"

n_ctx: 1024
n_embd: 768
n_head: 12
n_layer: 12
n_positions: ${data.seq_length} # max seq length that the positional embedding is instantiated for


activation_function: "gelu_new"
attn_pdrop: 0.1
resid_pdrop: 0.1
embd_pdrop: 0.1
initializer_range: 0.02
layer_norm_epsilon: 1e-05


summary_activation: null
summary_first_dropout: 0.1
summary_proj_to_labels: true
summary_type: "cls_index"
summary_use_proj: true

bos_token_id: 50256
eos_token_id: 50256


================================================
FILE: cramming/config/arch/sanitycheck.yaml
================================================
model_type: SanityCheckLM

width: 1024 # 8352


================================================
FILE: cramming/config/cfg_eval.yaml
================================================
# Configuration defaults
# Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams
defaults:
  - impl: torch-default
  - train: common
  - wandb: default
  - eval: pythia
  - data: arithmetic # must name an existing config group file: cramming/config/data/arithmetic.yaml
  - _self_
  - override hydra/job_logging: custom

reverse_inputs: True
pad_zeros: 0
extended_eval: False
greedy: True
temp: 1.0
token_limit: 30 # number of tokens in 'thinking plot'
max_rec: null # to give more or less recurrence at evaluation than during training

## Addition
remove_padding: True # used as our eval data has some padding in it that needs to be removed on the fly
large: True
ood_only: False
up_to_40: False
up_to_50: False

checkerboard: null
big_eval_step_1: False
big_eval_step_2: False
big_eval_step_3: False
big_eval_step_4: False
big_eval_step_5: False
big_eval_step_6: False
big_eval_step_7: False
big_eval_step_8: False
big_eval_step_9: False
big_eval_step_10: False

# for doing custom splits
max_size_given: null
start_ind_1_given: null
start_ind_2_given: null

## Multiplication
mul: False

## Pos arithmetic
pos_arth: False
pos_arth_ood: False

wandb:
  project: generative-eval

# Total and central computation budget in hours:
budget: 24
overall_budget: ${budget}

base_dir: outputs
model_dir:

hydra:
  sweep:
    dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S}
  run:
    dir: ${base_dir}/${name}/downstream/${now:%Y-%m-%d}/${now:%H-%M-%S}
  job:
    chdir: True

seed: # Optional: Set initial seed

# A name for this run [will draw the checkpoint from runs with this name
# and use this name for the summary table and outputs folder]
name: default

# debug implementation by running every loop just once:
dryrun: False


================================================
FILE: cramming/config/cfg_pretrain.yaml
================================================
# Configuration defaults
# Settings are separated into hyperparameters for architecture, data, implementation and train/eval hyperparams
defaults:
  - arch: crammed-depthrecurrent
  - data: arithmetic
  - impl: torch-default
  - wandb: default
  - train: cramming
  - _self_
  - override hydra/job_logging: custom

base_dir: outputs
model_dir: ${base_dir}
data_dir:

hydra:
  sweep:
    dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S}
  run:
    dir: ${base_dir}/${name}/pretrain/${now:%Y-%m-%d}/${now:%H-%M-%S}
  job:
    chdir: True

seed: # Optional: Set initial seed
name: default # A name for this run [will be used for the summary table and outputs folder]

# Total and central computation budget in hours:
budget: 4
overall_budget: ${budget}

# debug implementation by running every loop just once:
dryrun: False


================================================
FILE: cramming/config/data/__init__.py
================================================


================================================
FILE: cramming/config/data/arithmetic.yaml
================================================
name: arithmetic
defaults:
  - sources:
      - arithmetic


# all the below stuff may not be required
# Preprocessing
normalizer:
  force_lowercase: False
  strip_accents: False
  force_english_keyboard: False
tokenizer: bigcode/starcoder
vocab_size: 49152 # commented-out alternative: 32768 (= 2^15)

# Dataset Formation
seq_length: 512
include_eot_token_in_corpus: True

max_entries_in_raw_dataset: 20e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering
max_seq_in_tokenized_dataset: 80e6 # Select only this many tokenized sequences.
# max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training

# Data Cleaning:
remove_trash: False
trash_cutoff: 0.25
deduplicate_entries: False
deduplication_threshold: 75

# Data Order:
ordering: randomized # for now

# Validation Split
validation_seqs: 4096 # how many sequences to reserve for validation


================================================
FILE: cramming/config/data/c4-subset-processed.yaml
================================================
# This would be a slice of C4
name: c4-subset
defaults:
  - sources:
      - c4

# Preprocessing
normalizer:
  force_lowercase: False
  strip_accents: False
  force_english_keyboard: False
tokenizer: SentencePieceBPE
vocab_size: 131072 # 2^17

# Dataset Formation
seq_length: 512
include_eot_token_in_corpus: True

max_entries_in_raw_dataset: 25e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering
max_seq_in_tokenized_dataset: 85e6 # Select only this many tokenized sequences.
# max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training

# Data Cleaning:
remove_trash: False
trash_cutoff: 0.25
deduplicate_entries: False
deduplication_threshold: 75

# Data Order:
ordering: randomized # for now

# Validation Split
validation_seqs: 4096 # how many sequences to reserve for validation


================================================
FILE: cramming/config/data/openweb.yaml
================================================
# OpenWebText (open replication of WebText), streamed from its Hugging Face mirror
name: openweb
defaults:
  - sources:
      - openwebtext

# Preprocessing
normalizer:
  force_lowercase: False
  strip_accents: False
  force_english_keyboard: False
tokenizer: BPE
vocab_size: 32768 # 2^15

# Dataset Formation
seq_length: 512
include_eot_token_in_corpus: True

max_entries_in_raw_dataset: 20e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering
max_seq_in_tokenized_dataset: 80e6 # Select only this many tokenized sequences.
# max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training

# Data Cleaning:
remove_trash: False
trash_cutoff: 0.25
deduplicate_entries: False
deduplication_threshold: 75

# Data Order:
ordering: randomized # for now

# Validation Split
validation_seqs: 4096 # how many sequences to reserve for validation


================================================
FILE: cramming/config/data/proofpile.yaml
================================================
name: proofpile
defaults:
  - sources:
      - proofpiledata

# Preprocessing
normalizer:
  force_lowercase: False
  strip_accents: False
  force_english_keyboard: False
tokenizer: EleutherAI/llemma_34b
vocab_size: 49152 # 3 * 2^14; previously 32768 (2^15)

# Dataset Formation
seq_length: 512
include_eot_token_in_corpus: True

max_entries_in_raw_dataset: 10e5 #10e6 # Select only this many examples from the dataset # 20e6 are ok if all are chosen. Oversample if filtering
max_seq_in_tokenized_dataset: 5e4 #5e5 # Select only this many tokenized sequences.
# max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training

# Data Cleaning:
remove_trash: False
trash_cutoff: 0.25
deduplicate_entries: False
deduplication_threshold: 75

# Data Order:
ordering: randomized # for now

# Validation Split
validation_seqs: 4096 # how many sequences to reserve for validation


================================================
FILE: cramming/config/data/sanity-check-1.yaml
================================================
# Just a bunch of fake data ...
name: sanity-check-1
defaults:
  - sources:
      - fake

#
# Preprocessing
normalizer: # This is ignored and the default bert normalizer is used instead
  force_lowercase:
  strip_accents:
  force_english_keyboard:
tokenizer: gpt2
vocab_size: 50257

# Dataset Formation
seq_length: 64
include_eot_token_in_corpus:

max_entries_in_raw_dataset: 1e12 # Select only this many examples from the dataset
max_seq_in_tokenized_dataset: 1e12 # Select only this many tokenized sequences.
# max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training

# Data Cleaning:
remove_trash: False
trash_cutoff: 0.3
deduplicate_entries: False
deduplication_threshold: 100

# Data Order:
ordering: randomized # could be a curriculum

# Validation Split
validation_seqs: 128 # how many sequences to reserve for validation


================================================
FILE: cramming/config/data/sanity-check-2.yaml
================================================
# Just a tiny test dataset ...
name: sanity-check-2
# https://hydra.cc/docs/patterns/select_multiple_configs_from_config_group/
defaults:
  - sources:
      - ag_news

# Preprocessing
normalizer:
  force_lowercase: False
  strip_accents: False
  force_english_keyboard: False
tokenizer: BPE # faster for sanity checks
vocab_size: 32768 # to make sure there are not memory surprises compared to the actual data

# Dataset Formation
seq_length: 128
include_eot_token_in_corpus: True

max_entries_in_raw_dataset: 1e10 # Select only this many examples from the dataset
max_seq_in_tokenized_dataset: 1e10 # Select only this many tokenized sequences.
# max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training

# Data Cleaning:
remove_trash: False
trash_cutoff: 0.3
deduplicate_entries: False
deduplication_threshold: 100

# Data Order:
ordering: randomized # could be a curriculum

# Validation Split
validation_seqs: 128 # how many sequences to reserve for validation


================================================
FILE: cramming/config/data/sources/ag_news.yaml
================================================
# For sanity testing
ag_news:
  provider: huggingface
  partition: default
  split: train

  streaming: False

  remove_columns: label
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/arithmetic.yaml
================================================
# Arithmetic data source (handled by the `arithmetic` provider); points at a tokenized dataset on disk
arithmetic:
  provider: arithmetic
  split:

  randgen_seed: 0
  size: 2048

  tokenized_dataset_path: "arithmetic_data/+_n_3_m_3_examples_100_seed_42/hf_tokenized_dataset"
  tokenizer_type: # for specifying which arithmetic tokenizer we want to use


================================================
FILE: cramming/config/data/sources/bookcorpus.yaml
================================================
# The bookcorpus dataset, drawn from its Hugging Face mirror
bookcorpus:
  provider: huggingface
  partition: plain_text
  split: train

  streaming: False

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 16


================================================
FILE: cramming/config/data/sources/c4.yaml
================================================
# The C4 dataset (English partition), drawn from its Hugging Face mirror
c4:
  provider: huggingface
  partition: en
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/dash_books.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_book_dash_books:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/fake.yaml
================================================
# Just a bunch of fake data ...
fake:
  provider: fake
  split:

  randgen_seed: 0
  size: 2048


================================================
FILE: cramming/config/data/sources/iwslt.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_ted_talks_iwslt:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/local.yaml
================================================
# Local data source (handled by the `local` provider)
local:
  provider: local
  split:

  randgen_seed: 0
  size: 2048


================================================
FILE: cramming/config/data/sources/no_code_stackexchange.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_no_code_stackexchange:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/openwebtext.yaml
================================================
# The open webtext replication, as mirrored on HF
openwebtext:
  provider: huggingface
  partition: plain_text
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/oscar.yaml
================================================
# The oscar dataset, drawn from its Hugging Face mirror
# should be 1.2T in this deduplicated version
oscar:
  provider: huggingface
  partition: unshuffled_deduplicated_en
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0 # cannot concat when streaming


================================================
FILE: cramming/config/data/sources/proofpiledata.yaml
================================================
# The Proof-Pile-2 dataset (EleutherAI/proof-pile-2), as mirrored on Hugging Face
EleutherAI/proof-pile-2:
  provider: huggingface
  partition: open-web-math #['default', 'arxiv', 'open-web-math', 'algebraic-stack']
  split: train

  streaming: False #True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/the_pile.yaml
================================================
#
the_pile:
  provider: local
  file_type: json
  files:
    - "/fs/cml-datasets/Pile/train/00.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/01.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/02.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/03.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/04.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/05.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/06.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/07.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/08.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/09.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/10.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/11.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/12.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/13.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/14.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/15.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/16.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/17.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/18.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/19.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/20.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/21.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/22.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/23.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/24.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/25.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/26.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/27.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/28.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/29.jsonl.zst"
  filter:
    #  pile_set_name:
    # possible pile_set_name values are
    # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB
    # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB
    # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB
    # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB
    # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB
    # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB
    # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB
    # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB
    # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB
    # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB
    # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB
    # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB
    # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB
    # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB
    # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB
    # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB
    # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB
    # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB
    # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB
    # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB
    # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB
    # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB
  split: train
  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/the_pileCC.yaml
================================================
#
the_pileCC:
  provider: local
  file_type: json
  files:
    - "/fs/cml-datasets/Pile/train/00.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/01.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/02.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/03.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/04.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/05.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/06.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/07.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/08.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/09.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/10.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/11.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/12.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/13.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/14.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/15.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/16.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/17.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/18.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/19.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/20.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/21.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/22.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/23.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/24.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/25.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/26.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/27.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/28.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/29.jsonl.zst"
  filter:
    pile_set_name:
      - Pile-CC
  # possible pile_set_name values are
  # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB
  # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB
  # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB
  # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB
  # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB
  # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB
  # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB
  # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB
  # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB
  # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB
  # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB
  # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB
  # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB
  # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB
  # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB
  # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB
  # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB
  # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB
  # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB
  # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB
  # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB
  # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB
  split: train
  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/the_pile_dedup.yaml
================================================
# The EleutherAI/the_pile_deduplicated
EleutherAI/the_pile_deduplicated:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/the_pile_natural.yaml
================================================
#
the_pile_natural:
  provider: local
  file_type: json
  files:
    - "/fs/cml-datasets/Pile/train/00.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/01.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/02.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/03.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/04.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/05.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/06.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/07.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/08.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/09.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/10.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/11.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/12.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/13.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/14.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/15.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/16.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/17.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/18.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/19.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/20.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/21.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/22.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/23.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/24.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/25.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/26.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/27.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/28.jsonl.zst"
    - "/fs/cml-datasets/Pile/train/29.jsonl.zst"
  filter:
    pile_set_name:
      - Gutenberg
      - Books3
      - Wikipedia (en)
  # possible pile_set_name values are
  # Pile-CC 227.12 GiB 18.11% 1.0 227.12 GiB 4.33 KiB
  # PubMed Central 90.27 GiB 14.40% 2.0 180.55 GiB 30.55 KiB
  # # Books3† 100.96 GiB 12.07% 1.5 151.44 GiB 538.36 KiB
  # OpenWebText2 62.77 GiB 10.01% 2.0 125.54 GiB 3.85 KiB
  # ArXiv 56.21 GiB 8.96% 2.0 112.42 GiB 46.61 KiB
  # Github 95.16 GiB 7.59% 1.0 95.16 GiB 5.25 KiB
  # FreeLaw 51.15 GiB 6.12% 1.5 76.73 GiB 15.06 KiB
  # Stack Exchange 32.20 GiB 5.13% 2.0 64.39 GiB 2.16 KiB
  # USPTO Backgrounds 22.90 GiB 3.65% 2.0 45.81 GiB 4.08 KiB
  # PubMed Abstracts 19.26 GiB 3.07% 2.0 38.53 GiB 1.30 KiB
  # Gutenberg (PG-19)† 10.88 GiB 2.17% 2.5 27.19 GiB 398.73 KiB
  # OpenSubtitles† 12.98 GiB 1.55% 1.5 19.47 GiB 30.48 KiB
  # Wikipedia (en)† 6.38 GiB 1.53% 3.0 19.13 GiB 1.11 KiB
  # DM Mathematics† 7.75 GiB 1.24% 2.0 15.49 GiB 8.00 KiB
  # Ubuntu IRC 5.52 GiB 0.88% 2.0 11.03 GiB 545.48 KiB
  # BookCorpus2 6.30 GiB 0.75% 1.5 9.45 GiB 369.87 KiB
  # EuroParl† 4.59 GiB 0.73% 2.0 9.17 GiB 68.87 KiB
  # HackerNews 3.90 GiB 0.62% 2.0 7.80 GiB 4.92 KiB
  # YoutubeSubtitles 3.73 GiB 0.60% 2.0 7.47 GiB 22.55 KiB
  # PhilPapers 2.38 GiB 0.38% 2.0 4.76 GiB 73.37 KiB
  # NIH ExPorter 1.89 GiB 0.30% 2.0 3.79 GiB 2.11 KiB
  # Enron Emails† 0.88 GiB 0.14% 2.0 1.76 GiB 1.78 KiB
  split: train
  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/the_pile_stream.yaml
================================================
# Pile streaming from huggingface with new streaming tech :>
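# should be 1.2T in this deduplicated version
# NOTE(review): the remark above and the partition name below look copied from oscar.yaml; verify before use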
EleutherAI/the_pile:
  provider: huggingface
  partition: unshuffled_deduplicated_en
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0 # cannot concat when streaming


================================================
FILE: cramming/config/data/sources/uncorpus.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_uncorpus:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/uspto.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_the_pile_uspto:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/wikibooks.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_wikibooks:
  provider: huggingface
  partition:
  split: train

  streaming: False

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/wikinews.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_wikinews:
  provider: huggingface
  partition:
  split: train

  streaming: False

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/wikipedia.yaml
================================================
# The wikipedia en dataset, drawn from its Hugging Face mirror
wikipedia:
  provider: huggingface
  partition: 20220301.en
  split: train

  streaming: False

  # source-specific cleaning rules?
  remove_columns: title
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/wikiquote.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_wikiquote:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/wikiversity.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_wikiversity:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/data/sources/wikivoyage.yaml
================================================
# A part of ROOTS
bigscience-data/roots_en_wikivoyage:
  provider: huggingface
  partition:
  split: train

  streaming: True

  # source-specific cleaning rules?
  remove_columns:
  concatenate_successive_entries: 0


================================================
FILE: cramming/config/eval/__init__.py
================================================


================================================
FILE: cramming/config/eval/pythia.yaml
================================================
# defaults:
#   - optim: adam
#   - tasks:
      # - winogrande
      # - lambada_openai
      # - piqa
      # - winograd_wsc
      # - arc
      # - sciq
      # - logiqa

name: pythia-tests

arch_modifications: null
# checkpoint name:
# This can be either "latest", or a reference to a specific checkpoint in a subfolder
checkpoint: latest
path: ${impl.path} # Path for caches of datasets and tokenizers


================================================
FILE: cramming/config/eval/tasks/lambada_openai.yaml
================================================
# dataset-specific settings
lambada_openai:


================================================
FILE: cramming/config/eval/tasks/winogrande.yaml
================================================
# dataset-specific settings
winogrande:


================================================
FILE: cramming/config/hydra/__init__.py
================================================


================================================
FILE: cramming/config/hydra/job_logging/custom.yaml
================================================
# python logging configuration for tasks
version: 1
formatters:
  simple:
    format: "[%(asctime)s] %(message)s"
handlers:
  console:
    class: logging.StreamHandler
    formatter: simple
    stream: ext://sys.stdout
  file:
    class: logging.FileHandler
    formatter: simple
    # relative to the job log directory
    filename: ${name}_${hydra.job.name}.log
root:
  level: INFO
  handlers: [console, file]

disable_existing_loggers: false


================================================
FILE: cramming/config/impl/__init__.py
================================================


================================================
FILE: cramming/config/impl/_default.yaml
================================================
# Settings for implementation details
# These settings "should" not influence the outcome of the computation in major ways, only its speed.
# These settings are generic implementation details
# -----------------------------------------------------------------------------------------------------

# This is the main folder where data will be stored (such as caches of datasets and tokenizers):
# This can be an absolute path (which will be honored) or a relative path
# The relative path will be executed relative to the cfg.base_dir
# This behavior is controlled in the main_launcher
path: data

# data implementation:
local_staging_dir: # Optionally copy a preprocessed dataset into this folder before loading it for training
forbid_dataset_preprocessing: True
temporary_corpus: False # Save data directly into local staging dir, forget after use
max_raw_chunk_size: 1e14

# checkpointing and logging:
print_loss_every_nth_step: 1000
save_intermediate_checkpoints: False
save_every_nth_step: -1
save_every_n_minutes: -1
save_intermediate_model_name:

# early termination, cancel runs that do not meet this loss threshold early.
early_termination:
  enabled: False
  budget: 3 # budget in hours
  loss_threshold: 6.0 # modify this for non-xent losses
  overall_budget: -1

# Batch size settings:
# batch_size: This is handled in train after commit 982a4d33cd7f79a48b691114ae78f6ad1cdbee69
microbatch_size: 128 # don't make it larger than batch_size...

# Basic compute settings
threads: 32 # maximal number of cpu dataloader workers used per GPU, this value will never exceed num_gpus * num_physical threads
# Dataloader multiprocessing
pad_to_multiple_of: 8 # padding in dataloader during downstream
shuffle_in_dataloader: False # There is still shuffling in the preprocessing pipeline.
pin_memory: True
prefetch_factor: 2
persistent_workers: True # this clashes with pin_memory in pytorch<1.7.1

# Default floating point precision:
default_precision: float # needs to be a pytorch datatype

# Distributed training
dist_backend: nccl
sharing_strategy: # file_descriptor # if no argument is given, then the OS default is picked by pytorch

# Misc:
enable_huggingface_offline_mode: False
local_rank: # This is set automatically by the system_startup

save_final_model: False
push_to_huggingface_hub: False
hf_directoy_name: "test-crammedBERT-c5" # set a clever name here!

add_env_variables:
# should be NAME: stringval

# TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE
# TORCHINDUCTOR_MAX_AUTOTUNE_GEMM

# Other constants:
# OMP_NUM_THREADS:[number_of_physical_cores]
# OMP_SCHEDULE:  # STATIC
# OMP_PROC_BIND: # CLOSE
# GOMP_CPU_AFFINITY:  # "N-M"
# KMP_AFFINITY: # "granularity=fine,compact,1,0"
# KMP_BLOCKTIME: # 1
# optional_ld_preloads:
#  - libiomp5.so
# - jemalloc.so

#
# ### jemalloc
# export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
# export LD_PRELOAD=/home/mingfeim/packages/jemalloc-5.2.1/lib/libjemalloc.so
#
# ### tcmalloc
# export LD_PRELOAD=/home/mingfeim/packages/gperftools-2.8/install/lib/libtcmalloc.so

example_token_limit: 30 # never generate more example tokens than this
# example_prompts:
#   - "Oh, distinctly I remember, it was in the bleak"
#   - "The capital of Germany is"
#   - "The Westphalian peace ended the"
#   - "Hi! My name is"
#   - "In the place where we were born,"
#   - "Time is a"

# example_prompts:
#   - "System.out.println("
#   - "public class "
#   - "public static void main"
#   - "/* print hello world */"
#   - "System.out.println(2);"
#   - "for (let i = 0; i < myarray.length; i++) {"
example_prompts:
    - "3 + 3 = "
    - "44 + 56 = "
    - "003 + 003 = "
    - "070 + 094 = "
    - "345 + 324 = "
    - "598 + 527 = "
    - "1234 + 4321 = "
    - "94633 + 91826 = "

================================================
FILE: cramming/config/impl/torch-default.yaml
================================================
# Settings for implementation details
# These settings "should" not influence the outcome of the computation in major ways, only its speed.
# These settings are pytorch implementation details, tuned for single(ish) GPU, sane pytorch stuff
# -----------------------------------------------------------------------------------------------------

name: torch-default
defaults:
  - _default
  - _self_


# Basic pytorch settings
benchmark: True # CUDNN benchmarking
deterministic: False # This option will disable non-deterministic ops
non_blocking: True # unblocked .to(device) handles
tf32_allowed: True
matmul_precision: medium # highest/high/medium

mixed_precision: True # turns on AMP on GPUs/Intel devices. The default precision needs to be float
grad_scaling: True # Only activates when mixed_precision=True
mixed_precision_target_dtype: float16 # you might try your luck with bfloat16 too

# Distributed training:
zero_redundancy_optimizer: False # requires limited_decay_keys=[] for pytorch<=1.10.2
broadcast_buffers: False
bucket_cap_mb: 25
gradient_as_bucket_view: True
static_graph: True

# scaled dot products:
enable_mem_efficient_sdp: False
enable_math_sdp: True
enable_flash_sdp: True

# Misc:
foreach_optimizer: False

# Compilation
compile_torch: True
mode: default # overwritten by manual selection of inductor variables below
dynamic: False # this is a world of pain (when I last tested it, around torch2.0 release)
fullgraph: True # why even compile when not compile everywhere :>
backend: inductor
_inductor_vars:
  # max_autotune_gemm: True
  # max_autotune_pointwise: False # was better in some tests not to enable this?
  # triton:
  #   cudagraphs: False # cannot fit with overhead
  #   # cudagraph_trees: False # fixes memory problems but has scary warning messages
  # # epilogue_fusion: True # true by default is latest nightly
  # # aggressive_fusion: False # oom on latest nightly
  # permute_fusion: True # nice
  # shape_padding: True # flaky on the new nightly?
  # optional to mess with the internal inductor config. Maybe not advisable
  # - `epilogue_fusion` which fuses pointwise ops into templates. Requires `max_autotune` to also be set
  # - `max_autotune` which will profile to pick the best matmul configuration
  # - `fallback_random` which is useful when debugging accuracy issues
  # - `shape_padding` which pads matrix shapes to better align loads on GPUs especially for tensor cores
  # - `triton.cudagraphs` which will reduce the overhead of python with CUDA graphs
  # - `trace.enabled` which is the most useful debugging flag to turn on
  # - `trace.graph_diagram` which will show you a picture of your graph after fusion
  # - For inductor you can see the full list of configs that it supports by calling `torch._inductor.list_options()`
  # or directly at https://github.com/pytorch/pytorch/blob/master/torch/_inductor/config.py


================================================
FILE: cramming/config/train/__init__.py
================================================


================================================
FILE: cramming/config/train/common.yaml
================================================
# Basic hyperparameters for normal BERT pretraining
# working hard here to separate "impl" implementation details and "train" abstract hyperparameters

name: common

defaults:
  - optim: adam_classic
  - optim_mod: disabled

optim:
  lr: 1e-4

limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight, norm] # no weight decay for these layers

# steps:
warmup_steps: 80_000 # These are microbatch steps
cooldown_steps: 0
steps: 8_000_000 # These are microbatch steps at bs=64. The original 1mio steps for BERT are recovered with 512/64=8
scheduler: polynomial-decay

# Training setting:
stream_depth: ${data.seq_length} # full sequence as input to model
batch_size: 512
batch_size_ramp: 0

gradient_clipping:
pretrain_in_train_mode: True # default BERT trains with dropout layers
reverse_dataset_order: False

budget: ${budget}
overall_budget: ${overall_budget}


================================================
FILE: cramming/config/train/cramming.yaml
================================================
# Version 4 of changes to bert training hyperparameters
# Optimizes MLM rate for torch.compile, includes improved weight decay limitation, finally updated to a relative bs ramp

name: cramming-o4

defaults:
  - optim: adam
  - optim_mod: disabled

optim:
  lr: 1e-3
  weight_decay: 0.01

limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight, norm] # no weight decay for these layers

# steps:
warmup_steps: 0.1
cooldown_steps: 0.1
steps: 12_000_000 # these are microbatch steps. This is an upper limit that is usually never reached
scheduler: budget-constant

# Training setting:
stream_depth: ${data.seq_length} # full sequence as input to model
batch_size: 8192
batch_size_ramp: 0.60

gradient_clipping: 0.5
pretrain_in_train_mode: True # default BERT trains with dropout layers enabled in pretrain
reverse_dataset_order: False

budget: ${budget}
overall_budget: ${overall_budget}

# for loading previously saved
arch_modifications: null
# checkpoint name:
# This can be either "latest", or a reference to a specific checkpoint in a subfolder
checkpoint: latest
path: ${impl.path} # Path for caches of datasets and tokenizers


================================================
FILE: cramming/config/train/janus-regime.yaml
================================================
# Version 4 of changes to bert training hyperparameters
# Optimizes MLM rate for torch.compile, includes improved weight decay limitation, finally updated to a relative bs ramp

name: cramming-o4

defaults:
  - optim: adam
  - optim_mod: disabled

optim:
  lr: 1e-3
  weight_decay: 0.01

limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight, norm] # no weight decay for these layers

# steps:
warmup_steps: 0.1
cooldown_steps: 0.1
steps: 4_000_000 # these are microbatch steps. This is an upper limit that is usually never reached
scheduler: budget-constant

# Training setting:
stream_depth: 2 # Train one token at a time
batch_size: 16384
batch_size_ramp: 0.60

gradient_clipping: 0.5
pretrain_in_train_mode: True # default BERT trains with dropout layers enabled in pretrain
reverse_dataset_order: False

budget: ${budget}


================================================
FILE: cramming/config/train/optim/adafactor.yaml
================================================
type: Adafactor

lr: 0.001
eps:
  - 1e-30
  - 0.001
clip_threshold: 1.0
decay_rate: -0.8
beta1:
weight_decay: 0.0
scale_parameter: False
relative_step: False
warmup_init: False


================================================
FILE: cramming/config/train/optim/adahessian.yaml
================================================
type: AdaHessian

lr: 0.15
betas:
  - 0.9
  - 0.98
eps: 1e-12
weight_decay: 0.01
hessian_power: 1.0


================================================
FILE: cramming/config/train/optim/adam.yaml
================================================
type: AdamW

lr: 0.0005
betas:
  - 0.9
  - 0.98
eps: 1e-12
weight_decay: 0.01
amsgrad: False
fused:


================================================
FILE: cramming/config/train/optim/adam8bit.yaml
================================================
type: Adam8bit

lr: 0.0005
betas:
  - 0.9
  - 0.98
eps: 1e-12
weight_decay: 0.01
amsgrad: False


================================================
FILE: cramming/config/train/optim/adam_classic.yaml
================================================
type: Adam

lr: 0.0005
betas:
  - 0.9
  - 0.999
eps: 1e-8
weight_decay: 0.01
amsgrad: False


================================================
FILE: cramming/config/train/optim/adamscale.yaml
================================================
type: AdamWScale

lr: 0.0005
betas:
  - 0.9
  - 0.98
eps: 1e-12
weight_decay: 0.01
correct_bias: True # adamw fix


================================================
FILE: cramming/config/train/optim/agd.yaml
================================================
type: AGD

gain: 1.0


================================================
FILE: cramming/config/train/optim/lion.yaml
================================================
type: Lion

lr: 1e-4
betas:
  - 0.9
  - 0.99
# use 0.95, 0.98 if unstable
weight_decay: 0.1


================================================
FILE: cramming/config/train/optim/radam.yaml
================================================
type: RAdam

lr: 0.0005
betas:
  - 0.9
  - 0.98
eps: 1e-12
weight_decay: 0.01


================================================
FILE: cramming/config/train/optim/sgd.yaml
================================================
type: SGD

lr: 0.0005
momentum: 0.9
dampening: 0.0
weight_decay: 0.01
nesterov: True


================================================
FILE: cramming/config/train/optim/shampoo.yaml
================================================
type: Shampoo

lr: 0.0005
betas:
  - 0.9
  - 0.98
epsilon: 1e-12
use_bias_correction: True
adam_w_mode: True
weight_decay: 0.01
grafting_type: 4
grafting_epsilon: 1e-08
grafting_beta2: 0.999

root_inv_dist: False
# update_freq (int): frequency for updating inverse preconditioner (Default: 100)
# init_delay (int): initial delay before starting to compute root inverse (Default: 1000)
# threshold (int): threshold for switching to diagonal preconditioner (Default: 1024)
# preconditioner_dtype (torch.dtype): data type for preconditioner (Default: torch.float)
# large_dim_method (LargeDimMethod): method for handling large scale tensors. (Default: LargeDimMethod.BLOCKING)
# root_inv_dist (bool): distributes root inverse computation across multiple GPU workers (Default: True)
# use_merge_dims (bool): merge dimensions if possible while respecting threshold. (Default: True)
# grafting_type (GraftingType): Selects grafting method. (Default: GraftingType.ADAGRAD)
# grafting_epsilon (float): Epsilon for grafting method. (Default: 1e-3)
# grafting_beta2 (float): Exponential moving average factor for grafting method. (Default: 1.0)

# class PreconditionerType(enum.Enum):
#     FULL = 0
#     DIAGONAL = 1
#
#
# class GraftingType(enum.Enum):
#     NONE = 0
#     SGD = 1
#     ADAGRAD = 2
#     RMSPROP = 3
#     ADAM = 4
#
#
# class LargeDimMethod(enum.Enum):
#     DIAGONAL = 0
#     ADAGRAD = 1
#     BLOCKING = 2


================================================
FILE: cramming/config/train/optim_mod/disabled.yaml
================================================
name: none


================================================
FILE: cramming/config/train/optim_mod/larc.yaml
================================================
name: LARC

trust_coefficient: 0.02
clip: True
eps: 1e-8


================================================
FILE: cramming/config/train/optim_mod/lars.yaml
================================================
name: LARS

trust_coefficient: 0.02
clip: False
eps: 1e-8


================================================
FILE: cramming/config/train/optim_mod/progressive.yaml
================================================
name: progressive-batching

progress_rule: norm-based

monotone: False
theta: 0.9

min_sample_guard: 2
max_sample_guard: 128


================================================
FILE: cramming/config/train/optim_mod/sam.yaml
================================================
name: SAM
rho: 0.05


================================================
FILE: cramming/config/wandb/default.yaml
================================================
enabled: True
entity: placeholder # change this obviously ;>
project: arithmetic
tags: []


================================================
FILE: cramming/config/wandb/none.yaml
================================================
enabled: False
entity:
project:
tags: []


================================================
FILE: cramming/data/__init__.py
================================================
"""This module handles and hides the data away ;)"""

from .pretraining_preparation import load_pretraining_corpus, prepare_dataloaders


================================================
FILE: cramming/data/arithmetic_tokenizers.py
================================================
"""
Character level tokenizers for arithmetic projects
Multiple tokenizers for different tasks
"""

from transformers import PreTrainedTokenizer
import re
import torch
import random

class CustomCharLevelTokenizerForAddingPadding(PreTrainedTokenizer):
    """Simple char level math tokenizer.

    Each character of the input is one token. The vocabulary consists of the
    four special tokens ([PAD]=0, [UNK]=1, [BOS]=2, [EOS]=3) followed by the
    characters ``0123456789+-x= `` with ids starting at 4.

    Note that ``_tokenize`` rewrites a space to ``[PAD]``, so spaces in the
    input are encoded with the padding id, and ``decode`` strips ``[PAD]``
    again, so they do not survive a round trip.
    """
    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the ``PreTrainedTokenizer`` base.

        ``kwargs`` are forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Define the characters to tokenize
        characters = '0123456789+-x= '

        # Define and set special tokens
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # Combine characters and special tokens to form the custom vocabulary
        self.vocab = {char: i + 4 for i, char in enumerate(characters)}  # Starting from 4 to account for special tokens
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Create the reverse mapping from IDs to tokens
        self.ids_to_tokens = {id: token for token, id in self.vocab.items()}

        super().__init__(**kwargs)

        # NOTE(review): everything below repeats the setup from before super().__init__().
        # Presumably the vocabulary must exist while the base class initialises and is
        # re-applied afterwards in case the base class reset it -- confirm against the
        # installed transformers version before removing either copy.

        # Define and set special tokens
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # Combine characters and special tokens to form the custom vocabulary
        self.vocab = {char: i + 4 for i, char in enumerate(characters)}  # Starting from 4 to account for special tokens
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Create the reverse mapping from IDs to tokens
        self.ids_to_tokens = {id: token for token, id in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token -> id dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters outside the vocabulary become the unknown token; a space
        becomes the padding token.
        """
        # Tokenize the text character by character
        # text = re.sub('\s+',' ',text)
        temp = [char if char in self.vocab else self.unk_token for char in text]
        # Spaces are treated as padding rather than as a token of their own
        temp = [item.replace(' ', '[PAD]') for item in temp]
        return temp

    def _convert_token_to_id(self, token):
        """Return the id of ``token``, falling back to the unknown-token id."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, falling back to the unknown token."""
        # Convert an ID to its corresponding token
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` and return ``{"input_ids": [...]}``.

        This overrides the base-class call; ``kwargs`` are accepted but not used,
        and no special tokens are added.
        """
        # Tokenize text and convert to input IDs
        tokens = self._tokenize(text)
        input_ids = [self._convert_token_to_id(token) for token in tokens]
        return {"input_ids": input_ids}

    def decode(self, token_ids, **kwargs):
        """Turn an iterable of ids back into a string.

        The literal [PAD], [BOS] and [EOS] tokens are removed from the result;
        ``kwargs`` are accepted but not used.
        """
        # Convert token IDs to tokens and join into a string
        tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]
        return ''.join(tokens).replace(self.pad_token, '').replace(self.bos_token, '').replace(self.eos_token, '')


class CustomCharLevelTokenizerForAddingPaddingWithIndexHints(PreTrainedTokenizer):
    """Tokenizer for index hints.

    Character-level tokenizer whose vocabulary is the arithmetic characters
    ``0123456789+-x= `` followed by the extra symbols in ``self.char_set``.
    The four special tokens take ids 0-3 ([PAD], [UNK], [BOS], [EOS]) and the
    characters are numbered from 4 in the order they appear.

    A space in the input is encoded as [PAD], and ``decode`` strips [PAD],
    [BOS] and [EOS] from its output.
    """
    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the ``PreTrainedTokenizer`` base.

        ``kwargs`` are forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Define the characters to tokenize
        characters = '0123456789+-x= '
        # Extra symbols appended to the vocabulary. Lowercase 'x' is not repeated here;
        # it is already part of the arithmetic characters above.
        self.char_set = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwyz!@£#$%^&*()~?.,<>{}[]:;/|βΓΔδεζηθκΛλμΞξΠπΣςτΦφχΨψΩω"
        characters = characters + self.char_set

        # Define and set special tokens
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # Combine characters and special tokens to form the custom vocabulary
        self.vocab = {char: i + 4 for i, char in enumerate(characters)}  # Starting from 4 to account for special tokens
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Create the reverse mapping from IDs to tokens
        self.ids_to_tokens = {id: token for token, id in self.vocab.items()}

        super().__init__(**kwargs)

        # NOTE(review): everything below repeats the setup from before super().__init__().
        # Presumably the vocabulary must exist while the base class initialises and is
        # re-applied afterwards in case the base class reset it -- confirm against the
        # installed transformers version before removing either copy.

        # Define and set special tokens
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # Combine characters and special tokens to form the custom vocabulary
        self.vocab = {char: i + 4 for i, char in enumerate(characters)}  # Starting from 4 to account for special tokens
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Create the reverse mapping from IDs to tokens
        self.ids_to_tokens = {id: token for token, id in self.vocab.items()}
        
    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token -> id dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters outside the vocabulary become the unknown token; a space
        becomes the padding token.
        """
        # Tokenize the text character by character
        # text = re.sub('\s+',' ',text)
        temp = [char if char in self.vocab else self.unk_token for char in text]
        # Spaces are treated as padding rather than as a token of their own
        temp = [item.replace(' ', '[PAD]') for item in temp]
        return temp

    def _convert_token_to_id(self, token):
        """Return the id of ``token``, falling back to the unknown-token id."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, falling back to the unknown token."""
        # Convert an ID to its corresponding token
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` and return ``{"input_ids": [...]}``.

        This overrides the base-class call; ``kwargs`` are accepted but not used,
        and no special tokens are added.
        """
        # Tokenize text and convert to input IDs
        tokens = self._tokenize(text)
        input_ids = [self._convert_token_to_id(token) for token in tokens]
        return {"input_ids": input_ids}

    def decode(self, token_ids, **kwargs):
        """Turn an iterable of ids back into a string.

        The literal [PAD], [BOS] and [EOS] tokens are removed from the result;
        ``kwargs`` are accepted but not used.
        """
        # Convert token IDs to tokens and join into a string
        tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]
        return ''.join(tokens).replace(self.pad_token, '').replace(self.bos_token, '').replace(self.eos_token, '')

class CustomCharLevelTokenizerSort(PreTrainedTokenizer):
    """Tokenizer for sorting.

    Character-level tokenizer whose vocabulary is the characters
    ``0123456789D,:= `` followed by the extra symbols in ``self.char_set``.
    The four special tokens take ids 0-3 ([PAD], [UNK], [BOS], [EOS]) and the
    characters are numbered from 4 in the order they appear.

    A space in the input is encoded as [PAD], and ``decode`` strips [PAD],
    [BOS] and [EOS] from its output.
    """
    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the ``PreTrainedTokenizer`` base.

        ``kwargs`` are forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Define the characters to tokenize
        characters = '0123456789D,:= '
        # Extra symbols appended to the vocabulary. 'D', ',' and ':' are not repeated
        # here because they are already in the base characters above; lowercase 'x'
        # is not part of this list either.
        set_of_chars = ['A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
                        'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', '!', '@', '£', '#', '$', '%', '^',
                        '&', '*', '(', ')', '~', '?', '.', '<', '>', '{', '}', '[', ']', ';', '/', '|', 'β', 'Γ', 'Δ',
                        'δ', 'ε', 'ζ', 'η', 'θ', 'κ', 'Λ', 'λ', 'μ', 'Ξ', 'ξ', 'Π', 'π', 'Σ', 'ς', 'τ', 'Φ', 'φ', 'χ',
                        'Ψ', 'ψ', 'Ω', 'ω']
        self.char_set = ''.join(set_of_chars)
        characters = characters + self.char_set

        # Define and set special tokens
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # Combine characters and special tokens to form the custom vocabulary
        self.vocab = {char: i + 4 for i, char in enumerate(characters)}  # Starting from 4 to account for special tokens
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Create the reverse mapping from IDs to tokens
        self.ids_to_tokens = {id: token for token, id in self.vocab.items()}

        super().__init__(**kwargs)

        # NOTE(review): everything below repeats the setup from before super().__init__().
        # Presumably the vocabulary must exist while the base class initialises and is
        # re-applied afterwards in case the base class reset it -- confirm against the
        # installed transformers version before removing either copy.

        # Define and set special tokens
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # Combine characters and special tokens to form the custom vocabulary
        self.vocab = {char: i + 4 for i, char in enumerate(characters)}  # Starting from 4 to account for special tokens
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Create the reverse mapping from IDs to tokens
        self.ids_to_tokens = {id: token for token, id in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token -> id dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters outside the vocabulary become the unknown token; a space
        becomes the padding token.
        """
        # Tokenize the text character by character
        temp = [char if char in self.vocab else self.unk_token for char in text]
        # Spaces are treated as padding rather than as a token of their own
        temp = [item.replace(' ', '[PAD]') for item in temp]
        return temp

    def _convert_token_to_id(self, token):
        """Return the id of ``token``, falling back to the unknown-token id."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, falling back to the unknown token."""
        # Convert an ID to its corresponding token
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` and return ``{"input_ids": [...]}``.

        This overrides the base-class call; ``kwargs`` are accepted but not used,
        and no special tokens are added.
        """
        # Tokenize text and convert to input IDs
        tokens = self._tokenize(text)
        input_ids = [self._convert_token_to_id(token) for token in tokens]
        return {"input_ids": input_ids}

    def decode(self, token_ids, **kwargs):
        """Turn an iterable of ids back into a string.

        The literal [PAD], [BOS] and [EOS] tokens are removed from the result;
        ``kwargs`` are accepted but not used.
        """
        # Convert token IDs to tokens and join into a string
        tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]
        return ''.join(tokens).replace(self.pad_token, '').replace(self.bos_token, '').replace(self.eos_token, '')


================================================
FILE: cramming/data/curriculum_sorting.py
================================================
"""Baseline curricula."""
import torch
import numpy as np

import logging

log = logging.getLogger(__name__)


def _sort_tokenized_dataset_by_unigram(tokenized_dataset, tokenizer, num_threads=1, ngram=1, reverse=False):
    """Reorder a tokenized dataset by the unigram log-probability of each sequence.

    Token frequencies are counted over the whole dataset, smoothed with add-one
    smoothing, and every sequence is scored by the sum of its tokens' log2
    probabilities divided by ``tokenizer.model_max_length``. Rows are returned in
    ascending score order, or descending when ``reverse`` is true.

    ``num_threads`` only parallelises the scoring pass; the counting pass mutates a
    shared array through a closure and therefore always runs in-process.
    ``ngram`` is accepted for interface compatibility but is not used.
    """
    # Settings shared by both passes over the dataset.
    map_setup = dict(batched=True, batch_size=1024, load_from_cache_file=False)

    token_counts = np.zeros(tokenizer.vocab_size, dtype=np.int64)

    def count_unigrams(examples):
        # Accumulate in place: the counts live outside the function as a side effect.
        flat_ids = np.asarray(examples["input_ids"]).reshape(-1)
        batch_counts = np.bincount(flat_ids, minlength=tokenizer.vocab_size)
        np.add(token_counts, batch_counts, out=token_counts)

    tokenized_dataset.map(count_unigrams, desc="Counting token unigrams", **map_setup, num_proc=None)

    # Add-k smoothing (k = 1) so that unseen tokens keep a finite log-probability.
    k = 1
    total_tokens = token_counts.sum()
    smoothed = (token_counts + k) / (total_tokens + k * tokenizer.vocab_size)
    log2_probs = np.log2(smoothed)

    def return_seq_prob(examples):
        ids = np.asarray(examples["input_ids"])
        per_sequence = log2_probs[ids].sum(axis=1)
        return dict(scores=per_sequence / tokenizer.model_max_length)

    scoring_workers = num_threads if num_threads > 0 else None
    dataset_probs = tokenized_dataset.map(
        return_seq_prob,
        desc="Computing log probs per sequence",
        remove_columns=tokenized_dataset.column_names,
        **map_setup,
        num_proc=scoring_workers,
    )

    ordering = np.argsort(np.asarray(dataset_probs["scores"]))
    ordering = ordering[::-1] if reverse else ordering

    return tokenized_dataset.select(indices=ordering, writer_batch_size=1024)


def _sort_tokenized_dataset_by_token(tokenized_dataset, tokenizer, target_token_id, num_threads=1):
    """Reorder a tokenized dataset so sequences with the most ``target_token_id`` come first.

    Args:
        tokenized_dataset: dataset exposing ``map``/``select`` whose rows carry ``input_ids``.
        tokenizer: used only to render the token and the example sentences for logging.
        target_token_id: token id whose occurrences are counted per sequence.
        num_threads: worker processes for the counting pass; values <= 0 run in-process.

    Returns:
        The dataset re-selected in descending order of occurrence count.
    """
    map_setup = dict(
        batched=True,
        batch_size=1024,
        num_proc=num_threads if num_threads > 0 else None,
        load_from_cache_file=False,
        # keep_in_memory=True,
    )

    def count_token(examples):
        return dict(counts=(np.asarray(examples["input_ids"]) == target_token_id).sum(axis=1))

    dataset_counts = tokenized_dataset.map(
        count_token,
        desc=f"Counting occurrences of token {tokenizer.decode(target_token_id)}",
        remove_columns=tokenized_dataset.column_names,
        **map_setup,
    )

    # argsort is ascending, so reverse it to put the highest counts first.
    new_order = np.argsort(np.asarray(dataset_counts["counts"]))[::-1]

    def decode_example(position):
        """Decode the sentence found at ``position`` of the new ordering."""
        sentence_idx = int(new_order[position])
        input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze()  # squeeze because hf has leading dim
        return tokenizer.batch_decode(input_data[None])[0]

    log.info("Sentence with most occurrences of token ...")
    log.info(decode_example(0))

    log.info("Sentence with least occurrences of token ...")
    log.info(decode_example(-1))

    return tokenized_dataset.select(indices=new_order, writer_batch_size=1024)


def _sort_tokenized_dataset_by_word_length(tokenized_dataset, tokenizer, num_threads=1):
    """Reorder a tokenized dataset by the character length of each decoded sequence.

    Args:
        tokenized_dataset: dataset exposing ``map``/``select`` whose rows carry ``input_ids``.
        tokenizer: used to decode every sequence back to text before measuring it.
        num_threads: worker processes for the measuring pass; values <= 0 run in-process.

    Returns:
        The dataset re-selected with the shortest decoded sequences first.
    """
    map_setup = dict(
        batched=True,
        batch_size=1024,
        num_proc=num_threads if num_threads > 0 else None,
        load_from_cache_file=False,
    )

    def count_word_lengths(examples):
        return dict(lengths=[len(s) for s in tokenizer.batch_decode(torch.as_tensor(examples["input_ids"]))])

    dataset_counts = tokenized_dataset.map(
        count_word_lengths,
        desc="Counting word lengths per sequence",
        remove_columns=tokenized_dataset.column_names,
        **map_setup,
    )

    new_order = np.argsort(np.asarray(dataset_counts["lengths"]))  # shortest sentences first

    def decode_example(position):
        """Decode the sentence found at ``position`` of the new ordering."""
        sentence_idx = int(new_order[position])
        input_data = torch.as_tensor(tokenized_dataset[sentence_idx]["input_ids"]).squeeze()  # squeeze because hf has leading dim
        return tokenizer.batch_decode(input_data[None])[0]

    log.info("Sentence with shortest length ...")
    log.info(decode_example(0))

    log.info("and longest ...")
    log.info(decode_example(-1))

    return tokenized_dataset.select(indices=new_order, writer_batch_size=1024)


================================================
FILE: cramming/data/deduplicate.py
================================================
"""This is glue code to connect to the rust-based deduplication of https://github.com/google-research/deduplicate-text-datasets
there is probably a smart way to implement deduplication for huggingface datasets directly,
but this is just a dumb dump-everything-into-tmp-files solution.

Code based on branch https://github.com/google-research/deduplicate-text-datasets/tree/dev-v1
See original license below.
"""

"""Installation how-to:
cargo install --target-dir ../cramming/dedup
Make sure that path_to_rust_code is set to the correct value if installing differently
"""

# ORIGINAL LICENSE:

# Copyright 2021 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datasets

import os
import numpy as np
from tqdm import tqdm

import time
import tempfile

import torch


def deduplicate_huggingface_dataset(dataset, threshold=100, original_cwd="."):
    """Run exact-substring deduplication (as in Lee et al.) on a dataset with a "text" column.

    The dataset is dumped into a temporary directory, the external ``dedup_dataset``
    binary expected under ``<original_cwd>/dedup/release`` builds a suffix array and
    reports duplicated spans of at least ``threshold``, and a new dataset without
    those spans is assembled and returned.
    """
    path_to_rust_code = os.path.join(original_cwd, "dedup", "release")
    with tempfile.TemporaryDirectory() as tmpdir:
        text_file = _write_tmp_file(dataset, dirname=tmpdir)
        _make_suffix_array(text_file, tmpdir, path_to_rust_code)

        # The remaining stages are invoked straight from the shell.
        binary = f"{path_to_rust_code}/dedup_dataset"
        options = f"--length-threshold {threshold} --cache-dir {tmpdir}/cache/"
        drop_tokens_file = f"{tmpdir}/drop_tokens_file"

        print("Finding self-similar parts...")
        find_cmd = f"{binary} self-similar --data-file {text_file} {options} --num-threads {torch.get_num_threads()}"
        os.popen(find_cmd).read()

        print("Collect self-similar from all parts...")
        # The collect stage prints the byte ranges to drop; capture them in a file.
        collect_cmd = f"{binary} collect --data-file {text_file} {options}> {drop_tokens_file}"
        os.popen(collect_cmd).read()

        deduplicated = _finish_and_return_to_hf_dataset(text_file, drop_tokens_file)
    return deduplicated


def _write_tmp_file(dataset, dirname):
    """Write every example's "text" field, each terminated by "<EOT>", into one UTF-8 file.

    The file is created inside ``dirname`` and its path is returned.
    """
    target_path = os.path.join(dirname, "tmp_full_dataset_as_text")

    with open(target_path, "wb") as sink:
        # Examples are streamed one at a time (not batched).
        for record in tqdm(dataset, desc="Writing dataset to tmp files."):
            terminated = record["text"] + "<EOT>"
            sink.write(terminated.encode("utf-8"))
    return target_path


def _part_needs_rebuild(part_file):
    """Return True when a part file or its suffix table is missing, empty or of unexpected size."""
    table_file = part_file + ".table.bin"
    # Check existence first: os.path.getsize raises on a missing file, which would
    # abort the whole run instead of scheduling the part for another attempt.
    if not os.path.exists(part_file) or not os.path.exists(table_file):
        return True
    size_data = os.path.getsize(part_file)
    size_table = os.path.getsize(table_file)
    if size_data == 0 or size_table == 0:
        return True
    # The table is expected to hold ceil(log2(size) / 8) bytes per byte of data.
    FACT = np.ceil(np.log(size_data) / np.log(2) / 8)
    return size_data * FACT != size_table


def _make_suffix_array(text_file, tmpdir, path_to_rust_code):
    """Build the suffix array for ``text_file`` with the external ``dedup_dataset`` binary.

    The file is cut into byte ranges, ``make-part`` is launched for each range (several
    at once, depending on file size), parts whose output is missing or has the wrong
    size are rerun, and finally all parts are merged into ``<text_file>.table.bin``.

    Args:
        text_file: path of the concatenated text dump.
        tmpdir: scratch directory that receives the intermediate merge output.
        path_to_rust_code: directory containing the ``dedup_dataset`` executable.

    Raises:
        RuntimeError: if some parts still fail validation after repeated reruns
            (for example because the executable is not installed).
    """
    data_size = os.path.getsize(text_file)
    HACK = 100000  # every part extends this many bytes past its nominal end (clamped to the file size)
    MAX_RERUNS = 10  # upper bound on validation rounds so a broken setup cannot loop forever

    started = []

    if data_size > 10e9:
        total_jobs = 100
        jobs_at_once = 20
    elif data_size > 1e9:
        total_jobs = 96
        jobs_at_once = 96
    elif data_size > 10e6:
        total_jobs = 4
        jobs_at_once = 4
    else:
        total_jobs = 4
        jobs_at_once = 1

    def make_part_command(s, e):
        return f"{path_to_rust_code}/dedup_dataset make-part --data-file {text_file} --start-byte {s} --end-byte {e}"

    def wait_for(jobs):
        # Reading to EOF blocks until the process is done; close() releases the pipe.
        for job in jobs:
            job.read()
            job.close()

    S = data_size // total_jobs
    print("Partition into parts and create suffix arrays...")
    for jobstart in range(0, total_jobs, jobs_at_once):
        wait = []
        for i in range(jobstart, jobstart + jobs_at_once):
            s, e = i * S, min((i + 1) * S + HACK, data_size)
            started.append((s, e))
            wait.append(os.popen(make_part_command(s, e)))

            if e == data_size:
                break

        print("Waiting for jobs to finish")
        wait_for(wait)

    print("Checking all wrote correctly")

    files = [f"{text_file}.part.{s}-{e}" for s, e in started]
    for _ in range(MAX_RERUNS):
        wait = [os.popen(make_part_command(s, e)) for x, (s, e) in zip(files, started) if _part_needs_rebuild(x)]
        print("Rerunning", len(wait), "jobs because they failed.")
        wait_for(wait)
        time.sleep(1)
        if len(wait) == 0:
            break
    else:
        raise RuntimeError(
            f"Suffix array parts for {text_file} still invalid after {MAX_RERUNS} reruns; "
            f"check that {path_to_rust_code}/dedup_dataset is installed and runnable."
        )

    print("Merging suffix trees")

    torun = " --suffix-path ".join(files)
    options = f"--output-file {tmpdir}/out.table.bin --suffix-path {torun} --num-threads {torch.get_num_threads()}"
    merge_cmd = f"{path_to_rust_code}/dedup_dataset merge {options}"
    print(merge_cmd)
    wait_for([os.popen(merge_cmd)])
    print("Now merging individual tables")
    wait_for([os.popen(f"cat {tmpdir}/out.table.bin.* > {tmpdir}/out.table.bin")])
    print("Cleaning up")
    wait_for([os.popen(f"mv {tmpdir}/out.table.bin {text_file}.table.bin")])


def _finish_and_return_to_hf_dataset(original_text_file, remove_file_cache):
    """Rebuild a dataset from the text dump, leaving out the byte ranges marked for removal.

    For simplicity the entire new dataset has to fit into memory.

    Args:
        original_text_file: path of the "<EOT>"-delimited text dump.
        remove_file_cache: path of the captured ``collect`` output. Everything up to and
            including the first line containing "out" is skipped; each later non-blank
            line holds a "start end" pair of byte offsets to drop.

    Returns:
        A ``datasets.Dataset`` with a single "text" column.
    """
    with open(remove_file_cache) as fin:
        # Skip the preamble; the ranges start after the first line mentioning "out".
        for line in fin:
            if "out" in line:
                break
        # Blank lines carry no range and would otherwise break the pair unpacking below.
        remove = [list(map(int, line.split())) for line in fin if line.strip()]

    print(f"Number of removal tuples is {len(remove)}")

    texts = []
    with open(original_text_file, "rb") as original_dataset:
        start = 0
        buffer = ""
        for a, b in tqdm(remove, desc="Writing deduplicated data back to hf dataset"):
            # Offsets are byte positions and may fall inside a multi-byte character,
            # so undecodable fragments at the cut points are dropped.
            buffer += original_dataset.read(a - start).decode("utf-8", errors="ignore")
            original_dataset.seek(b)
            start = b

            # Everything before the last "<EOT>" is complete; keep the remainder buffered.
            *complete, buffer = buffer.split("<EOT>")
            texts.extend(complete)
        # The tail also starts at a byte offset, so it needs the same lenient decoding.
        tail = original_dataset.read().decode("utf-8", errors="ignore")
        texts.extend((buffer + tail).split("<EOT>")[:-1])

    return datasets.Dataset.from_dict(dict(text=texts))


================================================
FILE: cramming/data/pretraining_preparation.py
================================================
"""Prepare and preprocess datasets."""

import torch
import datasets
import hydra
import pandas as pd
import os
import contextlib
import logging
import tempfile
from itertools import chain
from collections import defaultdict

import json
from omegaconf import OmegaConf

from .tokenizer_preparation import construct_tokenizer, load_tokenizer
from .curriculum_sorting import _sort_tokenized_dataset_by_unigram, _sort_tokenized_dataset_by_token, _sort_tokenized_dataset_by_word_length
from .deduplicate import deduplicate_huggingface_dataset
from .utils import checksum_config, stage_dataset, detailed_OSError
from .tokenizer_preparation import get_tokenizer


import random
import transformers

from datasets.distributed import split_dataset_by_node
import random

from torch.utils.data import DataLoader
from typing import Dict


# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# These two calls change process-wide `datasets` settings as a side effect of importing this module.
datasets.enable_progress_bar()
datasets.disable_caching()  # We'll save only the final preprocessed dataset

# Use the GPU when torch reports that CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


def get_num_workers(cfg_impl):
    """Return how many worker processes to use given the implementation config.

    Yields 0 when no config is given or when ``cfg_impl.threads`` is not positive.
    Otherwise the torch thread count is divided evenly among the visible CUDA
    devices (at least one) and capped at ``cfg_impl.threads``.
    """
    if cfg_impl is None or not (cfg_impl.threads > 0):
        return 0
    device_count = max(1, torch.cuda.device_count())
    threads_per_device = torch.get_num_threads() // device_count
    return min(threads_per_device, cfg_impl.threads)


def load_pretraining_corpus(cfg_data, cfg_impl, data_dir: str = None):
    """Load (and optionally stage) a pre-processed corpus. Create one if it doesn't exist.

    The provider of the first entry in ``cfg_data.sources`` selects the loading strategy:

    * ``"fake"``: random token data from ``_load_fake_dataset``.
    * ``"hub"``: a ready-made dataset pulled via ``_load_from_hub``.
    * ``"arithmetic"``: a tokenized dataset read from ``data_path/<tokenized_dataset_path>`` together with the
      arithmetic tokenizer named by the source's ``tokenizer_type``.
    * anything else: a processed dataset is loaded from ``data_path``; if it is missing (FileNotFoundError), the
      corpus is preprocessed, saved, and reloaded from where it was saved.

    Args:
        cfg_data: Data configuration (reads ``sources``, ``seq_length``, ``vocab_size``, ``name``).
        cfg_impl: Implementation configuration (reads ``path``, ``local_staging_dir``, ``threads``,
            ``forbid_dataset_preprocessing``, ``temporary_corpus``, ``max_raw_chunk_size``).
        data_dir: Directory to load data from; defaults to ``cfg_impl.path`` when None.

    Returns:
        A ``(dataset, tokenizer)`` tuple.

    Raises:
        ValueError: If the processed dataset is missing and preprocessing is forbidden.
    """
    datasets.disable_caching()
    checksum = checksum_config(cfg_data)

    data_path = data_dir
    if data_path is None:
        data_path = cfg_impl.path
    data_src = list(cfg_data.sources.values())[0]
    provider = data_src["provider"]
    if provider == "fake":
        # Shortcut for fake data
        return _load_fake_dataset(cfg_data, data_src, path=cfg_impl.path)
    elif provider == "hub":
        # pulling from huggingface
        return _load_from_hub(cfg_data, data_path)
    elif provider == "arithmetic":
        # our math data
        # Only arithmetic sources carry a tokenizer type, so it is looked up here rather than for every provider.
        tokenizer_type = data_src["tokenizer_type"]
        tokenized_dataset_path = data_src["tokenized_dataset_path"]
        tokenized_dataset_path = os.path.join(data_path, tokenized_dataset_path)
        print(f"Loading tokenized dataset from {tokenized_dataset_path}")
        tokenized_data = load_tokenized_data(tokenized_dataset_path)
        print(f"Loaded tokenized dataset from {tokenized_dataset_path}")
        tokenizer = get_tokenizer(tokenizer_type)
        print(f"Loaded tokenizer {tokenizer_type}")
        tokenizer.model_max_length = cfg_data["seq_length"]  # not perfect but better than nothing
        return tokenized_data, tokenizer
    else:
        # not found so creating
        try:
            if cfg_impl.local_staging_dir is not None:
                with main_process_first():
                    data_path = stage_dataset(data_path, cfg_impl.local_staging_dir)
            # Load already processed dataset
            tokenized_dataset = datasets.load_from_disk(data_path)
            tokenizer = load_tokenizer(
                os.path.join(data_path, "tokenizer"),
                seq_length=cfg_data.seq_length,
                vocab_size=cfg_data.vocab_size,
                cache_dir=cfg_impl.path,
            )
        except FileNotFoundError:
            if cfg_impl.forbid_dataset_preprocessing:
                raise ValueError(
                    f"Cannot find processed at path {data_path}. Dataset preprocessing disabled. "
                    "Dataset preprocessing can be enabled with 'impl.forbid_dataset_preprocessing=False'."
                )
            # Run preprocessing to create dataset
            with main_process_first():
                num_threads = min(torch.get_num_threads(), cfg_impl.threads)  # Mitigate worker overloading
                preprocessed_dataset, new_tokenizer = preprocess_dataset(
                    cfg_data,
                    download_path=cfg_impl.path,
                    num_threads=num_threads,
                    max_raw_chunk_size=cfg_impl.max_raw_chunk_size,
                )

                def save_corpus(path):
                    preprocessed_dataset.save_to_disk(path)
                    new_tokenizer.save_pretrained(os.path.join(path, "tokenizer"))
                    with open(os.path.join(path, "model_config.json"), "w") as file:
                        json.dump(OmegaConf.to_container(cfg_data, resolve=True), file)

                # Directory name for the freshly processed corpus, unique per data configuration.
                # NOTE(review): a later run only finds this corpus again if `data_dir` points at it - confirm
                # whether the initial load above should also look here.
                processed_dataset_dir = f"{cfg_data.name}_{checksum}"
                if not cfg_impl.temporary_corpus:
                    # Save to base directory and reload from there:
                    data_path = os.path.join(cfg_impl.path, processed_dataset_dir)
                    save_corpus(data_path)
                    if cfg_impl.local_staging_dir is not None:
                        # Optionally also copy into local staging directory
                        data_path = stage_dataset(data_path, cfg_impl.local_staging_dir)
                else:
                    # Directly use staging directory
                    data_path = os.path.join(cfg_impl.local_staging_dir, processed_dataset_dir)
                    save_corpus(data_path)

            # Reload dataset from the location it was just written to
            tokenized_dataset = datasets.load_from_disk(data_path)
            tokenizer = load_tokenizer(
                os.path.join(data_path, "tokenizer"),
                seq_length=cfg_data.seq_length,
                vocab_size=cfg_data.vocab_size,
                cache_dir=cfg_impl.path,
            )

    # Cast to tensors after loading from arrow:
    tokenized_dataset.set_format("torch")

    # 4) Log overviews so we always know what's going on with weird tokenization tricks
    dataset_size = len(tokenized_dataset["train"])
    random_sentence_idx = torch.randint(0, dataset_size, (1,)).item()
    input_data = tokenized_dataset["train"][random_sentence_idx]["input_ids"].squeeze()  # squeeze because hf has leading dim

    log.info(f"Random sentence with seq_length {tokenizer.model_max_length} from dataset of size {dataset_size:,}: ...")
    log.info(tokenizer.batch_decode(input_data[None])[0])
    log.info("above is tokenized into below with _ joined to every token")
    log.info("_".join(tokenizer.decode(t) for t in input_data))
    return tokenized_dataset, tokenizer

def load_tokenized_data(tokenized_dataset_path):
    """Read a tokenized dataset from ``tokenized_dataset_path`` via ``datasets.load_from_disk``."""
    return datasets.load_from_disk(tokenized_dataset_path)

def convert_to_hf_dataset(tokenized_data):
    """Wrap tokenized data in a Hugging Face dataset with a single ``tokens`` column.

    ``tokenized_data`` must provide ``.tolist()``; each element of the resulting list becomes one row.
    """
    # Go through a DataFrame, which `datasets` can ingest directly.
    frame = pd.DataFrame({"tokens": tokenized_data.tolist()})
    return datasets.Dataset.from_pandas(frame)

def preprocess_dataset(cfg_data, download_path, num_threads=1, max_raw_chunk_size=1e14):
    """Collect all configured raw sources, then tokenize them into one dataset.

    Every entry of ``cfg_data.sources`` is loaded (from the Hugging Face hub or from local files), optionally
    stripped of columns, filtered, truncated and concatenated. The combined data is shuffled, passed through
    ``raw_dataset_preprocessing``, and tokenized with a tokenizer built by ``construct_tokenizer``.

    Returns:
        A ``(tokenized_dataset, tokenizer)`` tuple.

    Raises:
        ValueError: If a source names a provider other than "huggingface" or "local".
    """
    # 1) Collect raw source datasets
    collected = []
    for name, details in cfg_data.sources.items():
        log.info(f"Now preparing source {name}...")
        if details.provider == "huggingface":
            # Only this one hub dataset takes its partition as a config name; all others take it as data_dir.
            partition_key = "name" if name == "EleutherAI/proof-pile-2" else "data_dir"
            hub_kwargs = {
                partition_key: details.partition,
                "split": details.split,
                "cache_dir": download_path,
                "streaming": details.streaming,
            }
            raw_dataset = datasets.load_dataset(name, **hub_kwargs)
        elif details.provider == "local":
            local_splits = datasets.load_dataset(details.file_type, data_files=details.files, streaming=details.streaming)
            raw_dataset = local_splits[details.split]
        else:
            raise ValueError(f"Invalid data provider {details.provider} given.")

        # remove columns that break later processing steps
        if details.remove_columns is not None:
            raw_dataset = raw_dataset.remove_columns(details.remove_columns)

        # Optional metadata filter
        filter_spec = getattr(details, "filter", None)
        if filter_spec is not None:

            def keep_entry(entry):
                """Assume a metadata key 'meta' is present"""
                return any(entry["meta"][key] in values for key, values in filter_spec.items())

            raw_dataset = raw_dataset.filter(keep_entry)

        # move streams to fixed datasets to make everything sane (and to allow concatenation with unstreamed data)
        if details.streaming:
            raw_dataset = raw_dataset.take(int(cfg_data.max_entries_in_raw_dataset))
            raw_dataset = _move_stream_to_fixed_map(raw_dataset, cfg_data.max_entries_in_raw_dataset, max_raw_chunk_size)
        elif cfg_data.max_entries_in_raw_dataset < len(raw_dataset):
            raw_dataset = raw_dataset.select(range(int(cfg_data.max_entries_in_raw_dataset)))

        # concatenate dataset that were cut into pieces that are too small
        if details.concatenate_successive_entries > 0:
            raw_dataset = _concatenate_entries(raw_dataset, details.concatenate_successive_entries, num_threads=num_threads)
        collected.append(raw_dataset)

    # 2) Preprocess and tokenize
    # Shuffle once here so that multiproc has shards of similar size!
    # This shuffle is crucial for fast multiprocessing tokenization
    # because datasets.map uses a contiguous sharding under the hood.
    raw_data = datasets.concatenate_datasets(collected).shuffle(seed=89)

    # However, we also shuffle so we can now select a smaller range:
    if cfg_data.max_entries_in_raw_dataset < len(raw_data):
        raw_data = raw_data.select(range(int(cfg_data.max_entries_in_raw_dataset)))

    raw_data = raw_dataset_preprocessing(raw_data, num_threads, cfg_data)  # This is by default a no-op, but can be dedup, filtering...
    tokenizer = construct_tokenizer(raw_data, cfg_data, path=download_path)
    # Tokenize, group, sort...
    return _huggingface_preprocessing(raw_data, tokenizer, cfg_data, num_threads=num_threads), tokenizer


def _move_stream_to_fixed_map(raw_data_streamed, max_entries_in_raw_dataset, max_raw_chunk_size=1e14):
    """Save streaming dataset to a fixed mapping-style database.

    Only the ``"text"`` field of each streamed entry is kept. If ``max_raw_chunk_size`` exceeds
    ``max_entries_in_raw_dataset`` the whole stream is written in one go; otherwise it is written in chunks of
    ``max_raw_chunk_size`` entries which are concatenated at the end. Entries left over after the last full chunk
    are written as a final, shorter chunk.

    Args:
        raw_data_streamed: Iterable of entries that can be indexed with ``"text"``.
        max_entries_in_raw_dataset: Expected upper bound on the number of entries (used for the mode switch and logs).
        max_raw_chunk_size: Number of entries held in RAM before a chunk is written to disk.

    Returns:
        The mapped dataset loaded back from disk.
    """
    # I'm tired of IterableDatasets and will take the performance hit to write them out instead:
    # NOTE(review): the paths below are built by string concatenation (tmpdirname + "raw_data"), so they are
    # siblings of the temporary directory rather than inside it, and are not removed when it is cleaned up.
    # This may be intentional, since the returned dataset is loaded from these files - confirm before changing.
    try:
        if max_raw_chunk_size > max_entries_in_raw_dataset:
            with tempfile.TemporaryDirectory() as tmpdirname:
                datasets.Dataset.from_dict(dict(text=[v["text"] for v in raw_data_streamed])).save_to_disk(tmpdirname + "raw_data")
                raw_data_mapped = datasets.load_from_disk(tmpdirname + "raw_data")
            # This used to be only a move into RAM but this breaks memory later using C4:
            # raw_data = datasets.Dataset.from_dict(dict(text=[v["text"] for v in raw_data]))
            return raw_data_mapped
        else:
            with tempfile.TemporaryDirectory() as tmpdirname:
                mapped_sets = []
                data_in_RAM = defaultdict(list)

                def flush_chunk(idx):
                    """Write the entries currently held in RAM to disk and keep the reloaded dataset."""
                    chunk_path = tmpdirname + "raw_data" + str(idx)
                    datasets.Dataset.from_dict(data_in_RAM).save_to_disk(chunk_path)
                    mapped_sets.append(datasets.load_from_disk(chunk_path))
                    log.info(f"Saved temporary copy at idx {idx} of {max_entries_in_raw_dataset} at {chunk_path}.")
                    data_in_RAM["text"] = []

                idx = -1
                for idx, value_stream in enumerate(raw_data_streamed):
                    data_in_RAM["text"].append(value_stream["text"])
                    if (idx + 1) % max_raw_chunk_size == 0:
                        flush_chunk(idx)
                # Flush the remainder after the last full chunk. The previous end-of-stream condition compared
                # idx - 1 against the entry limit, which a truncated stream never reaches, so these were lost.
                if data_in_RAM["text"]:
                    flush_chunk(idx)
            return datasets.concatenate_datasets(mapped_sets)
    except OSError as e:
        detailed_OSError(e)


def _huggingface_preprocessing(raw_dataset, tokenizer, cfg_data, num_threads=4):
    """Dataset preprocessing and tokenization.

    This is basically the default HF routine from
    https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_mlm.py

    Steps: tokenize every text without a length limit, concatenate the tokens and cut them into chunks of
    ``tokenizer.model_max_length``, truncate to ``cfg_data.max_seq_in_tokenized_dataset`` sequences, split off
    ``cfg_data.validation_seqs`` as the test split, order the train split according to ``cfg_data.ordering``,
    and finally flatten the indices.

    The ``TOKENIZERS_PARALLELISM`` environment variable is set to "false" for the duration of the call when
    ``num_threads > 0``. It and ``tokenizer.model_max_length`` are restored afterwards, also when an error occurs.

    Returns:
        The tokenized dataset with "train" and "test" splits.

    Raises:
        ValueError: If ``cfg_data.ordering`` is not a known ordering.
    """
    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = getattr(raw_dataset, "column_names", "text")
    text_column_name = "text" if "text" in column_names else column_names[0]

    max_seq_length = tokenizer.model_max_length
    map_setup = dict(
        batched=True,
        batch_size=512,
        num_proc=num_threads if num_threads > 0 else None,
        # load_from_cache_file=False,
        # keep_in_memory=False,
    )
    # The variable may be unset; remember that (as None) so that it can be removed again afterwards.
    previous_parallelism = os.environ.get("TOKENIZERS_PARALLELISM")
    if num_threads > 0:
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
    # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
    # The Collator is modified not to read special_masks anyway:

    def tokenize_function(examples):
        return tokenizer(
            examples[text_column_name],
            return_special_tokens_mask=False,
            return_attention_mask=False,  # handle this manually elsewhere if necessary
            return_token_type_ids=False,
        )

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of
    # max_seq_length.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= max_seq_length:
            total_length = (total_length // max_seq_length) * max_seq_length
        # Split by chunks of max_len.
        result = {k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] for k, t in concatenated_examples.items()}
        return result

    try:
        # Lift the length limit while tokenizing whole documents, then put the real limit back.
        tokenizer.model_max_length = 1e30
        try:
            tokenized_dataset = raw_dataset.map(
                tokenize_function, remove_columns=column_names, desc="Running tokenizer on every text in dataset", **map_setup
            )
        finally:
            tokenizer.model_max_length = max_seq_length

        tokenized_dataset = tokenized_dataset.map(group_texts, desc=f"Grouping texts in chunks of {max_seq_length}", **map_setup)

        # Reduce size to maximal limit:
        if cfg_data.max_seq_in_tokenized_dataset < len(tokenized_dataset):
            tokenized_dataset = tokenized_dataset.select(range(int(cfg_data.max_seq_in_tokenized_dataset)), keep_in_memory=True)

        # Split into train-val
        tokenized_dataset = tokenized_dataset.train_test_split(test_size=cfg_data.validation_seqs, shuffle=False)

        # Shuffle?
        if cfg_data.ordering == "randomized":
            tokenized_dataset["train"] = tokenized_dataset["train"].shuffle(seed=233)
        elif cfg_data.ordering == "unigram-curriculum":
            tokenized_dataset["train"] = _sort_tokenized_dataset_by_unigram(tokenized_dataset["train"], tokenizer, num_threads)
        elif cfg_data.ordering == "word-length-curriculum":
            tokenized_dataset["train"] = _sort_tokenized_dataset_by_word_length(tokenized_dataset["train"], tokenizer, num_threads)
        elif cfg_data.ordering == "sentence-length-curriculum":
            tokenized_dataset["train"] = _sort_tokenized_dataset_by_token(
                tokenized_dataset["train"],
                tokenizer,
                tokenizer.vocab[" ."],
                num_threads,
            )
        elif cfg_data.ordering == "fragment-curriculum":
            tokenized_dataset["train"] = _sort_tokenized_dataset_by_token(
                tokenized_dataset["train"],
                tokenizer,
                tokenizer.vocab["<eot>"],
                num_threads,
            )
        else:
            raise ValueError(f"Invalid dataset ordering {cfg_data.ordering} provided.")

        # Finally flatten
        # This is necessary for the save_to_disk call that comes next. If skipped here, the call will be invoked from save_to_disk
        # This way, atleast it shares the same batch parameters and prints a progress bar.
        return tokenized_dataset.map(desc="Flattening the indices", **map_setup)
    finally:
        # Put the environment back exactly as it was found.
        if previous_parallelism is None:
            os.environ.pop("TOKENIZERS_PARALLELISM", None)
        else:
            os.environ["TOKENIZERS_PARALLELISM"] = previous_parallelism


def _load_fake_dataset(cfg_data, details, path=None):
    """Build a random-token dataset of shape ``(details.size, cfg_data.seq_length)`` plus its tokenizer.

    Token ids are drawn from ``[0, cfg_data.vocab_size)`` with a generator seeded by ``details.randgen_seed``.
    """
    tokenizer = load_tokenizer(cfg_data.tokenizer, cfg_data.seq_length, cfg_data.vocab_size, cache_dir=path)
    tokenizer.model_max_length = cfg_data.seq_length

    rng = torch.Generator().manual_seed(details.randgen_seed)
    shape = (details.size, cfg_data.seq_length)
    return torch.randint(0, cfg_data.vocab_size, shape, generator=rng), tokenizer


def _concatenate_entries(dataset, num_entries_in_group, num_threads):
    """Merge every ``num_entries_in_group`` successive entries of ``dataset`` into one entry.

    Grouping happens independently per column and per batch passed to ``dataset.map``; entries at the end of a
    batch that do not fill a complete group are dropped.

    The ``TOKENIZERS_PARALLELISM`` environment variable is set to "false" for the duration of the call when
    ``num_threads > 0`` and restored afterwards (removed again if it was unset), also when an error occurs.

    Returns:
        Whatever ``dataset.map`` returns for the grouping function.
    """
    # The variable may be unset; remember that (as None) so that it can be removed again afterwards.
    previous_parallelism = os.environ.get("TOKENIZERS_PARALLELISM")
    if num_threads > 0:
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    def group_texts(examples):
        result = dict()
        for key, entries in examples.items():
            reduced_list = []
            state, num_collected = None, 0
            for entry in entries:
                num_collected += 1
                if num_collected == 1:
                    state = entry
                else:
                    state += entry
                if num_collected == num_entries_in_group:
                    reduced_list.append(state)
                    state, num_collected = None, 0

            result[key] = reduced_list

        return result

    map_setup = dict(
        batched=True,
        batch_size=512,
        num_proc=num_threads if num_threads > 0 else None,
        # load_from_cache_file=False,
        # keep_in_memory=True,
    )
    try:
        return dataset.map(group_texts, desc="Concatenating examples", **map_setup)
    finally:
        # Put the environment back exactly as it was found.
        if previous_parallelism is None:
            os.environ.pop("TOKENIZERS_PARALLELISM", None)
        else:
            os.environ["TOKENIZERS_PARALLELISM"] = previous_parallelism


def raw_dataset_preprocessing(raw_dataset, num_threads, cfg_data):
    """Some dataset "improvements". These are optional filtering or normalization rules that are only applied to the pretraining corpus.
    This separates them from generic normalizations that are baked into the tokenizer.

    Two optional steps, both switched by ``cfg_data``:

    * ``remove_trash``: drop texts whose token count is not below ``cfg_data.trash_cutoff`` times their
      character count. The tokenizer is built from the data itself when ``remove_trash == "self"``, otherwise
      ``albert-base-v2`` is used.
    * ``deduplicate_entries``: run ``deduplicate_huggingface_dataset`` with ``cfg_data.deduplication_threshold``.

    The ``TOKENIZERS_PARALLELISM`` environment variable is set to "false" during the call when
    ``num_threads > 0`` and restored afterwards (removed again if it was unset), also when an error occurs.

    Returns:
        The (possibly filtered and deduplicated) dataset.
    """
    column_names = getattr(raw_dataset, "column_names", "text")
    text_column_name = "text" if "text" in column_names else column_names[0]
    map_setup = dict(
        batched=True,
        batch_size=512,
        num_proc=None,  # a bit messy but c4 in RAM can be overbearing otherwise
    )
    # The variable may be unset; remember that (as None) so that it can be removed again afterwards.
    previous_parallelism = os.environ.get("TOKENIZERS_PARALLELISM")

    def restore_parallelism():
        """Put TOKENIZERS_PARALLELISM back to the state found on entry."""
        if previous_parallelism is None:
            os.environ.pop("TOKENIZERS_PARALLELISM", None)
        else:
            os.environ["TOKENIZERS_PARALLELISM"] = previous_parallelism

    if num_threads > 0:
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    try:
        if cfg_data.remove_trash:
            # experimental first test based on Unigram tokenization:
            from transformers import AutoTokenizer

            if cfg_data.remove_trash == "self":
                # Tokenizer construction runs with the caller's original parallelism setting.
                restore_parallelism()
                tokenizer = construct_tokenizer(raw_dataset, cfg_data, path=None)
                if num_threads > 0:
                    os.environ["TOKENIZERS_PARALLELISM"] = "false"
            else:
                tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
            tokenizer.model_max_length = 1e30

            def filtering_rule(examples):
                tokenized = tokenizer(examples[text_column_name])["input_ids"]
                return [len(t) < cfg_data.trash_cutoff * len(e) for t, e in zip(tokenized, examples[text_column_name])]

            log.info(f"Size of dataset before trash removal: {len(raw_dataset)}.")
            raw_dataset = raw_dataset.filter(
                filtering_rule,
                desc="Filter sentences that cannot be tokenized well.",
                **map_setup,
            )
            log.info(f"Size of filtered dataset: {len(raw_dataset)}.")

        if cfg_data.deduplicate_entries:
            log.info(f"Size of dataset before deduplication: {len(raw_dataset)}.")
            raw_dataset = deduplicate_huggingface_dataset(
                raw_dataset, threshold=cfg_data.deduplication_threshold, original_cwd=hydra.utils.get_original_cwd()
            )
            log.info(f"Size of deduplicated dataset: {len(raw_dataset)}.")

        return raw_dataset
    finally:
        restore_parallelism()


@contextlib.contextmanager
def main_process_first():
    """
    Context manager that lets the main process (rank 0) run the enclosed block before the other ranks do.

    When torch.distributed is initialized, every non-main rank waits at a barrier before entering the block, and
    the main rank joins that barrier once it has left the block. Without an initialized process group the block
    simply runs.

    This is a stripped-down version of the the huggingface context manager from commit 2eb7bb15e771f13192968cd4657c78f76b0799fe
    """
    if not torch.distributed.is_initialized():
        yield
        return

    on_main_rank = torch.distributed.get_rank() == 0
    try:
        if not on_main_rank:
            # replicas wait here until the main process is done
            torch.distributed.barrier()
        yield
    finally:
        if on_main_rank:
            # main process is done, release the replicas
            torch.distributed.barrier()


def _load_from_hub(cfg_data, data_path):
    """Load the "train" split of ``cfg_data.hf_location`` and the tokenizer stored alongside it.

    The tokenizer files are downloaded from the dataset repository's ``tokenizer`` subfolder into
    ``data_path`` and then loaded from ``data_path/tokenizer``.

    Returns:
        A ``(tokenized_dataset, tokenizer)`` tuple; the dataset is formatted as torch tensors.
    """
    from huggingface_hub import hf_hub_download

    hub_splits = datasets.load_dataset(cfg_data.hf_location, "train", streaming=cfg_data.streaming, cache_dir=data_path)
    tokenized_dataset = hub_splits["train"].with_format("torch")

    tokenizer_dir = os.path.join(data_path, "tokenizer")
    os.makedirs(tokenizer_dir, exist_ok=True)
    for filename in ("special_tokens_map.json", "tokenizer.json", "tokenizer_config.json"):
        hf_hub_download(
            cfg_data.hf_location,
            filename,
            subfolder="tokenizer",
            repo_type="dataset",
            local_dir=os.path.join(data_path),
        )
    tokenizer = load_tokenizer(tokenizer_dir, seq_length=cfg_data.seq_length, cache_dir=data_path)
    return tokenized_dataset, tokenizer


def prepare_dataloaders(datasets, tokenizer, cfg_train, cfg_impl) -> Dict[str, DataLoader]:
    """Build the "train" and "test" dataloaders from the matching splits of ``datasets``."""
    return {
        "train": prepare_pretraining_dataloader(datasets["train"], tokenizer, cfg_train, cfg_impl),
        "test": prepare_validation_dataloader(datasets["test"], tokenizer, cfg_impl),
    }


def prepare_pretraining_dataloader(dataset, tokenizer, cfg_train, cfg_impl) -> torch.utils.data.DataLoader:
    """Build the (infinitely repeating) training dataloader.

    * ``dataset is None``: data is generated at runtime by a ``RuntimeInfiniteDataLoader``.
    * iterable (streaming) dataset: optionally split by node and shuffled with a buffer; no sampler is used.
    * map-style dataset: a distributed, random or sequential sampler is chosen from the distributed state and
      ``cfg_impl.shuffle_in_dataloader``.

    The batch size is ``cfg_impl.microbatch_size``, capped at the dataset length when the dataset has one.

    Raises:
        ValueError: If ``cfg_train.reverse_dataset_order`` is requested for a streaming dataset.
    """

    num_workers = get_num_workers(cfg_impl)
    collate_fn = FastDataCollatorForLanguageModeling(tokenizer=tokenizer, pad_to_multiple_of=8, mlm=False)

    if dataset is None:
        # generate data at runtime
        return RuntimeInfiniteDataLoader(tokenizer, device)
    elif isinstance(dataset, torch.utils.data.IterableDataset):
        # streaming mode for ready-made datasets, speed not tested
        if torch.distributed.is_initialized():
            dataset = split_dataset_by_node(dataset, rank=int(os.environ["RANK"]), world_size=int(os.environ["WORLD_SIZE"]))

        if cfg_impl.shuffle_in_dataloader:
            dataset = dataset.shuffle(seed=42, buffer_size=256)
        if cfg_train.reverse_dataset_order:
            raise ValueError("Reverse stream not implemented.")
        sampler = None
    else:
        # Normally, we'd just use nice map-style datasets:
        if torch.distributed.is_initialized():
            sampler = torch.utils.data.distributed.DistributedSampler(
                dataset,
                shuffle=cfg_impl.shuffle_in_dataloader,
                drop_last=True,
            )
        else:
            if cfg_impl.shuffle_in_dataloader:
                sampler = torch.utils.data.RandomSampler(dataset)
            else:
                sampler = torch.utils.data.SequentialSampler(dataset)

    if cfg_train.reverse_dataset_order:
        dataset = dataset.select(reversed(range(len(dataset))))

    try:
        batch_size = min(cfg_impl.microbatch_size, len(dataset))
    except TypeError:
        # Streaming datasets have no length; calling len() on them used to abort the streaming branch here.
        batch_size = cfg_impl.microbatch_size

    repeated_dataloader = InfiniteDataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=cfg_impl.pin_memory,
        drop_last=True,
        prefetch_factor=cfg_impl.prefetch_factor if num_workers > 0 else None,
        persistent_workers=cfg_impl.persistent_workers if num_workers > 0 else False,
        collate_fn=collate_fn,
    )
    return repeated_dataloader


def prepare_validation_dataloader(dataset, tokenizer, cfg_impl):
    """Build the validation dataloader.

    With ``dataset is None`` data is generated at runtime by a ``RuntimeInfiniteDataLoader``. Iterable
    (streaming) datasets are read without a sampler, map-style datasets sequentially. The batch size is
    ``cfg_impl.microbatch_size``, capped at the dataset length when the dataset has one; incomplete final
    batches are dropped.
    """

    num_workers = get_num_workers(cfg_impl)
    collate_fn = FastDataCollatorForLanguageModeling(tokenizer=tokenizer, pad_to_multiple_of=8, mlm=False)
    if dataset is None:
        # generate data at runtime
        return RuntimeInfiniteDataLoader(tokenizer, device)
    elif isinstance(dataset, torch.utils.data.IterableDataset):
        sampler = None
    else:
        sampler = torch.utils.data.SequentialSampler(dataset)

    try:
        batch_size = min(cfg_impl.microbatch_size, len(dataset))
    except TypeError:
        # Streaming datasets have no length; calling len() on them used to abort the streaming branch here.
        batch_size = cfg_impl.microbatch_size

    dataloader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=cfg_impl.pin_memory,
        drop_last=True,  # better make it fit elsewhere
        prefetch_factor=cfg_impl.prefetch_factor if num_workers > 0 else None,
        persistent_workers=False,
        collate_fn=collate_fn,
    )
    return dataloader


"""This is a minor modification of huggingface's token masking:"""
"""original source:
https://github.com/huggingface/transformers/blob/130b987880a9b1ade5c76dc1413c12c8924fda50/src/transformers/data/data_collator.py#L748
at commit f00f22a3e290fd377b979124dcf9800b3d73eb11"""


class FastDataCollatorForLanguageModeling(transformers.DataCollatorForLanguageModeling):
    """Collator that only stacks equally-shaped examples into batch tensors; masked language modeling is disabled.

    With ``create_labels_entry=True`` the batch additionally gets a ``labels`` entry: a copy of ``input_ids``
    in which pad tokens (if the tokenizer defines a pad token id) are replaced by -100.
    """

    def __init__(self, *args, create_labels_entry=False, **kwargs):
        super().__init__(*args, **kwargs)
        self.create_labels_entry = create_labels_entry
        self.mlm = False  # always off, regardless of what was passed to the parent

    def torch_call(self, examples):
        """Simplified call assuming all dicts in the list of examples have the same layout and contain tensors.
        Assume further that all these tensors contain vectors of Long Tensors  [AND THEY HAVE TO BE LONG]"""
        # Bare tensors are treated as examples holding only input ids.
        if isinstance(examples[0], torch.Tensor):
            examples = [{"input_ids": tensor} for tensor in examples]

        num_examples = len(examples)
        batch = {}
        for key in examples[0].keys():
            template = torch.as_tensor(examples[0][key])
            shared_out = None
            if torch.utils.data.get_worker_info() is not None:
                # Inside a dataloader worker: stack directly into a shared storage.
                storage = template._typed_storage()._new_shared(num_examples * template.shape[0], device=template.device)
                shared_out = template.new(storage).resize_(num_examples, template.shape[0])

            rows = [torch.as_tensor(example[key]) for example in examples]
            batch[key] = torch.stack(rows, 0, out=shared_out).contiguous()

        if not self.create_labels_entry:
            return batch

        labels = batch["input_ids"].clone()
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            labels[labels == pad_id] = -100
        batch["labels"] = labels
        return batch


class InfiniteDataLoader(torch.utils.data.DataLoader):
    """DataLoader that never raises StopIteration: when the data is exhausted it starts over with a new epoch.

    Lazy copy-paste from https://gist.github.com/MFreidank/821cc87b012c53fade03b0c7aba13958.

    ``epoch_counter`` counts completed passes. On every rollover the sampler's ``set_epoch`` (if it has one) is
    called with the new counter value before the next iterator is created.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize an iterator over the dataset.
        self.dataset_iterator = super().__iter__()
        self.epoch_counter = 0

    def __iter__(self):
        """The loader is its own (endless) iterator."""
        return self

    def __next__(self):
        """Return the next batch, starting a new epoch when the current one is exhausted."""
        try:
            batch = next(self.dataset_iterator)
        except StopIteration:
            # Dataset exhausted. Announce the new epoch to the sampler BEFORE building the fresh iterator:
            # a multi-worker iterator starts fetching indices as soon as it is created, so setting the epoch
            # afterwards would let it reuse the previous epoch's ordering.
            self.epoch_counter += 1
            if hasattr(self.sampler, "set_epoch"):
                self.sampler.set_epoch(self.epoch_counter)
            self.dataset_iterator = super().__iter__()
            batch = next(self.dataset_iterator)
        return batch

    def set_epoch(self, epoch: int):
        """Overwrite the epoch counter; the sampler sees the new value at the next rollover (counter + 1)."""
        self.epoch_counter = epoch

class RuntimeInfiniteDataLoader(torch.utils.data.DataLoader):
    """Endless loader that generates batches of addition problems on the fly instead of reading a dataset.

    Each call to ``next`` draws operand lengths ``n`` and ``m``, creates ``batch_size`` problems of the form
    ``"<a><operation><b>=<result>"`` and returns them tokenized, padded and moved to ``device``.
    The parent DataLoader constructor is not called; extra positional and keyword arguments are ignored.
    """

    def __init__(self, tokenizer, device, *args, **kwargs):
        self.tokenizer = tokenizer
        self.device = device
        self.eos_token_id = self.tokenizer.vocab[self.tokenizer.eos_token]

        ## All need to be moved to cfg
        self.max_n = 20  # largest number of digits for the first operand
        self.max_m = 20  # largest number of digits for the second operand
        self.batch_size = 16
        self.reverse_answer = False
        self.reverse_all = False
        self.operation = '+'

        self.epoch_counter = 0
        self.current_batch = []

    def get_arithmetic(self, n, m):
        """Create ``batch_size`` addition problems with an n-digit and an m-digit operand, as strings."""
        samples = []
        for _ in range(self.batch_size):
            # Draw the first operand before the second so the random stream is consumed in a fixed order.
            first = random.randint(10 ** (n - 1), 10**n - 1)
            second = random.randint(10 ** (m - 1), 10**m - 1)
            lhs, rhs, answer = str(first), str(second), str(first + second)

            if self.reverse_answer:
                answer = answer[::-1]
            if self.reverse_all:
                answer, lhs, rhs = answer[::-1], lhs[::-1], rhs[::-1]

            samples.append(f"{lhs}{self.operation}{rhs}={answer}")
        return samples

    def tokenize_batch(self, batch):
        """Tokenize each string, append the EOS id, and right-pad to the longest entry; returns one tensor."""
        # todo this can be sped up using the HF dataset.map
        encoded = [self.tokenizer(entry)["input_ids"] + [self.eos_token_id] for entry in batch]

        longest = max(map(len, encoded))
        pad_id = self.tokenizer.pad_token_id
        padded = [ids + [pad_id] * (longest - len(ids)) for ids in encoded]
        return torch.tensor(padded, device=self.device)

    def __iter__(self):
        return self

    def __next__(self):
        """Generate one batch; ``max_recur`` is the longer operand length plus 5."""
        n = random.randint(1, self.max_n)
        m = random.randint(1, self.max_m)
        input_ids = self.tokenize_batch(self.get_arithmetic(n, m))
        return {'input_ids': input_ids, 'max_recur': max(n, m) + 5}


================================================
FILE: cramming/data/tokenizer_preparation.py
================================================
"""Tokenizer functionality.

Note: CANNOT name this file "tokenizers.py" ;>
"""

from transformers import AutoTokenizer, PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers, Regex, processors
from cramming.data.arithmetic_tokenizers import CustomCharLevelTokenizerForAddingPadding, CustomCharLevelTokenizerForAddingPaddingWithIndexHints, CustomCharLevelTokenizerSort


def get_tokenizer(tokenizer_type: str):
    """Get an arithmetic tokenizer.

    Args:
        tokenizer_type: One of "pad", "index" or "sort".

    Returns:
        A new instance of the matching character-level tokenizer.

    Raises:
        ValueError: If ``tokenizer_type`` is not one of the known types. (Previously this printed a message and
            called ``exit()``, which ended the process with a success status.)
    """
    if tokenizer_type == "pad":
        tokenizer = CustomCharLevelTokenizerForAddingPadding()
    elif tokenizer_type == "index":
        tokenizer = CustomCharLevelTokenizerForAddingPaddingWithIndexHints()
    elif tokenizer_type == "sort":
        # also has the index hints charset
        tokenizer = CustomCharLevelTokenizerSort()
    else:
        raise ValueError(f"tokenizer not found: {tokenizer_type!r}. Valid types are 'pad', 'index' and 'sort'.")
    return tokenizer


def load_tokenizer(tokenizer_path_or_name, seq_length=512, vocab_size=None, cache_dir=None):
    """Load an existing tokenizer from disk/huggingface; a new tokenizer is never constructed here.

    Falls back to ``_download_tokenizer`` when the first attempt raises FileNotFoundError.

    Raises:
        ValueError: If ``vocab_size`` is given and differs from the loaded tokenizer's vocabulary size.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path_or_name, model_max_length=seq_length)
    except FileNotFoundError:
        tokenizer = _download_tokenizer(tokenizer_path_or_name, seq_length, cache_dir)

    # Nothing to verify, or the sizes agree:
    if vocab_size is None or tokenizer.vocab_size == vocab_size:
        return tokenizer
    raise ValueError(f"Loaded tokenizer with vocab_size {tokenizer.vocab_size} incompatible with given vocab size {vocab_size}.")


def construct_tokenizer(raw_datasets, cfg_data, path, known_tokens=[]):
    """Obtain the tokenizer named by ``cfg_data.tokenizer``.

    Names in the list of trainable types are built from ``raw_datasets``; any other name is treated as a
    huggingface tokenizer and downloaded. The tokenizer's ``name`` attribute is set from the tokenizer type,
    the data name and the vocabulary size.
    """
    trainable_types = (
        "BPE",
        "Unigram",
        "WordLevel",
        "WordPiece",
        "WordPieceBERT",
        "SentencePieceUnigram",
        "SentencePieceBPE",
        "starcoder",
    )
    if cfg_data.tokenizer in trainable_types:
        tokenizer = _construct_tokenizer(raw_datasets, cfg_data, known_tokens)
    else:
        tokenizer = _download_tokenizer(cfg_data.tokenizer, cfg_data.seq_length, cache_dir=path)
    tokenizer.name = f"{cfg_data.tokenizer}-{cfg_data.name}-{cfg_data.vocab_size}.json"
    return tokenizer


def _download_tokenizer(tokenizer_path_or_name, seq_length, cache_dir=None):
    """Fetch a tokenizer via ``AutoTokenizer.from_pretrained`` and set its maximum length to ``seq_length``.

    Raises:
        OSError: If loading fails with an OSError; the message names the requested tokenizer.
    """
    try:
        downloaded = AutoTokenizer.from_pretrained(tokenizer_path_or_name, cache_dir=cache_dir)
        downloaded.model_max_length = seq_length
        return downloaded
    except OSError as error_msg:
        raise OSError(f"Invalid huggingface tokenizer {tokenizer_path_or_name} given: {error_msg}")


def _get_sane_token_args():
    return dict(
        pad_token="<pad>",
        bos_token="<eot>",
        eos_token="<eot>",
        sep_token="<eot>",
        unk_token="<unk>",
    )


def _get_sane_normalizers(force_english_keyboard=False, force_lowercase=False, strip_accents=False, whitespace_escape=False, sanity=False):
    """Assemble the text normalizer: the original XLNet-style rules plus optional extras.

    `sanity` short-circuits to a plain BertNormalizer. `force_english_keyboard` is in effect an ASCII filter.
    `whitespace_escape` is accepted but not used in this function.
    """
    if sanity:
        return normalizers.BertNormalizer(lowercase=force_lowercase)

    steps = [
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        normalizers.NFD() if strip_accents else normalizers.NFKC(),
    ]
    if force_lowercase:
        steps.append(normalizers.Lowercase())
    if strip_accents:
        steps.append(normalizers.StripAccents())
    # Collapse runs of spaces into a single one:
    steps.append(normalizers.Replace(Regex(" {2,}"), " "))
    if force_english_keyboard:
        # range starts at 00 instead of 1F so that tab is kept
        steps.append(normalizers.Replace(Regex(r"[^\x00-\x7F]+"), ""))
    return normalizers.Sequence(steps)


def _construct_tokenizer(raw_datasets, cfg_data, known_tokens=[]):
    """The actual generation instructions for a new tokenizer. Might make this more scriptable in the future...

    Follows closely along with https://huggingface.co/course/chapter6

    Args:
        raw_datasets: Training corpus. Either supports `len()` and slicing (batches are read from the "content"
            column, falling back to "text"), or is a streaming dataset that is iterated entry by entry.
        cfg_data: Data config; this function reads `tokenizer`, `vocab_size`, `normalizer`, `seq_length`,
            `include_eot_token_in_corpus` and, for streaming datasets, `max_entries_in_dataset`.
        known_tokens: Tokens added to the tokenizer before training.

    Returns:
        A `PreTrainedTokenizerFast` wrapping the trained tokenizer.

    Raises:
        ValueError: If `cfg_data.tokenizer` names an unknown strategy.
        RuntimeError: If the trained vocabulary size differs from `cfg_data.vocab_size`.
    """
    try:
        len_dataset = len(raw_datasets)

        def batch_iterator(batch_size=1024):
            for i in range(0, len_dataset, batch_size):
                # `except Exception` rather than a bare except: the try wraps a yield, and a bare except would
                # also intercept GeneratorExit / KeyboardInterrupt raised at the yield point.
                try:
                    yield raw_datasets[i : i + batch_size]["content"]
                except Exception:
                    yield raw_datasets[i : i + batch_size]["text"]

    except TypeError:
        # streaming dataset
        len_dataset = int(cfg_data.max_entries_in_dataset)

        def batch_iterator():
            for entry in iter(raw_datasets):
                try:
                    yield entry["content"]
                except Exception:
                    yield entry["text"]

    special_token_args = _get_sane_token_args()
    # Deduplicated and sorted: iterating a set of strings has a hash-seed-dependent order, which would make
    # the order of the special tokens handed to the trainer differ from run to run.
    special_tokens = sorted(set(special_token_args.values()))
    normalizer_sequence = _get_sane_normalizers(**cfg_data.normalizer)
    # Outline tokenizer rules:
    if cfg_data.tokenizer == "Unigram":  # without the sentencepiece part
        tokenizer = Tokenizer(models.Unigram())
        tokenizer.add_tokens(known_tokens)
        tokenizer.normalizer = normalizer_sequence
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        # tokenizer.decoder = None

        trainer = trainers.UnigramTrainer(
            vocab_size=cfg_data.vocab_size,
            special_tokens=special_tokens,
            unk_token=special_token_args["unk_token"],
        )
    elif cfg_data.tokenizer == "BPE":
        tokenizer = Tokenizer(models.BPE())
        tokenizer.add_tokens(known_tokens)

        tokenizer.normalizer = normalizer_sequence
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
        tokenizer.decoder = decoders.ByteLevel()

        trainer = trainers.BpeTrainer(
            vocab_size=cfg_data.vocab_size,
            min_frequency=2,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
    elif cfg_data.tokenizer == "WordPiece":
        tokenizer = Tokenizer(models.WordPiece(unk_token=special_token_args["unk_token"]))
        tokenizer.add_tokens(known_tokens)

        tokenizer.normalizer = normalizer_sequence
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        trainer = trainers.WordPieceTrainer(vocab_size=cfg_data.vocab_size, special_tokens=special_tokens)
    elif cfg_data.tokenizer == "WordPieceBERT":
        # Sanity check tokenizer
        tokenizer = Tokenizer(models.WordPiece(unk_token="<unk>"))
        tokenizer.add_tokens(known_tokens)
        tokenizer.normalizer = normalizers.BertNormalizer()
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        trainer = trainers.WordPieceTrainer(vocab_size=cfg_data.vocab_size, special_tokens=special_tokens)
    elif cfg_data.tokenizer == "WordLevel":
        tokenizer = Tokenizer(models.WordLevel(unk_token=special_token_args["unk_token"]))
        tokenizer.add_tokens(known_tokens)
        tokenizer.normalizer = normalizer_sequence
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        trainer = trainers.WordLevelTrainer(vocab_size=cfg_data.vocab_size, special_tokens=special_tokens)
    elif cfg_data.tokenizer == "SentencePieceBPE":
        # ref https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py
        tokenizer = Tokenizer(models.BPE())
        tokenizer.add_tokens(known_tokens)

        tokenizer.normalizer = normalizer_sequence
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True), pre_tokenizers.ByteLevel(add_prefix_space=False)],
        )
        tokenizer.decoder = decoders.Sequence([decoders.ByteLevel(), decoders.Metaspace(replacement="▁", add_prefix_space=True)])

        trainer = trainers.BpeTrainer(
            vocab_size=cfg_data.vocab_size,
            min_frequency=2,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
    elif cfg_data.tokenizer == "SentencePieceUnigram":
        tokenizer = Tokenizer(models.Unigram())
        tokenizer.add_tokens(known_tokens)
        tokenizer.normalizer = normalizer_sequence
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)
        tokenizer.decoder = decoders.Metaspace(replacement="▁", add_prefix_space=True)

        trainer = trainers.UnigramTrainer(
            vocab_size=cfg_data.vocab_size,
            special_tokens=special_tokens,
            unk_token=special_token_args["unk_token"],
        )
    else:
        raise ValueError(f"Invalid tokenization strategy {cfg_data.tokenizer} given.")

    # Construct tokenizer
    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len_dataset)

    if tokenizer.get_vocab_size() != cfg_data.vocab_size:
        raise RuntimeError(f"Tokenizer generation failure. Vocab size of trained tokenizer is {tokenizer.get_vocab_size()}.")

    # Postprocess:
    eot_token_id = tokenizer.token_to_id("<eot>")

    # Generate template:
    single_template = "$A"
    if cfg_data.include_eot_token_in_corpus:
        single_template = single_template + " <eot>"
    tokenizer.post_processor = processors.TemplateProcessing(
        single=single_template,
        special_tokens=[("<eot>", eot_token_id)],
    )
    # Wrap into fast codebase
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        model_max_length=cfg_data.seq_length,
        **special_token_args,
    )
    return wrapped_tokenizer


================================================
FILE: cramming/data/utils.py
================================================
"""Various utilities."""
import os
from omegaconf import OmegaConf
import hashlib
import json
import shutil
import subprocess

import logging
import time

import datasets

log = logging.getLogger(__name__)


def checksum_config(cfg):
    """Derive a checksum string for a config: the md5 hex digest of its resolved, key-sorted json dump.

    If the config contains both "tokenizer" and "vocab_size", the digest is prefixed with "<tokenizer>x<vocab_size>_".
    """
    resolved = OmegaConf.to_container(cfg, resolve=True)
    serialized = json.dumps(resolved, sort_keys=True)
    digest = hashlib.md5(serialized.encode("utf-8")).hexdigest()
    if "tokenizer" not in cfg or "vocab_size" not in cfg:
        return digest
    return f"{cfg.tokenizer}x{cfg.vocab_size}_{digest}"


def stage_dataset(data_directory_path, local_staging_dir):
    """Copy a dataset directory into a local staging directory and return the path that should be used.

    Returns the staged path once the copy can be loaded with `datasets.load_from_disk`; falls back to the
    original `data_directory_path` if staging fails or the staged copy stays unloadable after 15 attempts.
    Raises FileNotFoundError if `data_directory_path` is not a directory.
    """
    staged_path = os.path.join(local_staging_dir, os.path.basename(data_directory_path))
    if not os.path.isdir(data_directory_path):
        raise FileNotFoundError(f"Dataset not yet generated or not found at {data_directory_path}.")

    try:
        if os.path.isdir(staged_path):
            log.info(f"Using staged dataset found at {staged_path}...")
        else:
            try:
                shutil.copytree(data_directory_path, staged_path)
                log.info(f"Staging dataset to {staged_path}...")
            except FileExistsError:
                # Another process created the target in the meantime; give it time to finish copying.
                log.info(f"Concurrent writing to {staged_path} detected. Stopping staging in this run and waiting for 300 seconds.")
                time.sleep(300)

        # Poll until the staged copy loads, or give up:
        for _ in range(15):
            free_bytes = shutil.disk_usage(staged_path)[2]
            used_bytes = _get_size(staged_path)
            try:
                datasets.load_from_disk(staged_path)
                log.info(f"Staged dataset size is {used_bytes / 1024**3:,.3f}GB. {free_bytes/ 1024**3:,.3f}GB free in staging dir.")
                return staged_path
            except FileNotFoundError:
                log.info(
                    f"Staged dataset is incomplete. Size is {used_bytes / 1024**3:,.3f}GB. "
                    f" Waiting for 60 more secs for staging race condition."
                )
                time.sleep(60)
        log.info(f"Staging dataset corrupted. Falling back to network drive location {data_directory_path}")
        return data_directory_path

    except Exception as e:  # noqa
        log.info(f"Staging failed with error {e}. Falling back to network drive location {data_directory_path}")
        return data_directory_path


def _get_size(start_path="."):
    """Compute the size of a directory path. Why is this not in the standard library?"""
    """Stolen from https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python"""
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return total_size


def detailed_OSError(e):
    """Re-raise an OSError, enriching "no space left on device" errors with disk usage details.

    Errors with an errno other than 28 are re-raised unchanged. For errno 28 a new OSError (chained to `e`) is
    raised whose message names the affected path and, when `df` can report it, the device, the mount point and
    the remaining space. This function never returns normally.
    """
    if e.errno != 28:  # anything but "no space left on device"
        raise e

    if e.filename:
        # Start from a generic message so that a failing or unparsable `df` call can neither leave the message
        # undefined nor replace the real error with an unrelated one.
        full_error_message = f"Error writing to {e.filename}: {e.strerror}\nDevice may be full."
        try:
            df_output = subprocess.check_output(["df", "-h", e.filename]).decode("utf-8")
            df_lines = df_output.strip().split("\n")[1:]  # drop the header row
            if df_lines:
                # The file system containing the file is full.
                # maxsplit keeps a mount point that contains spaces in one piece.
                device_name, _, _, available, _, mount_point = df_lines[0].split(None, 5)
                error_path = os.path.abspath(e.filename)
                error_message = f"Error writing to {error_path}: {e.strerror}"
                space_message = f"{available} space left on {mount_point}"
                full_error_message = f"{error_message}\nDevice {device_name} is full. {space_message}"
        except (subprocess.CalledProcessError, OSError, ValueError, TypeError):
            pass  # the df lookup is best effort only, keep the generic message
    else:
        # The file name is unknown
        error_message = f"Error: {e.strerror}"
        full_error_message = f"{error_message}\nUnknown file name. Device may be full."
    raise OSError(full_error_message) from e


================================================
FILE: cramming/utils.py
================================================
"""System utilities."""

import socket
import sys

import os
import csv
import yaml
import psutil
import pynvml

import multiprocess  # hf uses this for some reason
import collections

import torch
import torch._inductor.config
import transformers


import json
import random
import numpy as np
import time
import datetime
import tempfile
from .data.utils import checksum_config

import logging
import hydra
from omegaconf import OmegaConf, open_dict
import cramming

log = logging.getLogger(__name__)
os.environ["HYDRA_FULL_ERROR"] = "0"


def main_launcher(cfg, main_fn, job_name=""):
    """Generic launcher: prepare config and system, run `main_fn(cfg, setup)`, then report and store its metrics."""
    started_at = time.time()
    # Fix a concrete random seed if none was configured:
    if cfg.seed is None:
        cfg.seed = torch.randint(0, 2**32 - 1, (1,)).item()

    cfg = pathfinder(cfg)  # resolve all paths
    setup, kWh_counter = system_startup(cfg)  # pick the device, possibly join a distributed setup

    if cfg.wandb.enabled:
        _initialize_wandb(setup, cfg)
    log.info("--------------------------------------------------------------")
    log.info(f"--------------Launching {job_name} run! ---------------------")
    log.info(OmegaConf.to_yaml(cfg, resolve=True))

    job_metrics = main_fn(cfg, setup)
    metrics = collect_system_metrics(cfg, job_metrics, kWh_counter, setup)

    log.info("-------------------------------------------------------------")
    elapsed = datetime.timedelta(seconds=time.time() - started_at)
    log.info(f"Finished running job {cfg.name} with total train time: " f"{str(elapsed)}")

    if is_main_process():
        metrics = flatten(metrics)
        dump_metrics(cfg, metrics)
        # Mirror the flattened metrics into the wandb run summary:
        if cfg.wandb.enabled:
            import wandb

            for metric_name, metric_value in metrics.items():
                wandb.run.summary[metric_name] = metric_value

        if torch.cuda.is_available():
            device = setup["device"]
            gib = float(1024**3)
            max_alloc = f"{torch.cuda.max_memory_allocated(device)/gib:,.3f} GB"
            max_reserved = f"{torch.cuda.max_memory_reserved(device)/gib:,.3f} GB"
            log.info(f"Max. Mem allocated: {max_alloc}. Max. Mem reserved: {max_reserved}.")
            log.info(f"{metrics['kWh']:.2e} kWh of electricity used for GPU(s) during job.")
    log.info("-----------------Shutdown complete.--------------------------")


def get_cpus() -> int:
    """Return the number of CPUs this process should plan with; always a positive int.

    Preferred value is the smaller of the physical core count and the size of the process' CPU affinity set.
    If that cannot be determined, falls back to `os.cpu_count()` and finally to 1.
    """
    try:
        return min(psutil.cpu_count(logical=False), len(psutil.Process().cpu_affinity()))  # covering both affinity and phys.
    except Exception:
        # e.g. when running on mac, or when the physical core count is unknown (None)
        pass
    # os.cpu_count() returns None when the count is undetermined
    return os.cpu_count() or 1


def system_startup(cfg):
    """Decide and print GPU / CPU / hostname info. Generate local distributed setting if running in distr. mode.

    Set all required and interesting environment variables.

    Reads its settings from `cfg.impl` and `cfg.seed`, and writes `cfg.impl.local_rank`.

    Returns:
        A tuple `(setup, kWh_counter)`: `setup` is a dict with the torch `device` and `dtype` to use,
        `kWh_counter` is a dict whose `initial_value` is the GPU energy counter reading at startup in kWh
        (NaN when CUDA is not available).
    """
    torch.backends.cudnn.benchmark = cfg.impl.benchmark
    # The scaled-dot-product-attention backends are only toggled if the config sets them explicitly:
    torch.backends.cuda.enable_flash_sdp(cfg.impl.enable_flash_sdp) if cfg.impl.enable_flash_sdp is not None else 0
    torch.backends.cuda.enable_math_sdp(cfg.impl.enable_math_sdp) if cfg.impl.enable_math_sdp is not None else 0
    torch.backends.cuda.enable_mem_efficient_sdp(cfg.impl.enable_mem_efficient_sdp) if cfg.impl.enable_mem_efficient_sdp is not None else 0
    torch.set_float32_matmul_precision(cfg.impl.matmul_precision)

    if cfg.impl.sharing_strategy is not None:
        torch.multiprocessing.set_sharing_strategy(cfg.impl.sharing_strategy)

    if cfg.impl.tf32_allowed:
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True  # Should be true anyway

    multiprocess.set_start_method("forkserver")
    if cfg.impl.local_staging_dir is not None:
        # Redirect temporary files into the staging directory:
        tmp_path = os.path.join(cfg.impl.local_staging_dir, "tmp")
        os.makedirs(tmp_path, exist_ok=True)
        os.environ["TMPDIR"] = tmp_path
        tempfile.tempdir = None  # Force temporary directory regeneration
    if cfg.impl.enable_huggingface_offline_mode:
        os.environ["HF_DATASETS_OFFLINE"] = "1"
        os.environ["TRANSFORMERS_OFFLINE"] = "1"

    if cfg.impl.add_env_variables is not None:
        # Note that for any environment variables added here, they have to be able to change behavior at runtime
        # for example, the torchdynamo settings are read at import and cannot be changed at runtime here
        for env_var, string_val in cfg.impl.add_env_variables.items():
            os.environ[str(env_var)] = str(string_val)
        log.info(os.environ)

    allowed_cpus_available = get_cpus()
    # Distributed launch?
    if "LOCAL_RANK" in os.environ:
        torch.distributed.init_process_group(backend=cfg.impl.dist_backend)
        local_rank = int(os.environ["LOCAL_RANK"])
        global_rank = int(os.environ["RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        run = os.environ.get("TORCHELASTIC_RUN_ID", "unknown")
        # Split the available CPUs evenly across the visible GPUs, capped by cfg.impl.threads:
        threads_per_gpu = max(1, min(allowed_cpus_available // max(1, torch.cuda.device_count()), cfg.impl.threads))
        log.info(
            f"Distributed worker initialized on rank {global_rank} (local rank {local_rank}) "
            f"with {world_size} total processes. OMP Threads set to {threads_per_gpu}. Run ID is {run}."
        )
        # Only the main process keeps logging at INFO level:
        log.setLevel(logging.INFO if is_main_process() else logging.ERROR)
    else:
        threads_per_gpu = max(1, min(allowed_cpus_available, cfg.impl.threads))
        global_rank = local_rank = 0

    torch.set_num_threads(threads_per_gpu)
    os.environ["OMP_NUM_THREADS"] = str(threads_per_gpu)
    cfg.impl.local_rank = local_rank

    # datasets will automatically disable tokenizer parallelism when needed:
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    os.environ["RAYON_RS_NUM_CPUS"] = str(threads_per_gpu)
    # Half of the total system memory, divided by the number of GPUs (at least 1):
    max_dataset_memory = f"{psutil.virtual_memory().total // 2 // max(torch.cuda.device_count(), 1)}"
    os.environ["HF_DATASETS_IN_MEMORY_MAX_SIZE"] = max_dataset_memory

    # Construct setup dictionary:
    dtype = getattr(torch, cfg.impl.default_precision)  # :> dont mess this up
    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
        log.info(f"GPU : {torch.cuda.get_device_name(device=device)}. CUDA: {torch.version.cuda}.")

        # Populate kWh counter:
        pynvml.nvmlInit()
        miilijoule_start = pynvml.nvmlDeviceGetTotalEnergyConsumption(pynvml.nvmlDeviceGetHandleByIndex(device.index))
        kWh_counter = dict(initial_value=miilijoule_start * 1e-6 / 3600)  # millijoule -> kilojoule -> kWh
    else:
        kWh_counter = dict(initial_value=float("NaN"))
    setup = dict(device=device, dtype=dtype)
    python_version = sys.version.split(" (")[0]

    if local_rank == 0:
        log.info(f"Platform: {sys.platform}, Python: {python_version}, PyTorch: {torch.__version__}")
        log.info(f"CPUs: {allowed_cpus_available}, GPUs: {torch.cuda.device_count()} on {socket.gethostname()}.")

    if cfg.impl.deterministic:
        set_deterministic()
    if cfg.seed is not None:
        if is_main_process():
            log.info(f"Seeding with random seed {cfg.seed} on rank 0.")
        # Each rank gets its own seed, offset from the configured one:
        set_random_seed(cfg.seed + 10 * global_rank)

    return setup, kWh_counter


def is_main_process():
    """Return True unless torch.distributed is initialized and this process has a rank other than 0."""
    if torch.distributed.is_initialized():
        return torch.distributed.get_rank() == 0
    return True


def num_processes():
    """Return the torch.distributed world size, or 1 when torch.distributed is not initialized."""
    if torch.distributed.is_initialized():
        return torch.distributed.get_world_size()
    return 1


# def find_pretrained_checkpoint(cfg, downstream_classes=None):
def find_pretrained_checkpoint(checkpoint: str, local_checkpoint_folder: str = None, arch_modifications=None):
    """Load a checkpoint either locally or from the internet.

    Args:
        checkpoint: Selects the checkpoint:
            * None: `local_checkpoint_folder` itself is the checkpoint.
            * "latest": the most recently modified entry of `local_checkpoint_folder`.
            * "smallest": the entry of `local_checkpoint_folder` whose path ends in the smallest number
              (the last five characters of each path are parsed as a float).
            * "hf://<model>": a huggingface model; a trailing "-untrained" is stripped from the model name.
            * a relative name: looked up inside `local_checkpoint_folder`.
            * an absolute path: used as is.
        local_checkpoint_folder: Folder containing local checkpoints.
        arch_modifications: Optional config merged over the stored architecture config of a local checkpoint.

    Returns:
        A tuple `(tokenizer, cfg_arch, checkpoint_path)`. `tokenizer` is None if none could be loaded from a
        local checkpoint; `checkpoint_path` is None if nothing was selected.
    """
    # tokenizer is only returned for HF models
    tokenizer = None
    cfg_arch = None
    checkpoint_path = None
    if checkpoint is None:
        checkpoint_name = local_checkpoint_folder
    elif checkpoint == "latest":
        # Load the latest local checkpoint
        checkpoint_paths = [os.path.join(local_checkpoint_folder, c) for c in os.listdir(local_checkpoint_folder)]
        # checkpoint_paths = [x for x in checkpoint_paths if x[:6] != "FINAL_"]
        checkpoint_name = max(checkpoint_paths, key=os.path.getmtime)
    elif checkpoint == "smallest":
        # Load maybe the local checkpoint with smallest loss
        checkpoint_paths = [os.path.join(local_checkpoint_folder, c) for c in os.listdir(local_checkpoint_folder)]
        checkpoint_losses = [float(path[-5:]) for path in checkpoint_paths]
        checkpoint_name = checkpoint_paths[np.argmin(checkpoint_losses)]
    elif not os.path.isabs(checkpoint) and not checkpoint.startswith("hf://"):
        # Look locally for a checkpoint with this name
        checkpoint_name = os.path.join(local_checkpoint_folder, checkpoint)
    elif checkpoint.startswith("hf://"):
        # Download this checkpoint directly from huggingface
        model_name = checkpoint.split("hf://")[1].removesuffix("-untrained")
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        cfg_arch = transformers.AutoConfig.from_pretrained(model_name)
        checkpoint_path = checkpoint
        checkpoint_name = None
    else:
        # Look for this name as an absolute path
        checkpoint_name = checkpoint

    if checkpoint_name is not None:
        # Load these checkpoints locally, might not be a huggingface model
        try:
            tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_name)
        except Exception:
            # A missing tokenizer is tolerated. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt / SystemExit working while a checkpoint is being loaded.
            log.warning(f"Could not load tokenizer from checkpoint: {checkpoint_name}")

        with open(os.path.join(checkpoint_name, "model_config.json"), "r") as file:
            cfg_arch = OmegaConf.create(json.load(file))  # Could have done pure hydra here, but wanted interop

        # Optionally modify parts of the arch at eval time. This is not guaranteed to be a good idea ...
        # All mismatched parameters will be randomly initialized ...
        if arch_modifications is not None:
            cfg_arch = OmegaConf.merge(cfg_arch, arch_modifications)
            log.info("Using arch modifications")

        checkpoint_path = checkpoint_name

        log.info(f"Architecture: {cfg_arch}")

    if checkpoint_path is not None:
        log.info(f"Loading from checkpoint {checkpoint_path}...")
    else:
        log.error(f"No checkpoint to be loaded by: {checkpoint}")

    return tokenizer, cfg_arch, checkpoint_path


def save_summary(table_name, cfg, stats, local_time, setup, original_cwd=True):
    """Save two summary tables. A detailed table of iterations/loss+acc and a summary of the end results.

    Args:
        table_name: "pretrain" selects the pretraining summary layout; any other value selects the eval layout.
            The summary is written to the table "<table_name>_reports".
        cfg: Run configuration.
        stats: Mapping from metric name to a list of recorded values.
        local_time: Run time in seconds, stored as a `datetime.timedelta` string.
        setup: Not used in this function.
        original_cwd: If true the summary goes to "<cfg.original_cwd>/tables", otherwise to "tables" relative
            to the current working directory.
    """
    # 1) detailed table:
    # One row per recorded loss value; metrics with fewer entries are padded with None.
    for step in range(len(stats["loss"])):
        iteration = dict()
        for key in stats:
            iteration[key] = stats[key][step] if step < len(stats[key]) else None
        save_to_table(".", f"{cfg.name}_convergence_results", dryrun=cfg.dryrun, **iteration)

    def _maybe_record(key, step=-1):
        # Value of stats[key] at `step` (default: last entry), or "" if there is no such entry.
        # NOTE(review): a key missing from `stats` raises KeyError, which is not caught here, unless `stats`
        # creates missing keys on access (e.g. a defaultdict) - confirm against the callers.
        try:
            return stats[key][step]
        except (IndexError, ValueError):
            return ""

    if "data" in cfg:
        processed_dataset_dir = f"{cfg.data.name}_{checksum_config(cfg.data)}"
    else:
        processed_dataset_dir = None
    # Part of the current working directory that follows the base directory name:
    base_name = cfg.base_dir.rstrip(os.sep).split(os.sep)[-1]
    local_folder = os.getcwd().split(base_name)[1].lstrip(os.sep)

    # 2) save a reduced summary
    # The keyword order below determines the column order of the resulting table.
    if table_name == "pretrain":
        summary = dict(
            name=cfg.name,
            budget=cfg.budget,
            dataset="_".join(processed_dataset_dir.split("_")[:-1]),
            backend=cfg.impl.name,
            arch=" ".join(cfg.arch.architectures),
            loss=_maybe_record("loss"),
            final_step=_maybe_record("step"),
            final_epoch=_maybe_record("epoch"),
            step_time=np.mean(stats["train_time"]) if len(stats["train_time"]) > 0 else "",
            loss100k=_maybe_record("loss", step=100_000 // cfg.impl.print_loss_every_nth_step),
            loss200k=_maybe_record("loss", step=200_000 // cfg.impl.print_loss_every_nth_step),
            loss300k=_maybe_record("loss", step=300_000 // cfg.impl.print_loss_every_nth_step),
            total_time=str(datetime.timedelta(seconds=local_time)).replace(",", ""),
            batch_size=cfg.train.batch_size,
            lr=cfg.train.optim.lr,
            warmup=cfg.train.warmup_steps,
            steps=cfg.train.steps,
            # System settings:
            seed=cfg.seed,
            dataset_hash=processed_dataset_dir.split("_")[-1],
            base_dir=cfg.base_dir,
            impl_path=cfg.impl.path,
            local_folder=local_folder,
            # # Dump configs from here on:
            **{f"Data_{k}": v for k, v in cfg.data.items()},
            **{f"Arch_{k}": v for k, v in cfg.arch.items()},
            **{f"Train_{k}": v for k, v in cfg.train.items()},
        )
    else:
        summary = dict(
            name=cfg.name,
            backend=cfg.impl.name,
            checkpoint=cfg.eval.checkpoint,
            loss=_maybe_record("loss"),
            avg_loss=_maybe_record("avg_loss"),
            final_epoch=_maybe_record("epoch"),
            step_time=np.mean(stats["train_time"]) if len(stats["train_time"]) > 0 else "",
            total_time=str(datetime.timedelta(seconds=local_time)).replace(",", ""),
            batch_size=cfg.eval.batch_size,
            lr=cfg.eval.optim.lr,
            warmup=cfg.eval.warmup_steps,
            # System settings:
            seed=cfg.seed,
            base_dir=cfg.base_dir,
            impl_path=cfg.impl.path,
            local_folder=local_folder,
            # # Dump configs from here on:
            **{f"Eval_{k}": v for k, v in cfg.eval.items()},
        )
    location = os.path.join(cfg.original_cwd, "tables") if original_cwd else "tables"
    save_to_table(location, f"{table_name}_reports", dryrun=cfg.dryrun, **summary)


def save_to_table(out_dir, table_name, dryrun, **kwargs):
    """Append one row to the tab-separated table `<out_dir>/table_<table_name>.csv`.

    The keyword arguments are the row: their names are the columns, in the order given. If the file cannot be
    read or has no header line, it is (re)created with a header first. An existing header is not compared
    against the new columns. With `dryrun` set, nothing is written to the file (the output directory is still
    created).
    """
    # exist_ok avoids a race between concurrent runs that both find the directory missing
    os.makedirs(out_dir, exist_ok=True)
    fname = os.path.join(out_dir, f"table_{table_name}.csv")
    fieldnames = list(kwargs.keys())

    # Probe for an existing header line. Any failure (missing, unreadable or empty file) counts as "no header".
    # csv files are opened with newline="" as the csv module requires, so that the writer's own line endings
    # are not translated a second time on platforms that rewrite "\n".
    try:
        with open(fname, "r", newline="") as f:
            next(csv.reader(f, delimiter="\t"))
        has_header = True
    except Exception:  # noqa
        has_header = False

    if dryrun:
        return

    # Without a usable header the table is started from scratch, otherwise the row is appended.
    with open(fname, "a" if has_header else "w", newline="") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
        if not has_header:
            writer.writeheader()
        writer.writerow(kwargs)


def set_random_seed(seed=233):
    """Seed the torch (CPU and CUDA), numpy and python random number generators.

    Every generator receives a different offset of `seed`.
    """
    torch.manual_seed(seed + 1)
    torch.cuda.manual_seed(seed + 2)
    torch.cuda.manual_seed_all(seed + 3)
    # numpy only accepts seeds in [0, 2**32 - 1]. Seeds close to that limit would make the offset overflow
    # and raise a ValueError, so wrap around instead.
    np.random.seed((seed + 4) % 2**32)
    torch.cuda.manual_seed_all(seed + 5)
    random.seed(seed + 6)
    # Can't be too careful :>


def set_deterministic():
    """Put pytorch into deterministic mode: cudnn flags, deterministic algorithms and the cuBLAS workspace setting."""
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)
    # cudnn: deterministic kernels only, no autotuning
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def avg_n_dicts(dicts):
    """Average a list of dicts key by key; list values are averaged element by element.

    All dicts are expected to share the same schema. Every value contributes `value / len(dicts)`.
    Taken from https://github.com/wronnyhuang/metapoison/blob/master/utils.py.
    """
    averaged = {}
    for record in dicts:
        for name, value in record.items():
            is_sequence = isinstance(value, list)
            if name not in averaged:
                # The first occurrence of a key fixes its layout.
                averaged[name] = [0] * len(value) if is_sequence else 0
            if is_sequence:
                slots = averaged[name]
                for position, item in enumerate(value):
                    slots[position] += item / len(dicts)
            else:
                averaged[name] += value / len(dicts)
    return averaged


def dump_metrics(cfg, metrics):
    """Write the metrics as plain python values to `metrics_<cfg.name>.yaml` in the current directory."""
    filepath = f"metrics_{cfg.name}.yaml"

    def _as_builtin(value):
        # Single values become python scalars, everything else a (nested) list.
        try:
            return np.asarray(value).item()
        except ValueError:
            return np.asarray(value).tolist()

    plain_metrics = {name: _as_builtin(value) for name, value in metrics.items()}
    with open(filepath, "w") as yaml_file:
        yaml.dump(plain_metrics, yaml_file, default_flow_style=False)


def _initialize_wandb(setup, cfg):
    """Start a wandb run for this job on the main process and record GPU name and GPU count in its summary."""
    if not is_main_process():
        return

    import wandb

    config_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
    settings = wandb.Settings(start_method="thread")
    settings.update({"git_root": cfg.original_cwd})
    tags = cfg.wandb.tags if len(cfg.wandb.tags) > 0 else None
    mode = "disabled" if cfg.dryrun else None
    run = wandb.init(
        entity=cfg.wandb.entity,
        project=cfg.wandb.project,
        settings=settings,
        name=cfg.name,
        mode=mode,
        tags=tags,
        config=config_dict,
    )
    gpu_count = torch.cuda.device_count()
    run.summary["GPU"] = torch.cuda.get_device_name(device=setup["device"]) if gpu_count > 0 else ""
    run.summary["numGPUs"] = torch.cuda.device_count()


def wandb_log(stats, cfg):
    """Log the latest entry of every stat to wandb; does nothing if wandb is disabled or off the main process."""
    if not (cfg.wandb.enabled and is_main_process()):
        return

    import wandb

    latest = {name: history[-1] for name, history in stats.items()}
    current_step = stats["step"][-1] if "step" in stats else None
    wandb.log(latest, step=current_step)


def flatten(d, parent_key="", sep="_"):
    """Flatten nested mappings into one dict whose keys are the nested keys joined by `sep`.

    Based on https://stackoverflow.com/a/6027615/3775820.
    """
    flat = {}
    for key, value in d.items():
        full_key = parent_key + sep + key if parent_key else key
        if isinstance(value, collections.abc.MutableMapping):
            flat.update(flatten(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat


def collect_system_metrics(cfg, metrics, kWh_counter, setup):
    """Add hardware and energy information to `metrics` (in place) and return it.

    Adds "GPU" (device name or ""), "numGPUs", "VRAM" (max. allocated, in GiB), "RAM" (resident set size of
    this process, in GiB) and "kWh". With a single visible GPU "kWh" is that device's usage; in a distributed
    CUDA run it is the sum over all processes; in every other case it is NaN.
    """
    # Finalize some compute metrics:
    metrics["GPU"] = torch.cuda.get_device_name(device=setup["device"]) if torch.cuda.device_count() > 0 else ""
    metrics["numGPUs"] = torch.cuda.device_count()
    metrics["VRAM"] = torch.cuda.max_memory_allocated(setup["device"]) / float(1 << 30)
    metrics["RAM"] = psutil.Process(os.getpid()).memory_info().rss / 1024**3
    if torch.cuda.device_count() == 1:
        metrics["kWh"] = get_kWh(kWh_counter, setup)
    elif torch.distributed.is_initialized() and torch.cuda.is_available():
        # Sum the usage of all participating processes.
        kWh_comm = torch.as_tensor(get_kWh(kWh_counter, setup)).cuda()
        torch.distributed.all_reduce(kWh_comm, torch.distributed.ReduceOp.SUM, async_op=False)
        metrics["kWh"] = kWh_comm.item()
    else:
        # No energy counter without CUDA. The former CPU branch read `kWh_comm` before it was assigned
        # and queried NVML without a GPU, so it could only fail.
        metrics["kWh"] = float("NaN")
    return metrics


def get_kWh(kWh_counter, setup):
    """Return the current NVML total-energy reading of `setup["device"]` in kWh minus `kWh_counter["initial_value"]`."""
    handle = pynvml.nvmlDeviceGetHandleByIndex(setup["device"].index)
    millijoule_now = pynvml.nvmlDeviceGetTotalEnergyConsumption(handle)
    kWh_now = millijoule_now * 1e-6 / 3600  # millijoule -> kilojoule -> kWh
    return kWh_now - kWh_counter["initial_value"]


def pathfinder(cfg):
    """Resolve ``cfg.base_dir`` and ``cfg.impl.path`` to absolute paths.

    Also stores hydra's original working directory in ``cfg.original_cwd``.

    A relative ``cfg.base_dir`` is treated as a directory *name*: the current working
    directory and its ancestors are searched upwards for a directory with that name.
    A relative ``cfg.impl.path`` (after ``~`` expansion) is joined onto the resolved
    base directory.

    Returns:
        The same ``cfg`` object, modified in place.

    Raises:
        ValueError: if no ancestor of the working directory is named ``cfg.base_dir``.
    """
    with open_dict(cfg):
        cfg.original_cwd = hydra.utils.get_original_cwd()
        # ugliest way to get the absolute path to output subdir
        if not os.path.isabs(cfg.base_dir):
            base_dir_full_path = os.path.abspath(os.getcwd())
            while os.path.basename(base_dir_full_path) != cfg.base_dir:
                parent = os.path.dirname(base_dir_full_path)
                # Stop at the filesystem root. Comparing against the previous path also ends the
                # search where the root is not "/" (e.g. a Windows drive root), because dirname()
                # then returns its input unchanged and the loop would otherwise never finish.
                if parent == "/" or parent == base_dir_full_path:
                    raise ValueError("Cannot find base directory.")
                base_dir_full_path = parent
            cfg.base_dir = base_dir_full_path

        cfg.impl.path = os.path.expanduser(cfg.impl.path)
        if not os.path.isabs(cfg.impl.path):
            cfg.impl.path = os.path.join(cfg.base_dir, cfg.impl.path)
    return cfg


================================================
FILE: create_data_split.py
================================================
from transformers import PreTrainedTokenizer
import random
import os
import torch
from transformers import AutoTokenizer
from torch.nn.utils.rnn import pad_sequence
from datasets import Dataset, DatasetDict
import pandas as pd
import datasets
import json
import argparse
from cramming.data.tokenizer_preparation import get_tokenizer
import matplotlib.pyplot as plt
from collections import Counter
from matplotlib import cm
import re
from dataset_analysis import main as data_analysis_main
import numpy as np

def generate_no_carry_addition(n, m):
    """Sample an n-digit and an m-digit number whose addition needs no carry.

    Brute force: operand pairs are redrawn until ``has_carry`` rejects none of the columns.

    Returns:
        A tuple ``(num1, num2, num1 + num2)``.
    """
    low1, high1 = 10**(n-1), 10**n - 1
    low2, high2 = 10**(m-1), 10**m - 1

    while True:
        num1 = random.randint(low1, high1)
        num2 = random.randint(low2, high2)
        if not has_carry(num1, num2):
            return num1, num2, num1 + num2

def has_carry(num1, num2):
    """Return True if any digit column of ``num1 + num2`` sums to 10 or more.

    Digits are paired starting from the least significant one. Columns that exist
    in only one of the numbers are not inspected.
    """
    units_first_1 = reversed(str(num1))
    units_first_2 = reversed(str(num2))
    return any(int(a) + int(b) >= 10 for a, b in zip(units_first_1, units_first_2))

# Function to generate the arithmetic dataset
def generate_dataset(dir_name, operation, n, m, num_examples, base_folder_name, keep_places, exact, prepend_zeros, reverse_answer, reverse_all, p=0, no_carry_addition=False, seed=42, interleave=False):
    """Generate an arithmetic dataset (NOT using the bucket method) and write it to a text file.

    Args:
        dir_name: sub-folder of ``base_folder_name`` that receives the file.
        operation: one of ``'+'``, ``'-'``, ``'x'``.
        n, m: maximum number of digits of the first / second operand.
        num_examples: number of lines to generate.
        base_folder_name: parent folder of ``dir_name``.
        keep_places: zero-fill the operands to exactly ``n`` / ``m`` characters.
        exact: draw operands with exactly ``n`` / ``m`` digits. Takes precedence over
            ``no_carry_addition``.
        prepend_zeros: number of zeros put in front of both operands; the answer gets one more.
        reverse_answer: reverse only the answer string.
        reverse_all: reverse both operands and the answer. Takes precedence over ``reverse_answer``.
        p: probability of inserting random padding spaces before a character, ``0 <= p < 1``.
            After each inserted space the probability of another one is multiplied by 0.1.
        no_carry_addition: for ``'+'`` only, draw operand pairs that add without a carry.
        seed: seed for the ``random`` module, so equal arguments give equal datasets.
        interleave: emit the operand digits alternately (the operator is then not written).

    Returns:
        ``(dataset, folder_name, file_path)`` where ``dataset`` is the list of generated lines.

    Raises:
        ValueError: if ``p`` is outside ``[0, 1)`` or ``operation`` is not supported.
    """
    if p < 0 or p >= 1:
        raise ValueError("Probability p must satisfy 0 <= p < 1.")

    # Seed with the requested value. A bare random.seed() reseeds from OS entropy, which
    # made the ``seed`` argument (and the seed in the file name) meaningless.
    random.seed(seed)
    dataset = []

    for _ in range(num_examples):
        if exact: # exactly length n,m
            num1 = random.randint(10**(n-1), 10**n - 1)
            num2 = random.randint(10**(m-1), 10**m - 1)
        elif no_carry_addition and operation == '+':
            num1, num2, _ = generate_no_carry_addition(n,m)
        else:
            num1 = random.randint(0, 10**n - 1)
            num2 = random.randint(0, 10**m - 1)

        if keep_places: # fill with zeros so it is always the same length
            num1_str = str(num1).zfill(n)
            num2_str = str(num2).zfill(m)
        else:
            num1_str = str(num1)
            num2_str = str(num2)

        if operation == '+':
            result = num1 + num2
        elif operation == '-':
            result = num1 - num2
        elif operation == 'x':
            result = num1 * num2
        else:
            raise ValueError("Invalid operation")

        result = str(result)

        if prepend_zeros > 0:
            zeros = "0"*prepend_zeros
            num1_str = zeros + num1_str
            num2_str = zeros + num2_str
            result = "0" + zeros + result

        if reverse_all: # reversals
            result = result[::-1]
            num1_str = num1_str[::-1]
            num2_str = num2_str[::-1]
        elif reverse_answer:
            result = result[::-1]

        if interleave: # interleave the operands so the digits of the same significance are next to each other
            paired_digits = ''.join(a + b for a, b in zip(num1_str, num2_str))
            # whichever operand is longer contributes its unpaired tail
            dataset_entry = paired_digits + num1_str[len(num2_str):] + num2_str[len(num1_str):] + f"={result}"
        else:
            dataset_entry = f"{num1_str}{operation}{num2_str}={result}"

        if p > 0: # adds random spaces, exponentially decaying
            pieces = []
            for char in dataset_entry:
                space_p = p
                while random.random() < space_p:
                    space_p *= 0.1
                    pieces.append(" ")
                pieces.append(char)
            dataset_entry = "".join(pieces)
        dataset.append(dataset_entry)

    # preview of the first few generated lines
    for entry in dataset[:5]:
        print(entry)

    folder_name = f"{base_folder_name}/{dir_name}"
    os.makedirs(folder_name, exist_ok=True)
    # automated file name (note: reverse_all and interleave are not reflected in it)
    file_name = f"{operation}_n_{n}_m_{m}_examples_{num_examples}{'_diff_lens' if not keep_places else ''}{'_exact' if exact else ''}{f'_prepend_{prepend_zeros}zeros' if prepend_zeros>0 else ''}{f'_reverse_ans' if reverse_answer else ''}{f'_prob_space_{p}' if p>0 else ''}_seed_{seed}.txt"
    file_path = os.path.join(folder_name, file_name)

    with open(file_path, 'w') as file:
        file.writelines(entry + '\n' for entry in dataset)
    print(f"created: {file_path}")
    return dataset, folder_name, file_path


def tokenize_and_save_dataset(dataset, tokenizer, directory, test_split_ratio=0.05, pad_sequences=False):
    """Tokenize text lines, split them into train/test and save a HuggingFace ``DatasetDict``.

    Every entry is tokenized and the tokenizer's EOS id is appended. The last
    ``int(len(dataset) * test_split_ratio)`` entries become the test split, the rest the
    train split. No shuffling is done here. Tokenization is slow but gets the job done.

    Args:
        dataset: list of strings to tokenize.
        tokenizer: callable returning a mapping with ``"input_ids"``; must expose ``vocab``,
            ``eos_token``, ``pad_token_id`` and ``decode``.
        directory: output folder; the dataset is written to ``<directory>/hf_tokenized_dataset``.
        test_split_ratio: fraction of entries used for the test split.
        pad_sequences: right-pad every sequence with ``pad_token_id`` to the longest length.

    Returns:
        ``(dataset_dict, hf_dataset_path)``.
    """
    os.makedirs(directory, exist_ok=True)

    # Tokenize the dataset and add EOS token at the end of each entry
    eos_token_id = tokenizer.vocab[tokenizer.eos_token]
    tokenized_dataset = [tokenizer(entry)["input_ids"] + [eos_token_id] for entry in dataset]

    # print a few inputs next to their tokenized and decoded versions
    print("Some examples of tokenized dataset:")
    for text, token_ids in zip(dataset[:5], tokenized_dataset[:5]):
        print(f"Input: {text}")
        print(f"Tokenized: {token_ids}")
        decoded = tokenizer.decode(token_ids)
        print(f"Decoded: {decoded}")
        print()

    # Optionally pad the sequences (nothing to pad, and max() would fail, on an empty dataset)
    if pad_sequences and tokenized_dataset:
        max_length = max(len(entry) for entry in tokenized_dataset)
        pad_token_id = tokenizer.pad_token_id
        tokenized_dataset = [entry + [pad_token_id] * (max_length - len(entry)) for entry in tokenized_dataset]

    save_to_json_intermed = False # debugging switch: dump the tokenized dataset to json instead of hf, then stop
    if save_to_json_intermed:
        print(tokenized_dataset)
        data_path = os.path.join(directory, "dataset.json")
        with open(data_path, "w") as outfile:
            # One JSON object per line
            for entry in tokenized_dataset:
                json.dump({'input_ids': entry}, outfile)
                outfile.write('\n')
        exit()

    # Split the data into train and test sets.
    # Slice at an explicit index: with test_size == 0 the former ``[:-test_size]`` /
    # ``[-test_size:]`` slices became ``[:0]`` / ``[0:]``, i.e. an empty train split and
    # every example in the test split.
    test_size = int(len(tokenized_dataset) * test_split_ratio)
    split_at = len(tokenized_dataset) - test_size
    train_data = tokenized_dataset[:split_at]
    test_data = tokenized_dataset[split_at:]
    # Convert to Hugging Face datasets with 'input_ids' column
    train_dataset = Dataset.from_pandas(pd.DataFrame({"input_ids": train_data}))
    test_dataset = Dataset.from_pandas(pd.DataFrame({"input_ids": test_data}))

    # Create a DatasetDict with train and test splits
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "test": test_dataset
    })

    # Save the dataset to disk
    hf_dataset_path = os.path.join(directory, "hf_tokenized_dataset")
    dataset_dict.save_to_disk(hf_dataset_path)

    print(f"HuggingFace Dataset saved to {hf_dataset_path}")

    return dataset_dict, hf_dataset_path

def character_histogram(dir_name, condense_white_space=False):
    """Plot, per string position, how often each character occurs in a dataset folder.

    Reads every ``.txt`` file in ``./cramming-data/data/arithmetic_data/<dir_name>`` and saves a
    stacked bar chart as ``char_histogram`` (or ``char_histogram_condensed_ws``) in that folder.

    Args:
        dir_name: dataset folder name, relative to the arithmetic data directory.
        condense_white_space: collapse every run of whitespace into a single space first.

    Raises:
        ValueError: if the folder contains no data lines.
    """
    base_directory = "./cramming-data/data/arithmetic_data"
    dir_name = os.path.join(base_directory, dir_name)

    # open all data files and append to big list
    dataset = []
    for filename in os.listdir(dir_name):
        if filename.endswith(".txt"):
            file_path = os.path.join(dir_name, filename)
            with open(file_path, "r") as file:
                stripped_lines = [line.replace("\n", "") for line in file]
            if condense_white_space:
                # Work on the newline-stripped lines. Condensing the raw lines turned every
                # trailing newline into a spurious trailing space.
                stripped_lines = [re.sub(r'\s+', ' ', line) for line in stripped_lines]
            dataset.extend(stripped_lines)

    for entry in dataset[:5]:
        print(entry)

    if not dataset:
        raise ValueError(f"No .txt data found in {dir_name}")

    max_length = max(map(len, dataset))

    # one Counter per character position
    counters_list = [Counter() for _ in range(max_length)]

    for string in dataset:
        for index, char in enumerate(string):
            counters_list[index][char] += 1

    # Plot the occurrences for each index
    fig = plt.figure(figsize=(10, 6))
    indices = list(range(max_length))
    bottom = [0] * max_length
    sorted_chars = sorted(set(''.join(dataset)))

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
    colors = plt.get_cmap('tab20', len(sorted_chars))

    for char, color in zip(sorted_chars, colors.colors):
        occurrences = [counter[char] for counter in counters_list]
        legend_char = char if char != " " else "' '"  # make the space character visible in the legend
        plt.bar(indices, occurrences, label=legend_char, bottom=bottom, color=color)
        bottom = [b + o for b, o in zip(bottom, occurrences)]

    plt.xlabel('Index')
    plt.ylabel('Occurrences')
    plt.title("Character Frequency")
    plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.25), ncol=10)
    plt.savefig(f"{dir_name}/char_histogram{'_condensed_ws' if condense_white_space else ''}", bbox_inches='tight')
    plt.close(fig)  # release the figure so repeated calls do not accumulate open figures

def token_histogram(dir_name, tokenizer_type="normal"):
    """Plot, per sequence position, how often each token id occurs in a tokenized dataset.

    Loads ``hf_tokenized_dataset`` from ``./cramming-data/data/arithmetic_data/<dir_name>``,
    truncates every example of the train and test splits at its first ``[EOS]`` token
    (the EOS token itself is not counted) and saves a stacked bar chart as
    ``token_histogram`` in that folder.

    Args:
        dir_name: dataset folder name, relative to the arithmetic data directory.
        tokenizer_type: passed to ``get_tokenizer`` to map token ids back to tokens.

    Raises:
        ValueError: if both splits are empty.
    """
    base_directory = "./cramming-data/data/arithmetic_data"
    dir_name = os.path.join(base_directory, dir_name)
    hf_dir_name = os.path.join(dir_name, "hf_tokenized_dataset")
    tokenized_dataset = datasets.load_from_disk(hf_dir_name)

    tokenizer = get_tokenizer(tokenizer_type)
    EOS_token = tokenizer._convert_token_to_id("[EOS]")

    dataset = []
    for split in (tokenized_dataset["train"], tokenized_dataset["test"]):
        for example in split:
            tokens = example["input_ids"]
            eos_index = tokens.index(EOS_token) if EOS_token in tokens else len(tokens) # not including the EOS token
            dataset.append(tokens[:eos_index])

    for entry in dataset[:5]:
        print(entry)

    if not dataset:
        raise ValueError(f"No tokenized examples found in {hf_dir_name}")

    max_length = max(map(len, dataset))
    # one Counter per token position, keyed by the token id as a string
    counters_list = [Counter() for _ in range(max_length)]

    for token_ids in dataset:
        for index, token_id in enumerate(token_ids):
            counters_list[index][str(token_id)] += 1

    fig = plt.figure(figsize=(10, 6))
    indices = list(range(max_length))
    bottom = [0] * max_length
    print(tokenizer.vocab.values())
    sorted_chars = [str(x) for x in sorted(tokenizer.vocab.values())]

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
    colors = plt.get_cmap('tab20', len(sorted_chars))

    for char, color in zip(sorted_chars, colors.colors):
        occurrences = [counter[char] for counter in counters_list]
        tokenizer_char = tokenizer._convert_id_to_token(int(char))
        tokenizer_char = tokenizer_char if tokenizer_char != " " else "' '"  # make the space token visible
        legend_char = f"{char} => {tokenizer_char}"
        plt.bar(indices, occurrences, label=legend_char, bottom=bottom, color=color)
        bottom = [b + o for b, o in zip(bottom, occurrences)]

    plt.xlabel('Index')
    plt.ylabel('Occurrences')
    plt.title("Token Frequency")
    legend = plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.35), ncol=6)
    legend.set_title("token => char")

    plt.savefig(f"{dir_name}/token_histogram", bbox_inches='tight')
    plt.close(fig)  # release the figure so repeated calls do not accumulate open figures

def main_dataset_gen(dir_name, op, n, m, num_samples, exact=False, keep_places=False, prepend_zeros=0, reverse_answer=False, reverse_all=False, p=0, no_carry_addition=False, seed=42, interleave=False):
    """Entry point for non-bucket datasets.

    Makes sure ``./cramming-data/data/arithmetic_data`` exists and forwards all options
    to ``generate_dataset``, which writes the file below that directory. Returns nothing.
    """
    data_root = "./cramming-data/data"
    os.makedirs(data_root, exist_ok=True)
    arithmetic_root = f"{data_root}/arithmetic_data"
    os.makedirs(arithmetic_root, exist_ok=True)

    generate_dataset(
        dir_name, op, n, m, num_samples, arithmetic_root,
        keep_places, exact, prepend_zeros, reverse_answer, reverse_all,
        p, no_carry_addition, seed=seed, interleave=interleave,
    )

def tokenize_main(dir_name, tokenizer_type, test_split_ratio=0.05):
    """Tokenize every ``.txt`` file of a dataset folder into one HuggingFace dataset.

    All lines from all text files in ``./cramming-data/data/arithmetic_data/<dir_name>`` are
    pooled and shuffled together, then tokenized with padding and saved in the same folder.
    The saved dataset is reloaded from disk and printed.
    """
    data_dir = os.path.join("./cramming-data/data/arithmetic_data", dir_name)

    # Initialize the tokenizer
    tokenizer = get_tokenizer(tokenizer_type)

    # Pool the lines of every text file; only the newline is removed, other whitespace is kept
    examples = []
    text_files = [name for name in os.listdir(data_dir) if name.endswith(".txt")]
    for name in text_files:
        with open(os.path.join(data_dir, name), "r") as handle:
            examples += [raw.replace("\n", "") for raw in handle.readlines()]
    random.shuffle(examples) # shuffling all the datasets together

    _, saved_path = tokenize_and_save_dataset(
        examples,
        tokenizer,
        data_dir,
        pad_sequences=True,
        test_split_ratio=test_split_ratio,
    )
    print(datasets.load_from_disk(saved_path))


def pick_char_set(max_len):
    """Pick ``max_len`` consecutive index-hint characters, starting at a random offset.

    The characters come from a fixed alphabet of 102 symbols that is treated as a cycle:
    a selection running past the end continues from the beginning. If ``max_len`` exceeds
    the alphabet size, the cycle repeats, so characters are then no longer unique.

    Returns:
        A list of exactly ``max_len`` characters.
    """
    # 102 characters (lowercase 'x' is absent, presumably because it is the multiplication operator)
    set_of_chars = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', '!', '@', '£', '#', '$', '%', '^', '&', '*', '(', ')', '~', '?', '.', ',', '<', '>', '{', '}', '[', ']', ':', ';','/','|','β','Γ', 'Δ', 'δ', 'ε', 'ζ', 'η', 'θ', 'κ','Λ', 'λ', 'μ', 'Ξ', 'ξ','Π', 'π','Σ', 'ς', 'τ', 'Φ', 'φ', 'χ', 'Ψ', 'ψ', 'Ω', 'ω']
    alphabet_size = len(set_of_chars)

    # randrange excludes the upper bound. randint(0, alphabet_size) could return alphabet_size,
    # which wrapped to the same selection as offset 0 and so made that offset twice as likely.
    start = random.randrange(alphabet_size)
    # Modular indexing always yields max_len items. The former slice-based wrap returned a
    # shorter list once max_len outgrew the alphabet, and callers then silently lost digits.
    return [set_of_chars[(start + offset) % alphabet_size] for offset in range(max_len)]

def hints_helper(num_str, chars):
    """Prefix each digit of ``num_str`` with the hint character at the same position.

    Pairing stops at the shorter of the two inputs, e.g. ``("12", "ABC") -> "A1B2"``.
    """
    return "".join(f"{hint}{digit}" for hint, digit in zip(chars, num_str))

def bucket_method_gen(n=3, m=3, operation='+', limit=1000, p=0, no_carry_addition=False, reverse_answer=False, start=1, reverse_all=False, keep_0_for_len_1=False, Flags=None):
    """Bucket method generator, samples all operand lengths equally.

    Cycles over every pair of operand lengths ``(i, j)`` with ``start <= i <= n`` and
    ``start <= j <= m``, drawing one example per pair, until ``limit`` examples exist.

    Args:
        n, m: maximum number of digits of the first / second operand.
        operation: one of ``'+'``, ``'-'``, ``'x'``.
        limit: number of examples to generate; a value ``<= 0`` gives an empty list.
        p: probability of inserting random padding spaces before a character (decays by 0.1 per
            inserted space). As before, padding is only applied when index hints are off.
        no_carry_addition: for ``'+'`` only, use operand pairs that add without a carry.
        reverse_answer: reverse only the answer string.
        reverse_all: reverse both operands and the answer. Takes precedence over
            ``reverse_answer``, matching ``generate_dataset``.
        start: smallest operand length.
        keep_0_for_len_1: allow 0 as a one-digit operand (naturals including 0).
        Flags: optional namespace; a truthy ``Flags.index_hints`` adds index hints. ``None``
            means no index hints.

    Returns:
        A list of ``limit`` strings of the form ``"<num1><op><num2>=<result>"``.

    Raises:
        ValueError: if ``operation`` is unsupported, or if ``start`` exceeds ``n`` or ``m``.
    """
    if limit <= 0:
        # The loop below only stops when len(dataset) == limit, which could never happen.
        return []
    if start > n or start > m:
        # No (i, j) pair exists, so the while-loop would spin forever without producing data.
        raise ValueError("start must not exceed n or m, otherwise no operand lengths can be sampled.")
    # Flags defaults to None; getattr keeps that default usable instead of raising AttributeError.
    use_index_hints = bool(getattr(Flags, "index_hints", False))

    dataset = []
    while True:
        for i in range(start,n+1):
            for j in range(start,m+1):
                start_i = 10**(i-1)
                start_j = 10**(j-1)
                if keep_0_for_len_1 and i==1: # i.e. use naturals including 0
                    start_i = 0
                if keep_0_for_len_1 and j==1:
                    start_j = 0
                num1 = random.randint(start_i, (10**i - 1))
                num2 = random.randint(start_j, 10**j - 1)

                if no_carry_addition and operation == '+':
                    # replaces the draw above; the draw is kept so the random stream is unchanged
                    num1, num2, _ = generate_no_carry_addition(i,j)
                num1_str = str(num1)
                num2_str = str(num2)

                if operation == '+':
                    result = num1 + num2
                elif operation == '-':
                    result = num1 - num2
                elif operation == 'x':
                    result = num1 * num2
                else:
                    raise ValueError("Invalid operation")

                result = str(result)
                # reversals: with both flags set the answer used to be reversed twice,
                # i.e. written un-reversed next to reversed operands
                if reverse_all:
                    result = result[::-1]
                    num1_str = num1_str[::-1]
                    num2_str = num2_str[::-1]
                elif reverse_answer:
                    result = result[::-1]

                if use_index_hints: # adding the index hints
                    max_len = max(len(result), max(len(num1_str),len(num2_str)))
                    chars = pick_char_set(max_len)
                    result = hints_helper(result, chars)
                    num1_str = hints_helper(num1_str, chars)
                    num2_str = hints_helper(num2_str, chars)

                # Build the entry on every path. It used to be built only without index hints,
                # so hinted runs hit a NameError or re-appended a stale entry.
                dataset_entry = f"{num1_str}{operation}{num2_str}={result}"

                if p > 0 and not use_index_hints: # adds random spaces
                    pieces = []
                    for char in dataset_entry:
                        space_p = p
                        while random.random() < space_p:
                            space_p *= 0.1
                            pieces.append(" ")
                        pieces.append(char)
                    dataset_entry = "".join(pieces)

                dataset.append(dataset_entry)
                if len(dataset) == limit:
                    return dataset

def bucket_method_main(n, m, operation, limit, dir_name, p=0, no_carry_addition=False, reverse_answer=False, start=1, reverse_all=False, keep_0_for_len_1=False, Flags=None):
    """Main method for bucket style generation.

    Generates ``limit`` examples with ``bucket_method_gen``, shuffles them and writes them to
    ``./cramming-data/data/arithmetic_data/<dir_name>/<operation>_n_<n>_m_<m>_examples_<limit>.txt``.

    Returns:
        ``(dataset, folder_name, file_path)`` where ``dataset`` is the shuffled list of lines.
    """
    dataset = bucket_method_gen(n, m, operation, limit, p, no_carry_addition, reverse_answer, start, reverse_all=reverse_all, keep_0_for_len_1=keep_0_for_len_1, Flags=Flags)
    # Preview at most ten examples. Indexing dataset[0..9] raised IndexError when limit < 10.
    for entry in dataset[:10]:
        print(entry)

    base_directory = "./cramming-data/data"
    os.makedirs(base_directory, exist_ok=True)
    base_directory = f"{base_directory}/arithmetic_data"
    os.makedirs(base_directory, exist_ok=True)

    folder_name = f"{base_directory}/{dir_name}"
    os.makedirs(folder_name, exist_ok=True)
    file_name = f"{operation}_n_{n}_m_{m}_examples_{limit}.txt"
    file_path = os.path.join(folder_name, file_name)

    # NOTE(review): this reseeds from OS entropy, so the line order in the file is not
    # reproducible even if the caller seeded the generator - confirm this is intended.
    random.seed()
    random.shuffle(dataset)
    with open(file_path, 'w') as file:
        file.writelines(entry + '\n' for entry in dataset)
    print(f"created: {file_path}")
    return dataset, folder_name, file_path


def uniform_distribution_sort_basic(maximum_number_of_digts, maximum_length, limit, FLAGS):
    """Generate ``limit`` sorting examples.

    Each example lists ``maximum_length`` labelled numbers as ``"<label>:<number>"`` joined by
    commas, followed by ``"="`` and the labels in ascending order of their numbers. Every number
    has a random digit count between 1 and ``maximum_number_of_digts``.
    ``FLAGS.reverse_all`` reverses the written digits and ``FLAGS.index_hints`` adds index hints
    to them; the ordering always uses the original numeric value.
    """
    dataset = []
    for _ in range(limit):
        labels = pick_char_set(maximum_length)
        digit_hints = pick_char_set(maximum_number_of_digts)
        labelled_values = []
        written_items = []
        for position in range(maximum_length):
            # choose a random number of digits between 1 and maximum_number_of_digts
            digit_count = random.randint(1, maximum_number_of_digts)
            # pick a number with exactly that many digits
            value = random.randint(10**(digit_count-1), 10**digit_count - 1)
            label = labels[position]
            labelled_values.append([label, value])

            text = str(value)
            if FLAGS.reverse_all:
                text = text[::-1]
            if FLAGS.index_hints:
                text = hints_helper(text, digit_hints)
            written_items.append(f"{label}:{text}")

        question = ",".join(written_items)
        labelled_values.sort(key=lambda pair: pair[1]) # get the answer (stable for equal numbers)
        answer = ",".join(pair[0] for pair in labelled_values)
        dataset.append(question + f"={answer}")

    return dataset

def bucket_uniform_distribution(maximum_number_of_digts, maximum_length, limit, FLAGS):
    """Bucket method for sorting data: an equal share of examples per (list length, digit count).

    ``limit`` is divided evenly (integer division) over every combination of list length in
    ``1..maximum_length`` and digit bound in ``1..maximum_number_of_digts``.
    """
    per_bucket = limit // (maximum_length * maximum_number_of_digts)
    dataset = []
    for list_length in range(1, maximum_length + 1):
        for digit_bound in range(1, maximum_number_of_digts + 1):
            dataset.extend(uniform_distribution_sort_basic(digit_bound, list_length, per_bucket, FLAGS))
    return dataset

def uniform_distribution_sort_main(FLAGS, dir_name):
    """Main method for sorting generation.

    Reads ``FLAGS.n`` (maximum digits per number), ``FLAGS.m`` (maximum list length) and
    ``FLAGS.limit`` (number of examples), generates the bucketed sorting dataset, shuffles it
    and writes it below ``./cramming-data/data/arithmetic_data/<dir_name>``.

    Returns:
        ``(dataset, folder_name, file_path)`` where ``dataset`` is the shuffled list of lines.
    """
    maximum_number_of_digts = FLAGS.n
    maximum_length = FLAGS.m
    limit = FLAGS.limit

    dataset = bucket_uniform_distribution(maximum_number_of_digts, maximum_length, limit, FLAGS)

    # Preview at most ten examples. Indexing dataset[0..9] raised IndexError for small
    # datasets (integer division of the limit can even leave every bucket empty).
    for entry in dataset[:10]:
        print(entry)

    base_directory = "./cramming-data/data"
    os.makedirs(base_directory, exist_ok=True)
    base_directory = f"{base_directory}/arithmetic_data"
    os.makedirs(base_directory, exist_ok=True)

    folder_name = f"{base_directory}/{dir_name}"
    os.makedirs(folder_name, exist_ok=True)
    file_name = f"sort_maximum_number_of_digts_{FLAGS.n}" \
                f"_maximum_length_{FLAGS.m}_examples_{limit}.txt"
    file_path = os.path.join(folder_name, file_name)

    # NOTE(review): this reseeds from OS entropy, so the line order in the file is not
    # reproducible even if the caller seeded the generator - confirm this is intended.
    random.seed()
    random.shuffle(dataset)
    with open(file_path, 'w') as file:
        file.writelines(entry + '\n' for entry in dataset)
    print(f"created: {file_path}")
    return dataset, folder_name, file_path


def main():
    """Command-line entry point for dataset generation and tokenization.

    Modes, chosen by flags:
      * ``--bucket``: bucket-method arithmetic data plus analysis plots, then exit.
      * ``--uniform_distribution_sort_data``: sorting data; ``FLAGS.dir_name`` is then pointed
        at the generated folder so that ``--tokenize`` can follow in the same run.
      * ``--tokenize``: tokenize all text files in ``--dir_name`` (with plots unless the
        tokenizer type is ``"sort"``).
      * none of the above: plain (non-bucket) arithmetic data via ``main_dataset_gen``.
    """
    parser = argparse.ArgumentParser(description="Train a model")
    # General addition
    parser.add_argument("--dir_name", type=str, required=True, help='name of dataset')
    parser.add_argument("--op", type=str, default='+', help="operation e.g. +,-,x")
    parser.add_argument("--n", default=2, type=int, help="num digits in first number")
    parser.add_argument("--m", default=2, type=int, help="num digits in second number")
    parser.add_argument("--num_samples", default=100, type=int, help="number of samples")
    parser.add_argument("--seed", default=42, type=int, help="seed for random generation")
    parser.add_argument('--keep_places', action='store_true') # i.e. default is different length numbers
    parser.add_argument('--exact', action='store_true') # will only take numbers which are exactly length n,m if turned on
    parser.add_argument('--special', action='store_true') # special flag to do any crazy ideas
    parser.add_argument('--p', default=0.0, type=float, help="prob for adding padding")
    parser.add_argument("--prepend_zeros", default=0, type=int, help="prepend this number of zeros to n, m and answer (adds 1 more to answer)")
    parser.add_argument('--reverse_answer', action='store_true', help="reverses the answer")
    parser.add_argument('--reverse_all', action='store_true', help="reverses the inputs and answer")
    parser.add_argument('--no_carry_addition', action='store_true', help="no carried in the addition")
    parser.add_argument('--test_split_ratio', default=0.05, type=float, help="test split percentage")
    parser.add_argument('--interleave', action='store_true', help="interleave digits of the operands")
    parser.add_argument('--keep_0_for_len_1', action='store_true', help='keep 0 as a possible digit for length 1 digits, i.e. Naturals including 0')

    # bucket method to sample all operands equally
    parser.add_argument('--bucket', action='store_true', help='all operand lengths sampled equally')
    parser.add_argument("--limit", default=1000000, type=int, help="number of samples if using the bucket method")
    parser.add_argument('--index_hints', action='store_true', help='use index hints for numbers')

    # tokenize
    parser.add_argument('--tokenize', action='store_true', help='tokenize the all txt files in the dir_name given') # i.e. tokenize the folder
    parser.add_argument("-tt", "--tokenizer_type", type=str, default="pad", help='tokenizer type used')

    # sort
    parser.add_argument('--uniform_distribution_sort_data', action='store_true', help='sort data')
    parser.add_argument("--extra_path", type=str, default=None, help='extra path infront of the autogenerated sort data path')

    FLAGS = parser.parse_args()
    random.seed(FLAGS.seed)
    if FLAGS.no_carry_addition and FLAGS.op != '+':
        print("no carries is only for addition")
        exit()

    if FLAGS.bucket:
        # automated naming scheme for the most common flags
        index_hints = "_with_index_hints_circular" if FLAGS.index_hints else ""
        folder_name = f"{FLAGS.op}_bucket_method_n_{FLAGS.n}_m_{FLAGS.m}_{FLAGS.limit}_p_{str(FLAGS.p).replace('.','')}{'_reverse_ans' if FLAGS.reverse_answer else ''}{'_reverse_all' if FLAGS.reverse_all else ''}{'_keep_0_for_len_1' if FLAGS.keep_0_for_len_1 else ''}{index_hints}"
        print(f"folder name = {folder_name}")
        if FLAGS.no_carry_addition:
            # no-carry datasets are stored under the user supplied name instead
            folder_name = FLAGS.dir_name
        bucket_method_main(FLAGS.n, FLAGS.m, FLAGS.op, FLAGS.limit, folder_name, FLAGS.p, FLAGS.no_carry_addition, FLAGS.reverse_answer,reverse_all=FLAGS.reverse_all,keep_0_for_len_1=FLAGS.keep_0_for_len_1, Flags=FLAGS)
        print("dataset made")
        character_histogram(folder_name)
        print("char histogram made")
        data_analysis_main(folder_name) # more automated analysis
        exit()

    if FLAGS.uniform_distribution_sort_data:
        index_hints = "_with_index_hints_circular" if FLAGS.index_hints else ""

        # sort
        # n - max length of a number
        # m - number of numbers in the list to sort
        folder_name = f"sort_bucket_uniform_distribution_max_digits_n_{FLAGS.n}_max_length_m_{FLAGS.m}_" \
                      f"{FLAGS.limit}_" \
                      f"p_{str(FLAGS.p).replace('.','')}" \
                      f"{'_reverse_all' if FLAGS.reverse_all else ''}" \
                      f"{index_hints}"
        if FLAGS.extra_path is not None:
            folder_name = f"{FLAGS.extra_path}/{folder_name}"
        print(f"folder name = {folder_name}")

        uniform_distribution_sort_main(FLAGS, folder_name)
        FLAGS.dir_name = folder_name

    if FLAGS.tokenize:
        if FLAGS.tokenizer_type != "sort": # do some automated plotting for each dataset
            character_histogram(FLAGS.dir_name)
            print("char histogram made")
        tokenize_main(FLAGS.dir_name, FLAGS.tokenizer_type, test_split_ratio=FLAGS.test_split_ratio)
        print("tokenized")
        if FLAGS.tokenizer_type != "sort": # do some automated plotting for each dataset
            token_histogram(FLAGS.dir_name, FLAGS.tokenizer_type)
            print("token histogram made")
            data_analysis_main(FLAGS.dir_name) # more automated analysis
    elif not FLAGS.uniform_distribution_sort_data:
        # Plain arithmetic generation is the default mode only. Without this guard, a sort run
        # without --tokenize also wrote an arithmetic dataset into the freshly created sort folder.
        main_dataset_gen(FLAGS.dir_name, FLAGS.op, FLAGS.n, FLAGS.m, FLAGS.num_samples, FLAGS.exact, FLAGS.keep_places, FLAGS.prepend_zeros, FLAGS.reverse_answer, FLAGS.reverse_all, FLAGS.p, FLAGS.no_carry_addition, FLAGS.seed, interleave=FLAGS.interleave)

# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()

================================================
FILE: create_pos_or_variants.py
================================================
import numpy as np
import argparse
import random
import os

def one_hot_vector(length, index=None):
    """Return a NumPy vector of ``length`` zeros with a single 1.

    The 1 is placed at ``index``; when ``index`` is None a position is drawn with
    ``np.random.randint(length)``.
    """
    hot_position = np.random.randint(length) if index is None else index
    vector = np.zeros(length)
    vector[hot_position] = 1
    return vector

def zero_vector(length):
    """Return a NumPy vector holding ``length`` zeros."""
    return np.zeros(length)

def main():
    """Generate a "positional OR" dataset of a one-hot vector combined with a zero vector.

    For every pair of operand lengths ``(i, j)`` (all lengths up to ``--n`` / ``--m``, or exactly
    those with ``--exact``) and every position below ``min(i, j)``, one example
    ``"<vec1><op><vec2>=<answer>"`` is written. One operand is all zeros, the other is one-hot at
    that position, and the answer is one-hot at the same position with length ``max(i, j)``.
    The examples are written to a text file below ``./cramming-data/data/arithmetic_data``.
    """
    parser = argparse.ArgumentParser(description="Train a model")
    parser.add_argument("--dir_name", type=str, required=True, help="dir to save to")
    parser.add_argument("--op", type=str, default='+', help="operation")
    parser.add_argument("--n", default=2, type=int, help="num digits in first number")
    parser.add_argument("--m", default=2, type=int, help="num digits in second number")
    parser.add_argument('--p', default=0.0, type=float, help="prob for adding padding")
    # help text corrected: it was a copy of the --m description
    parser.add_argument("--max", default=-1, type=int, help="maximum number of examples to keep, -1 keeps all")
    parser.add_argument('--exact', action='store_true', help='only this size')
    parser.add_argument('--eval', action='store_true', help='save as part of eval dataset')
    FLAGS = parser.parse_args()

    p = FLAGS.p
    dir_name = FLAGS.dir_name
    if FLAGS.exact:
        lengths_n = [FLAGS.n]
        lengths_m = [FLAGS.m]
    else:
        lengths_n = list(range(1,FLAGS.n+1))
        lengths_m = list(range(1,FLAGS.m+1))

    ds = []
    # 2d loop to sample exhaustively
    for i_len in lengths_n:
        for j_len in lengths_m:
            combined_len = max(i_len, j_len)
            for index in range(min(i_len, j_len)):
                if i_len > j_len: # first operand is longer: it is the zero vector, the one hot goes in the shorter one
                    vec1 = zero_vector(i_len)
                    vec2 = one_hot_vector(j_len, index)
                elif i_len < j_len: # second operand is longer: it is the zero vector
                    vec1 = one_hot_vector(i_len, index)
                    vec2 = zero_vector(j_len)
                else: # i.e. same length so either can be the zeros
                    if random.random() > 0.5:
                        vec1 = one_hot_vector(i_len, index)
                        vec2 = zero_vector(j_len)
                    else:
                        vec1 = zero_vector(i_len)
                        vec2 = one_hot_vector(j_len, index)
                ans = one_hot_vector(combined_len, index)

                # the vectors hold floats; write them as strings of 0/1 digits
                vec1_str = "".join(str(int(x)) for x in vec1)
                vec2_str = "".join(str(int(x)) for x in vec2)
                ans_str = "".join(str(int(x)) for x in ans)

                dataset_entry = f"{vec1_str}{FLAGS.op}{vec2_str}={ans_str}"

                if p>0: # add random padding, exponentially decaying
                    pieces = []
                    for char in dataset_entry:
                        space_p = p
                        while random.random() < space_p:
                            space_p *= 0.1
                            pieces.append(" ")
                        pieces.append(char)
                    dataset_entry = "".join(pieces)

                ds.append(dataset_entry)

    if FLAGS.max != -1:
        ds = random.sample(ds, min(len(ds),FLAGS.max)) # cut to maximum size
    if FLAGS.eval:
        data_dir = f"./cramming-data/data/arithmetic_data/pos_or_one_vec_zeros/{dir_name}"
        file_name = f"positional_arithmetic_n_{FLAGS.n}_m_{FLAGS.m}.txt"
    else:
        data_dir = f"./cramming-data/data/arithmetic_data/{dir_name}"
        file_name = f"positional_or_one_vec_zeros_n_{FLAGS.n}_m_{FLAGS.m}_examples_{len(ds)}.txt"
    os.makedirs(data_dir, exist_ok=True)
    file_path = os.path.join(data_dir, file_name)

    with open(file_path, 'w') as file:
        file.writelines(entry + '\n' for entry in ds)
    print(f"created: {file_path}")

# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: dataset_analysis.py
================================================
import os
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import argparse

def read_dataset(dir_name, condense_white_space=False):
    """Load every line of every ``.txt`` file in ``dir_name`` into one list.

    Args:
        dir_name: Directory to scan (not recursive). Only entries whose name
            ends in ``.txt`` are read.
        condense_white_space: When True, every run of whitespace inside a line
            is collapsed to a single space.

    Returns:
        A list of lines with newline characters removed, files visited in
        ``os.listdir`` order.

    The first (up to) five entries are printed as a quick sanity check.
    """
    dataset = []
    for filename in os.listdir(dir_name):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(dir_name, filename)
        with open(file_path, "r") as file:
            stripped_lines = [line.replace("\n", "") for line in file]
        if condense_white_space:
            # Condense the already-stripped lines; working on the raw lines
            # would turn each trailing newline into a trailing space.
            stripped_lines = [re.sub(r"\s+", " ", line) for line in stripped_lines]
        dataset.extend(stripped_lines)

    for entry in dataset[:5]:
        print(entry)
    return dataset

def remove_leading_zeros(match):
    """Return the matched digit string without its leading zeros.

    Intended as a replacement callback for ``re.sub``: the whole match is
    parsed as an integer and formatted back, so ``"007"`` becomes ``"7"`` and
    an all-zero match becomes ``"0"``.
    """
    matched_digits = match.group(0)
    return f"{int(matched_digits)}"

def count_digits(dataset, remove_formatting=False):
    """Build histograms of the digit lengths of the numbers in each sample.

    Spaces are removed from every sample, then the first three digit runs are
    taken as first operand, second operand and answer.

    Args:
        dataset: Iterable of sample strings; each must contain at least three
            digit runs, otherwise an IndexError is raised.
        remove_formatting: When True, leading zeros are stripped from each
            number before its length is measured.

    Returns:
        ``(pairs, input_1, input_2, ans)``: ``pairs`` maps
        ``(len_first, len_second)`` to a count, the other three map a single
        length to a count.
    """
    pairs, input_1, input_2, ans = {}, {}, {}, {}

    def bump(table, key):
        # Increment the tally stored under ``key``, starting from zero.
        table[key] = table.get(key, 0) + 1

    for sample in dataset:
        text = sample.replace(' ', '')
        if remove_formatting:
            text = re.sub(r'\b0+\d+', remove_leading_zeros, text)

        lengths = [len(number) for number in re.findall(r'\d+', text)]
        first_len, second_len, answer_len = lengths[0], lengths[1], lengths[2]

        bump(input_1, first_len)
        bump(input_2, second_len)
        bump(ans, answer_len)
        bump(pairs, (first_len, second_len))

    return pairs, input_1, input_2, ans

def plot_pairs_heatmap(pairs, dir_name=".", remove_formatting=False):
    """Plot a heatmap of how often each pair of operand lengths occurs.

    Args:
        pairs: Mapping ``(len_first, len_second) -> count``, as returned by
            ``count_digits``. Must be non-empty.
        dir_name: Directory the PNG is written into.
        remove_formatting: Only affects the output file name (adds the
            ``_removed_prepended_zeros`` suffix).

    The figure is cleared, not closed, after saving.
    """
    max_length = int(max(max(pair) for pair in pairs.keys()))
    heatmap_matrix = np.zeros((max_length + 1, max_length + 1))

    # Rows are indexed by the first operand's length, columns by the second's.
    for (first_len, second_len), count in pairs.items():
        heatmap_matrix[first_len, second_len] = count

    df = pd.DataFrame(heatmap_matrix)

    # Create a heatmap using seaborn
    plt.figure(figsize=(10, 8))
    sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".4g", cbar_kws={'label': 'Count'}, annot_kws={'size': 8,'rotation':45})
    # seaborn draws DataFrame columns along x and rows along y, so the second
    # operand belongs on the x axis and the first operand on the y axis.
    plt.xlabel('Length of Second Number')
    plt.ylabel('Length of First Number')
    plt.title('Input Pairs Length Heatmap')
    plt.savefig(f"{dir_name}/pairs_heatmap{'_removed_prepended_zeros' if remove_formatting else ''}.png", bbox_inches='tight')
    plt.clf()

def line_plotter(data, name, dir_name=".", remove_formatting=False):
    """Save a line plot of count against number length.

    Args:
        data: Mapping ``length -> count``; plotted in ascending key order.
        name: Used in the plot title and in the output file name.
        dir_name: Directory the PNG is written into.
        remove_formatting: Only affects the output file name (adds the
            ``_removed_prepended_zeros`` suffix).
    """
    ordered = sorted(data.items())
    x_values = [length for length, _ in ordered]
    y_values = [count for _, count in ordered]

    plt.plot(x_values, y_values, marker='o')
    plt.xlabel('Length of number')
    plt.ylabel('Count')
    plt.title(f"Line Plot for {name}")

    suffix = '_removed_prepended_zeros' if remove_formatting else ''
    plt.savefig(f"{dir_name}/{name}_line_plot{suffix}.png", bbox_inches='tight')
    plt.clf()

def consecutive_digit_counts(input_strings):
    """Count runs of a repeated digit within each input string.

    Non-digit characters are skipped without ending the current run, so
    ``"1+1"`` counts as a run of two ``1``s. Runs of length one are not
    recorded.

    Returns:
        Nested dict ``{digit_char: {run_length: occurrences}}``.
    """
    counts_by_digit = {}

    def record(digit, run_length):
        # Tally a finished run; single occurrences are deliberately ignored.
        if digit is None or run_length == 1:
            return
        per_digit = counts_by_digit.setdefault(digit, {})
        per_digit[run_length] = per_digit.get(run_length, 0) + 1

    for text in input_strings:
        run_digit, run_length = None, 0
        for ch in text:
            if not ch.isdigit():
                continue
            if ch == run_digit:
                run_length += 1
                continue
            # A different digit closes the previous run and opens a new one.
            record(run_digit, run_length)
            run_digit, run_length = ch, 1

        # Flush the run that was still open at the end of the string.
        record(run_digit, run_length)

    return counts_by_digit

def create_repetition_heatmap(data, dir_name=".", remove_formatting=False):
    """Save a heatmap of the output of ``consecutive_digit_counts``.

    Args:
        data: Nested mapping ``{digit: {run_length: count}}``. Each outer key
            becomes a row, each run length a column; missing cells are 0.
        dir_name: Directory the PNG is written into.
        remove_formatting: Only affects the output file name (adds the
            ``_removed_prepended_zeros`` suffix).
    """
    rows_by_digit = dict(sorted(data.items()))
    df = pd.DataFrame.from_dict(rows_by_digit, orient='index')
    df = df.fillna(0)

    plt.figure(figsize=(10, 8))
    sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".4g", cbar_kws={'label': 'Count'}, annot_kws={'size': 8,'rotation':45})
    plt.title('Consecutive Digit Counts Heatmap')
    plt.xlabel('Consecutive Count')
    plt.ylabel('Digit')

    suffix = '_removed_prepended_zeros' if remove_formatting else ''
    plt.savefig(f"{dir_name}/repetition_count_heatmap{suffix}.png", bbox_inches='tight')
    plt.clf()

def main(dir_name):
    """Analyse one dataset directory and write the plots into it.

    Args:
        dir_name: Directory name relative to
            ``./cramming-data/data/arithmetic_data``.

    The length statistics are computed twice, with and without leading zeros
    stripped. The repetition statistics are computed once from the raw
    dataset but printed and plotted in both passes, so each pass still writes
    its own heatmap file.
    """
    base_directory = "./cramming-data/data/arithmetic_data"
    dir_name = os.path.join(base_directory, dir_name)
    dataset = read_dataset(dir_name)

    # Independent of remove_formatting, so compute it once outside the loop.
    result_list = consecutive_digit_counts(dataset)

    for remove_formatting in (True, False):
        pairs, input_1, input_2, ans = count_digits(dataset, remove_formatting=remove_formatting)
        print(f"{'removed prepended zeros' if remove_formatting else 'keeping prepended zeros'}")
        print("pairs: ",pairs)
        print("input 1: ",input_1)
        print("input 2: ",input_2)
        print("answers: ",ans)

        plot_pairs_heatmap(pairs, dir_name=dir_name, remove_formatting=remove_formatting)
        line_plotter(input_1, "input_1", dir_name=dir_name, remove_formatting=remove_formatting)
        line_plotter(input_2, "input_2", dir_name=dir_name, remove_formatting=remove_formatting)
        line_plotter(ans, "answer", dir_name=dir_name, remove_formatting=remove_formatting)

        print("repetitions: ",result_list)
        create_repetition_heatmap(result_list, dir_name=dir_name, remove_formatting=remove_formatting)

if __name__ == "__main__":
    # Command-line entry point: --dir_name is required and is handed to main().
    cli = argparse.ArgumentParser(description="Data analysis")
    cli.add_argument("--dir_name", type=str, required=True)
    FLAGS = cli.parse_args()
    main(FLAGS.dir_name)

================================================
FILE: gen_eval_script.py
================================================
# Prints the shell commands needed to evaluate a trained model.
# Edit the settings below, run this script, then run the printed commands.

# input your model name and base_dir
name = "sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_reycle_with_fire_8x1_1_24_run_1"
base_dir = "cramming-data"

# pick which eval you are doing
add_100 = False
add_110_plus = False  # extended addition eval; was spelled `add_110+`, which is not a valid identifier
add_small = False
mul = False
sort = True
bitwise_or = False

# set the model parameters for eval
print("remember to edit max_rec and tokenizer!!")
max_rec = 1
tokenizer = ' data.sources.arithmetic.tokenizer_type="pad"'
if sort:
    tokenizer = ' data.sources.arithmetic.tokenizer_type="sort"'

# Prefix shared by every arithmetic_eval_quicker.py command below.
base_cmd = f"python arithmetic_eval_quicker.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec}"

## print statements for all tasks below
if add_100:
    # Token limit for big_eval_step_1 .. big_eval_step_10, in order.
    token_limits = [55, 60, 70, 85, 90, 100, 100, 110, 110, 110]
    for checkerboard_str in [" checkerboard=odd"," checkerboard=even"]:
        for step, token_limit in enumerate(token_limits, start=1):
            print(f"{base_cmd} token_limit={token_limit} big_eval_step_{step}=True reverse_inputs=True{tokenizer}{checkerboard_str}")

if add_110_plus:
    for step in range(1, 5):
        print(f"{base_cmd} token_limit=105 big_eval_step_{step}=True reverse_inputs=True checkerboard=even extended_eval=True{tokenizer}")

if add_small:
    print(f"{base_cmd} token_limit=30 reverse_inputs=True{tokenizer}")
    print(f"{base_cmd} token_limit=35 ood_only=True reverse_inputs=True{tokenizer}")
    print(f"{base_cmd} token_limit=45 up_to_40=True reverse_inputs=True{tokenizer}")
    print(f"{base_cmd} token_limit=55 up_to_50=True reverse_inputs=True{tokenizer}")

if mul:
    print(f"{base_cmd} token_limit=30 pos_arth=True{tokenizer}")
    print(f"{base_cmd} token_limit=50 pos_arth_ood=True{tokenizer}")
    print(f"{base_cmd} token_limit=30 mul=True{tokenizer}")

if sort:
    for i in range(0,30):
        print(f"python sort_eval.py name={name} base_dir={base_dir} data=arithmetic max_rec={max_rec} sort_reverse=True data.sources.arithmetic.tokenizer_type='sort' max_size_given={i+2} start_ind_1_given={i+1} start_ind_2_given={i+1}")

if bitwise_or:
    # Data is provided to evaluate up to 100x100, as shown in the paper, but the
    # evaluation loop in arithmetic_eval_quicker.py only evaluates up to 40x40.
    # This can easily be edited if required.
    print(f"{base_cmd} token_limit=30 pos_arth=True{tokenizer}")
    print(f"{base_cmd} token_limit=50 pos_arth_ood=True{tokenizer}")
                    

================================================
FILE: load_local_model.py
================================================
"""Example for a script to load a local saved model.

Use as e.g.

python load_local_model.py name=A6000amp_b4096_c5_o3_final base_dir=
> wandb=none impl.push_to_huggingface_hub=True arch=bert-c5 train=bert-o3 train.batch_size=4096
> data=c4-subset-processed dryrun=True +eval=GLUE_sane

"""
import os

import hydra
import time

import logging


import cramming

log = logging.getLogger(__name__)


def main_load_process(cfg, setup):
    """Load a locally saved checkpoint and optionally push it to the hub.

    Args:
        cfg: Run configuration. Reads ``base_dir``, ``name``,
            ``eval.checkpoint``, ``eval.arch_modifications``, ``train``,
            ``impl`` and ``dryrun``.
        setup: Passed through to ``cramming.load_backend``.

    The checkpoint is looked up under ``<base_dir>/<name>/checkpoints``. The
    push only happens on the main process and only when
    ``cfg.impl.push_to_huggingface_hub`` is set.
    """
    local_checkpoint_folder = os.path.join(cfg.base_dir, cfg.name, "checkpoints")
    tokenizer, cfg_arch, model_file = cramming.utils.find_pretrained_checkpoint(
        cfg.eval.checkpoint,
        local_checkpoint_folder,
        cfg.eval.arch_modifications,
    )

    model = cramming.construct_model(cfg_arch, tokenizer.vocab_size, downstream_classes=None)
    model_engine, _, _, _ = cramming.load_backend(model, tokenizer, cfg.train, cfg.impl, setup=setup)
    model_engine.load_checkpoint(cfg_arch, model_file)

    if cramming.utils.is_main_process() and cfg.impl.push_to_huggingface_hub:
        model_engine.push_to_hub(tokenizer, cfg, dryrun=cfg.dryrun)


@hydra.main(config_path="cramming/config", config_name="cfg_pretrain", version_base="1.3")
def launch(cfg):
    """Hydra entry point: hand ``cfg`` and ``main_load_process`` to the cramming launcher."""
    cramming.utils.main_launcher(cfg, main_load_process, job_name="load and push model")


if __name__ == "__main__":
    # Start the Hydra-wrapped entry point only when this file is run as a script.
    launch()


================================================
FILE: pretrain.py
================================================
"""Script for a pretraining run."""

import torch
import hydra

import os
import time
import datetime
import logging
from collections import defaultdict

import cramming

log = logging.getLogger(__name__)


def main_training_process(cfg, setup):
    """This function controls the central training loop.

    Builds the model, engine, tokenizer and dataloaders, restores counters
    from any prior checkpoint metadata, then trains until the time budget, the
    step limit, early termination or a non-finite loss stops the run.
    Intermediate and final checkpoints are written on the main process.
    Afterwards the model is validated on ``dataloaders["test"]``.

    Args:
        cfg: Run configuration (attribute access). Reads, among others,
            ``budget``, ``overall_budget``, ``train``, ``impl``, ``arch``,
            ``name``, ``model_dir``, ``base_dir`` and ``dryrun``.
        setup: Passed through to the backend construction and to ``validate``.

    Returns:
        A dict with ``validation_log_ppl``, ``validation_ppl`` and
        ``num_params``, or an empty dict when the time budget was already
        used up before training started.
    """
    model, model_engine, tokenizer, dataloaders, prior_metadata = cramming.backend.get_model_engine_tokenizer_dataloaders(
        cfg, setup, True)

    # Not used below; the lookup is kept because it fails fast when no data source is configured.
    data_source = list(cfg.data.sources.values())[0]["provider"]
    stats = defaultdict(list)

    # Start the clocks now:
    wallclock_timer = time.time()
    last_save_time = wallclock_timer
    train_time = time.time()  # Crude time measurement for print_loss_every_nth_step
    training_allowed = True
    loss_vals, loss_ppls = [], []

    # Values restored from a previous run (defaults apply to a fresh run).
    loss = prior_metadata.get("loss", 0)
    total_steps = prior_metadata.get("steps", 0)
    epochs = prior_metadata.get("epochs", 0)
    elapsed_time = prior_metadata.get("elapsed_time", 0.0)
    prev_data_idx = prior_metadata.get("data_idx", 0)

    # Launch training
    log.info(f"Training run for {cfg.budget} hours{f'' if cfg.overall_budget < 0 else f' and {cfg.overall_budget} hours overall'}{f'' if elapsed_time <= 0 else f' of which {elapsed_time/3600:.2f} hours was used so far.'}")
    # A negative overall budget means there is no separate overall limit. Fall back to the
    # per-run budget exactly as check_deadline does, instead of treating the run as exhausted.
    overall_budget = cfg.overall_budget if cfg.overall_budget >= 0 else cfg.budget
    run_time = min(cfg.budget, overall_budget - elapsed_time/3600)
    log.info(f"Running for {run_time:.2f} hours")
    if run_time <= 0:
        log.info("Already used budget!")
        return {}

    for data_idx, batch in enumerate(dataloaders["train"], prev_data_idx):
        logged_stats = False

        device_batch = model_engine.to_device(batch)
        model_outputs = {}
        for seq_idx in range(0, max(1, device_batch["input_ids"].shape[1] - cfg.train.stream_depth), cfg.train.stream_depth):
            # Run over seq_dim and dispatch multiple model updates while maintaining state in model_outputs
            # .clone() is required for new nightly so compilation is not stuck recompiling due to StorageOffsets
            input_ids = device_batch["input_ids"][:, seq_idx: seq_idx + cfg.train.stream_depth + 1].clone()  # last token is only a target
            model_outputs = model_engine.forward(input_ids=input_ids, **model_outputs)
            loss = model_outputs["loss"]

            model_engine.backward(loss)
            model_engine.optimizer_step()
            loss_vals.append(loss.detach())
            loss_ppls.append(model_outputs["log_perplexity"].detach())

            if cfg.dryrun:
                break

        # Check stopping criteria
        if check_deadline(wallclock_timer, cfg.budget, elapsed_time, cfg.overall_budget) or data_idx == cfg.train.steps:
            training_allowed = False

            log.info(f"Reached deadline: Used {get_time_elapsed(wallclock_timer)/3600:.2f}/{cfg.budget} hours {'' if cfg.overall_budget < 0 else f' since reset and {get_time_elapsed(wallclock_timer, elapsed_time)/3600:.2f}/{cfg.overall_budget} hours overall'}. "
                     f"Stopping training ...")

        if check_checkpointing(data_idx, cfg.impl, last_save_time):
            if cramming.utils.is_main_process():
                loss_vals, loss_ppls, train_time = collect_stats(
                    data_idx,
                    loss_vals,
                    loss_ppls,
                    model_outputs,
                    train_time,
                    stats,
                    model_engine,
                    dataloaders["train"],
                    cfg,
                )
                logged_stats = True

                # Save intermediate training checkpoint?
                epochs = dataloaders["train"].epoch_counter
                last_save_time = time.time()
                last_save_time_datetime = datetime.datetime.fromtimestamp(last_save_time)
                if cfg.impl.save_intermediate_model_name is None:
                    # if name is given use it (will overwrite), else use time to save
                    checkpoint_name = f"{cfg.arch.model_type}_{last_save_time_datetime.strftime('%Y-%m-%d')}_{loss.item():2.4f}"
                else:
                    checkpoint_name = cfg.impl.save_intermediate_model_name
                # Same fallback as the final save: without a model_dir, store under base_dir.
                save_dir = cfg.base_dir if cfg.model_dir is None else cfg.model_dir
                checkpoint_path = os.path.join(save_dir, cfg.name, "checkpoints")

                metadata = {"epochs": epochs,
                            "loss": loss.item(),
                            "data_idx": data_idx,
                            "steps": model_engine.steps,
                            "elapsed_time": (time.time() - wallclock_timer) + elapsed_time
                            }

                saved_path_temp = model_engine.save_model(checkpoint_path, checkpoint_name, cfg.arch, metadata)
                log.info(
                    f"Saving training checkpoint! Number of epochs/optim steps/data steps trained for: {epochs}/{model_engine.steps}/{data_idx},"
                    f"saving to: {saved_path_temp}")

                if cfg.impl.push_to_huggingface_hub:
                    model_engine.push_to_hub(tokenizer, cfg, dryrun=cfg.dryrun)

        # Collect stats and print to console and upload to wandb
        if data_idx % cfg.impl.print_loss_every_nth_step == 0:
            if not logged_stats:
                loss_vals, loss_ppls, train_time = collect_stats(
                    data_idx,
                    loss_vals,
                    loss_ppls,
                    model_outputs,
                    train_time,
                    stats,
                    model_engine,
                    dataloaders["train"],
                    cfg,
                )

            if check_early_termination(wallclock_timer, stats["loss"][-1], cfg.impl.early_termination, elapsed_time):
                training_allowed = False
                log.info("Loss higher than allowed threshold. Stopping training early...")

        if not loss.detach().isfinite():
            log.info(f"Non-finite loss in block {data_idx} on device {cfg.impl.local_rank}.")
            training_allowed = False

        # Use the reduced flag so that every rank stops as soon as any rank wants to stop;
        # discarding the result would leave the other ranks training.
        training_allowed = flag_communication(training_allowed)

        if (cfg.dryrun and data_idx > (model_engine.accumulation_steps_expected + 1)) or not training_allowed:
            break

    epochs = dataloaders["train"].epoch_counter
    log.info(f"Number of epochs/optim steps/data steps trained for: {epochs}/{model_engine.steps}/{data_idx}")

    if cramming.utils.is_main_process():
        # Save final checkpoint?
        if cfg.impl.save_final_model:
            metadata = {"epochs": epochs,
                        "loss": loss.item(),
                        "data_idx": data_idx,
                        "steps": model_engine.steps,
                        "elapsed_time": time.time() - wallclock_timer + elapsed_time
                        }

            if cfg.model_dir is None:
                save_dir = cfg.base_dir
            else:
                save_dir = cfg.model_dir
            checkpoint_path = os.path.join(save_dir, cfg.name, "checkpoints")
            checkpoint_name = f"FINAL_{loss.item():2.4f}"
            saved_path = model_engine.save_model(checkpoint_path, checkpoint_name, cfg.arch, metadata, None, save_safe=True)

            log.info(f"Saving training checkpoint to: {saved_path}")

            if cfg.impl.push_to_huggingface_hub:
                model_engine.push_to_hub(tokenizer, cfg, dryrun=cfg.dryrun)

        # Print some example completions
        if loss.detach().isfinite():
            generate(model_engine, tokenizer, cfg.impl.example_prompts, token_limit=cfg.impl.example_token_limit)

    # Save to summary:
    if loss.detach().isfinite():
        validation_log_p = validate(model_engine, dataloaders["test"], setup, cfg)
    else:
        validation_log_p = float("Inf")
    log.info(f"Log-Perplexity on validation data is {validation_log_p:2.4f}.")
    metrics = dict(
        validation_log_ppl=validation_log_p,
        validation_ppl=torch.as_tensor(validation_log_p).exp().item(),
        num_params=sum([p.numel() for p in model.parameters()]),
    )

    return metrics


def get_time_elapsed(start_time: float, additional_time: float = 0.0) -> float:
    """Return the seconds elapsed since ``start_time`` plus ``additional_time``."""
    seconds_since_start = time.time() - start_time
    return seconds_since_start + additional_time

def check_checkpointing(data_idx: int, cfg_impl, last_save_time) -> bool:
    """Decide whether an intermediate checkpoint is due at this data step.

    A checkpoint is due when ``cfg_impl.save_intermediate_checkpoints`` is set
    and either the step interval (``save_every_nth_step``) divides
    ``data_idx`` or more than ``save_every_n_minutes`` minutes have passed
    since ``last_save_time``. An interval that is not positive disables the
    corresponding trigger.
    """
    step_due = cfg_impl.save_every_nth_step > 0
    if step_due:
        step_due = data_idx % cfg_impl.save_every_nth_step == 0

    time_due = cfg_impl.save_every_n_minutes > 0
    if time_due:
        minutes_since_save = (time.time() - last_save_time) / 60
        time_due = minutes_since_save > cfg_impl.save_every_n_minutes

    return cfg_impl.save_intermediate_checkpoints and (step_due or time_due)


def check_deadline(launch_time, hour_limit, prev_budget: float = 0.0, overall_hour_limit: float = 0.0):
    """These measurements are deliberately wall-clock based.

    Args:
        launch_time: ``time.time()`` value at which the current run started.
        hour_limit: Budget of the current run, in hours.
        prev_budget: Seconds already consumed before the current run.
        overall_hour_limit: Budget across all runs, in hours. A negative value
            makes ``hour_limit`` the overall limit as well.

    Returns:
        True when the current run or the accumulated time exceeds its limit.
    """
    seconds_this_run = time.time() - launch_time
    if overall_hour_limit >= 0:
        overall_budget = overall_hour_limit
    else:
        overall_budget = hour_limit

    if seconds_this_run / 3600 > hour_limit:
        return True
    return (prev_budget + seconds_this_run) / 3600 > overall_budget


def check_early_termination(start_time, loss, early_termination, prev_budget: float = 0.0):
    """Early termination based on terrible loss.

    Returns True only when early termination is enabled, ``loss`` is above
    ``early_termination.loss_threshold``, and the time spent exceeds either
    ``early_termination.budget`` (current run, hours) or the overall budget
    (``prev_budget`` seconds plus the current run). A non-positive
    ``early_termination.overall_budget`` falls back to ``budget``.
    """
    if not early_termination.enabled or not (loss > early_termination.loss_threshold):
        return False

    seconds_this_run = time.time() - start_time
    if early_termination.overall_budget > 0:
        overall_budget = early_termination.overall_budget
    else:
        overall_budget = early_termination.budget

    if seconds_this_run / 3600 > early_termination.budget:
        return True
    return (prev_budget + seconds_this_run) / 3600 > overall_budget


def collect_stats(data_step, loss_vals, log_ppls, model_outputs, train_time, stats, model_engine, dataloader, cfg):
    """Record training statistics, publish them, and reset the accumulators.

    "data_step" here refers to one step on the dataloader, which may be
    multiple steps on the model_engine.

    Appends the averaged loss and log-perplexity, optional per-loss values,
    the logit entropy, timing, learning rate and batch size to ``stats``,
    then logs to wandb and to the console.

    Returns:
        ``(loss_vals, log_ppls, train_time)``: two fresh empty lists and a
        new timestamp for the caller to carry on with.
    """
    stats["data_step"].append(data_step)
    stats["epoch"].append(dataloader.epoch_counter)
    stats["model_steps"].append(model_engine.steps)

    tokens_per_step = model_engine.record_tokens_per_step()
    stats["tokens"].append(data_step * tokens_per_step)
    stats["loss"].append(torch.stack(loss_vals).mean().item())  # Averaged loss
    stats["log_ppl"].append(torch.stack(log_ppls).mean().item())  # Averaged log-perplexity
    if "losses" in model_outputs:
        for key, acccum_loss in model_outputs["losses"].items():
            if key == "count":
                continue
            stats[key].append(acccum_loss.item())
    if "logits" in model_outputs:
        try:
            precise_logits = model_outputs["logits"].to(dtype=torch.float32)
            probabilities = torch.softmax(precise_logits, dim=-1)
            entropy = torch.distributions.Categorical(probabilities).entropy().mean().item()
            stats["entropy"].append(entropy)
        except ValueError:
            # can happen if invalid values in logits, or softmax numerical issues
            stats["entropy"].append(float("NaN"))

    current_lr = model_engine.optimizer.param_groups[0].get("lr", float("NaN"))
    log_msg = f"Train loss {loss_vals[-1].item():2.4f} at data block {data_step} with lr {current_lr:.5f}. "
    log_msg += f"[Avg: {stats['loss'][-1]:2.4f}] "
    if data_step > 0:
        seconds_per_block = (time.time() - train_time) / cfg.impl.print_loss_every_nth_step
        stats["train_time"].append(seconds_per_block)
        # Only consumed by the disabled log line below.
        estimated_train_finish = str(datetime.timedelta(seconds=seconds_per_block * cfg.train.steps))
        tokens_per_second = tokens_per_step / seconds_per_block
        stats["tok/sec"].append(int(tokens_per_second))
        log_msg += f" Perf: {seconds_per_block:2.4f}s per block ({tokens_per_second:.0f}t/s). "
        # log_msg += f"Est.for all sched. blocks: {estimated_train_finish}."

    # Adaptive optim stats
    stats["lr"].append(current_lr)
    stats["batch_size"].append(model_engine.record_batch_size())
    # NOTE(review): unlike the entries above this one is overwritten, not appended — confirm this is intended.
    stats["seq_length"] = [model_engine.current_seq_length]

    # Publish
    cramming.utils.wandb_log(stats, cfg)
    log.info(log_msg)

    # Hand back cleared accumulators and a fresh timer.
    return [], [], time.time()


@torch.no_grad()
def validate(model_engine, validloader, setup, cfg):
    """Evaluate on validation set.

    Each batch is streamed through the model in chunks of ``eval_depth``
    tokens. The per-batch log-perplexity is the mean over its chunks, and the
    returned value is the mean over the processed batches.

    Args:
        model_engine: Engine providing ``eval``, ``train``, ``to_device``,
            ``forward`` and ``record_tokens_per_step``.
        validloader: Sized iterable of batches containing ``"input_ids"``.
        setup: Unused here; kept for interface compatibility.
        cfg: Reads ``train.stream_depth``, ``train.pretrain_in_train_mode``,
            ``data.seq_length``, ``impl.print_loss_every_nth_step`` and
            ``dryrun``.

    Returns:
        Average log-perplexity as a Python float.
    """
    log.info("Starting model validation.")
    model_engine.eval()
    val_timer = time.time()
    # Cut up smaller streams so the inductor doesn't break, but keep parallelizable archs at full depth:
    eval_depth = 1 if cfg.train.stream_depth < cfg.data.seq_length else cfg.data.seq_length

    log_perplexity = 0
    len_validloader = len(validloader)

    for step, batch in enumerate(validloader):
        device_batch = model_engine.to_device(batch)
        seq_len = max(1, device_batch["input_ids"].shape[1] - eval_depth)
        # Average over the chunks that are actually evaluated, i.e. honour the eval_depth stride.
        chunk_starts = range(0, seq_len, eval_depth)
        num_entries = len(chunk_starts)
        # Stream over sequence
        model_outputs = {}
        for seq_idx in chunk_starts:
            input_ids = device_batch["input_ids"][:, seq_idx : seq_idx + eval_depth + 1].clone()  # last token is used as target
            model_outputs = model_engine.forward(input_ids=input_ids, **model_outputs)
            log_perplexity += model_outputs.get("log_perplexity", model_outputs["loss"].detach()) / num_entries
            if cfg.dryrun:
                break

        if step % cfg.impl.print_loss_every_nth_step == 0:
            log_msg = f"Avg Log-Perplexity: {log_perplexity/(step + 1):2.4f} at step {step} "
            if step > 1:
                validation_time = (time.time() - val_timer) / cfg.impl.print_loss_every_nth_step
                estimated_total_time = str(datetime.timedelta(seconds=validation_time * len(validloader)))
                tokens_per_step = cramming.utils.num_processes() * model_engine.record_tokens_per_step()
                tokens_per_second = tokens_per_step / validation_time
                log_msg += f" Perf: {validation_time:2.4f}s per step ({tokens_per_second:.0f}t/s). "
                log_msg += f"Estimated Total validation Time: {estimated_total_time}."

            val_timer = time.time()
            log.info(log_msg)

        if step > 200000: # putting hard limit of 200,000 steps for validation
            # Batches 0..step have been accumulated, so the divisor is step + 1.
            len_validloader = step + 1
            break

        if cfg.dryrun:
            break

    model_engine.train(cfg.train.pretrain_in_train_mode)
    return log_perplexity.item() / len_validloader


def generate(model_engine, tokenizer, example_prompts, token_limit=10, temp=1.0):
    """Generate and log one completion for each example prompt.

    The engine is switched to eval mode. Every prompt is tokenized, moved to
    the engine's device and completed via ``dynamic_generation``, with
    intermediate tensors printed for debugging. The full token sequence
    (the trailing token of the prompt is not cut off) is used as the input.
    """
    model_engine.eval()
    # Just do a dumb generation for now, can implement efficient generation later
    for prompt in example_prompts:
        prompt_ids = tokenizer(prompt)["input_ids"]
        tokenized_inputs = torch.as_tensor(prompt_ids, dtype=torch.long)[None, :]
        print("tokenised input is ",tokenized_inputs)

        batch_on_device = model_engine.to_device({"input_ids": tokenized_inputs})
        device_inputs = batch_on_device["input_ids"]
        print("device inputs: ", device_inputs)

        # Generate new tokens
        predicted_ids = model_engine.dynamic_generation(device_inputs, temperature=temp, token_limit=token_limit)
        print("predicted ids: ", predicted_ids, " with length ", predicted_ids.shape)

        first_sequence = predicted_ids[0].tolist()  # drop batch dim before decoding
        decoded_completion = tokenizer.decode(first_sequence)

        log.info(f"[{prompt}] {decoded_completion}")


def flag_communication(training_allowed):
    """A quick and dirty communication through NCCL. Should not be a major burden.

    Without an initialized process group the flag is returned unchanged.
    Otherwise the flag is MIN-reduced across all ranks, so the result is True
    only when every rank still allows training.
    """
    if not torch.distributed.is_initialized():
        return training_allowed

    shared_flag = torch.as_tensor(training_allowed).cuda()
    torch.distributed.all_reduce(shared_flag, torch.distributed.ReduceOp.MIN, async_op=False)
    return bool(shared_flag >= 1)


@hydra.main(config_path="cramming/config", config_name="cfg_pretrain", version_base="1.3")
def launch(cfg):
    """Hydra entry point: hand ``cfg`` and ``main_training_process`` to the cramming launcher."""
    cramming.utils.main_launcher(cfg, main_training_process, job_name="pretraining")


if __name__ == "__main__":
    # Start the Hydra-wrapped entry point only when this file is run as a script.
    launch()


================================================
FILE: pretty_plotter.py
================================================
## combine multiple testing plots and make a pretty one 

import os
import numpy as np
import json
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from omegaconf import OmegaConf

def find_file(starting_directory, target_file):
    """Return the path of the first ``target_file`` found under ``starting_directory``.

    The tree is walked with ``os.walk``. None is returned when no directory
    contains a file with that exact name.
    """
    for current_dir, _subdirs, filenames in os.walk(starting_directory):
        if target_file not in filenames:
            continue
        return os.path.join(current_dir, target_file)
    return None

def grid_plotter(data, type="accs", path="", title=None, rect_size=20, up_to_50=False):
    """Plot the 2d accuracy grid (up to 50x50) as an annotated heatmap.

    Args:
        data: 2-D array-like of fractions; multiplied by 100 before plotting.
        type: Label embedded in the output file name.
        path: Prefix prepended to the output file name.
        title: Plot title; a default caption is used when None.
        rect_size: Side length of the red square drawn from the origin.
        up_to_50: Draws the square with a thicker outline and adds the
            ``_50`` suffix to the output file name.

    Tick labels run from 1 to the number of rows of ``data`` on both axes.
    The figure is cleared, not closed, after saving.
    """
    if title is None:
        title = "All numbers are percentages rounded to 1dp"
    data = np.array(data)*100
    df = pd.DataFrame(data)

    plt.figure(figsize=(10, 8))
    # NOTE: annotations use fmt ".0f", i.e. they are shown without decimals.
    sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".0f", annot_kws={'size': 8,'rotation':0})

    # The two variants only differ in how thick the outline is.
    outline_width = 1.5 if up_to_50 else 1
    rect = patches.Rectangle((0, 0), rect_size, rect_size, linewidth=outline_width, edgecolor='red', facecolor='none')
    plt.gca().add_patch(rect)

    # Ticks are centred on the cells and labelled 1..grid_size.
    grid_size = data.shape[0]
    tick_labels = np.arange(1, grid_size+1)
    plt.xticks(tick_labels - 0.5, labels=tick_labels, rotation=90, fontsize=10)
    plt.yticks(tick_labels - 0.5, labels=tick_labels, rotation=0, fontsize=10)

    # Customize the plot
    plt.title(title)
    plt.ylabel("1st Number Length")
    plt.xlabel("2nd Number Length")

    plt.savefig(f"{path}combined_{type}_grid_plot{'_50' if up_to_50 else ''}", bbox_inches='tight', dpi=300)
    plt.clf()

def _load_grid(json_path):
    """Read a JSON-encoded accuracy grid from ``json_path`` into a NumPy array."""
    with open(json_path, 'r') as file:
        return np.array(json.load(file))


def _pad_and_add(smaller, larger):
    """Zero-pad ``smaller`` on the bottom/right up to ``larger``'s shape and return their sum."""
    num_rows_to_add = larger.shape[0] - smaller.shape[0]
    num_cols_to_add = larger.shape[1] - smaller.shape[1]
    padded = np.pad(smaller, ((0, num_rows_to_add), (0, num_cols_to_add)), mode='constant', constant_values=0)
    return padded + larger


def main():
    """Merge the tiered accuracy grids of one model into a single heatmap.

    The title is built from the newest checkpoint's ``model_config.json``. The
    "large" and "ood_only" grids are required; the "up_to_40" and "up_to_50"
    grids are added on top when they exist. Each tier is zero-padded to the next
    tier's shape and summed with it.

    Raises:
        FileNotFoundError: if a required grid file is not found under the
            model's ``downstream`` directory.
    """
    # replace with model name
    model_name = "cramming-data/add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_run_1"

    file_path = f"{model_name}/downstream"
    # get latest checkpoint (by modification time) for the model data
    config_path = f"{model_name}/checkpoints"
    checkpoint_paths = [os.path.join(config_path, c) for c in os.listdir(config_path)]
    checkpoint_name = max(checkpoint_paths, key=os.path.getmtime)
    with open(os.path.join(checkpoint_name, "model_config.json"), "r") as file:
        cfg_arch = OmegaConf.create(json.load(file))
    max_rec = cfg_arch['maximal_recurrence']
    layers_in_block = cfg_arch['layers_in_recurrent_block']
    mask_bf_eq = cfg_arch['mask_before_equals']
    attn_type = cfg_arch['attention']['type']
    loss_reduc = cfg_arch['loss_reduction']
    throttle = cfg_arch['throttle']
    # model_name[14:] drops the leading "cramming-data/" prefix
    title = f"Model name:\n{model_name[14:]}\nNum layers in block: {layers_in_block}, Num blocks in training: {max_rec}\n Mask all before equals: {mask_bf_eq}, Train time: 24 hr\n attn: {attn_type}, temp: Greedy{', loss: 'if loss_reduc == 'none' else ''}{', throttle' if throttle else ''}"

    # works up in tiers starting from the smallest grid (large) up to the largest for this size (up_to_50)
    required_grids = []
    for grid_name in ("accs_grid_quick_large.json", "accs_grid_quick_ood_only.json"):
        grid_path = find_file(file_path, grid_name)
        if grid_path is None:
            # fail with a clear message instead of the TypeError open(None) would give
            raise FileNotFoundError(f"Could not find {grid_name} under {file_path}")
        required_grids.append(_load_grid(grid_path))
    large_data, ood_data = required_grids
    combined = _pad_and_add(large_data, ood_data)

    rect_size = 20
    path_40 = find_file(file_path, "accs_grid_quick_up_to_40.json")
    if path_40 is not None:
        combined = _pad_and_add(combined, _load_grid(path_40))

    path_50 = find_file(file_path, "accs_grid_quick_up_to_50.json")
    up_to_50 = path_50 is not None
    if up_to_50:
        combined = _pad_and_add(combined, _load_grid(path_50))

    grid_plotter(combined, type="accs", path=f"{file_path}/", title=title, rect_size=rect_size, up_to_50=up_to_50)

# Build the combined plot only when this file is executed as a script.
if __name__ == "__main__":
    main()

================================================
FILE: pretty_plotter_big.py
================================================
## combine multiple testing plots and make a pretty one 

import os
import numpy as np
import json
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from omegaconf import OmegaConf
import glob
import re

def grid_plotter(data, type="accs", path="", title=None, rect_size=20):
    """Plot the large 100x100 accuracy grid as an un-annotated heatmap and save it.

    Args:
        data: 2D array-like of accuracies; values are multiplied by 100 before plotting.
        type: Unused; accepted only so the signature matches existing callers. The
            output file name is fixed.
        path: Prefix prepended verbatim to the output file name (include the trailing
            separator when it is a directory).
        title: Plot title; a generic description is used when ``None``.
        rect_size: Side length, in cells, of the red square drawn from the top-left corner.
    """
    if title is None:
        title = "All numbers are percentages rounded to 1dp"
    frame = pd.DataFrame(np.array(data) * 100)
    num_rows, num_cols = frame.shape

    fig = plt.figure(figsize=(10, 8))
    try:
        # the grid is too dense for per-cell annotations, so only colours are drawn
        annotate = False
        sns.heatmap(frame, annot=annotate, cmap="YlGnBu", fmt=".0f", annot_kws={'size': 8, 'rotation': 0})

        rect = patches.Rectangle((0, 0), rect_size, rect_size, linewidth=1.8, edgecolor='red', facecolor='none')
        plt.gca().add_patch(rect)

        # label every second cell at its centre; columns and rows are counted
        # separately so a non-square grid is labelled correctly on both axes
        col_labels = np.arange(1, num_cols + 1, 2)
        row_labels = np.arange(1, num_rows + 1, 2)
        plt.xticks(col_labels - 0.5, labels=col_labels, rotation=90, fontsize=10)
        plt.yticks(row_labels - 0.5, labels=row_labels, rotation=0, fontsize=10)

        # Customize the plot
        plt.title(title)
        plt.ylabel("1st Number Length")
        plt.xlabel("2nd Number Length")

        plt.savefig(f"{path}combined_accs_grid_plot_big_run", bbox_inches='tight', dpi=300)
    finally:
        # pyplot keeps figures alive until they are closed; clf() alone would leak one per call
        plt.close(fig)

def _collect_eval_files(directory_path, parity):
    """Find the big-eval accuracy files of one parity (``"even"`` or ``"odd"``).

    Files named ``accs_grid_quick_big_eval_<index>_<parity>.json`` with a one- or
    two-character index are looked up directly inside ``directory_path``.

    Returns:
        ``(file_paths, numbers)``: the files to load, and the distinct index
        strings extracted from their names, both in discovery order. A file whose
        index was already seen is skipped.
    """
    # NOTE(review): recursive=True has no effect because the patterns contain no "**";
    # only files directly inside directory_path are matched - confirm this is intended.
    matching_files = glob.glob(f"{directory_path}/accs_grid_quick_big_eval_?_{parity}.json", recursive=True)
    matching_files += glob.glob(f"{directory_path}/accs_grid_quick_big_eval_??_{parity}.json", recursive=True)

    number_pattern = re.compile(rf'accs_grid_quick_big_eval_(\d+)_{parity}\.json')

    file_paths = []
    numbers = []
    for file_path in matching_files:
        match = number_pattern.search(file_path)
        if match:
            number = match.group(1)
            if number in numbers:
                continue
            numbers.append(number)
            print("Number:", number)
        # files matched by the glob but not by the regex (non-digit index) are still used
        print("File:", file_path)
        file_paths.append(file_path)
    return file_paths, numbers


def main():
    """Sum all big-eval accuracy grids of one model and plot them as one 100x100 heatmap.

    The title is built from the newest checkpoint's ``model_config.json`` and
    lists which even and odd evaluation indices were found.
    """
    # replace with your model name
    model_name = "cramming-data/add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_with_skip_connections_run_1"
    rect_size = 20

    directory_path = f"{model_name}/downstream"
    # get latest checkpoint (by modification time) for the model data
    config_path = f"{model_name}/checkpoints"
    checkpoint_paths = [os.path.join(config_path, c) for c in os.listdir(config_path)]
    checkpoint_name = max(checkpoint_paths, key=os.path.getmtime)
    with open(os.path.join(checkpoint_name, "model_config.json"), "r") as file:
        cfg_arch = OmegaConf.create(json.load(file))
    max_rec = cfg_arch['maximal_recurrence']
    layers_in_block = cfg_arch['layers_in_recurrent_block']
    mask_bf_eq = cfg_arch['mask_before_equals']
    attn_type = cfg_arch['attention']['type']
    loss_reduc = cfg_arch['loss_reduction']
    throttle = cfg_arch['throttle']
    # model_name[14:] drops the leading "cramming-data/" prefix
    title = f"Model name:\n{model_name[14:]}\nNum layers in block: {layers_in_block}, Num blocks in training: {max_rec}\n Mask all before equals: {mask_bf_eq}, Train time: 24 hr\n attn: {attn_type}, temp: Greedy{', loss: 'if loss_reduc == 'none' else ''}{', throttle' if throttle else ''}"

    # even files are processed (and printed) before odd files
    even_files, even_nums = _collect_eval_files(directory_path, "even")
    odd_files, odd_nums = _collect_eval_files(directory_path, "odd")

    arr = np.zeros((100, 100))
    for file_path in even_files + odd_files:
        with open(file_path, 'r') as file:
            data = json.load(file)
        # presumably some result files store a 3-element list whose first entry
        # is the accuracy grid - verify against the evaluation script
        if len(data) == 3:
            data = data[0]
        arr = arr + np.array(data)

    title = title + "\n Even: " + ', '.join(sorted(even_nums, key=int)) + "\n Odd: " + ', '.join(sorted(odd_nums, key=int))
    # pass an explicit label; the original forwarded the builtin `type` by accident
    grid_plotter(arr, type="accs", path=f"{directory_path}/", title=title, rect_size=rect_size)
    print(f"{model_name}")

# Build the big combined plot only when this file is executed as a script.
if __name__ == "__main__":
    main()

================================================
FILE: pretty_plotter_sort.py
================================================
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

def grid_plotter(data, title="", path=None):
    """Plot an accuracy grid as an annotated heatmap (colour scale 0-100) and save it.

    Args:
        data: 2D array-like of accuracies, plotted as given (no rescaling).
        title: Text embedded in the plot title next to the mean accuracy.
        path: Output file path. It is formatted into a string before saving, so
            the default ``None`` is not a usable destination; callers should
            always pass a real path.
    """
    data = np.array(data)
    frame = pd.DataFrame(data)
    num_rows, num_cols = frame.shape

    # find the average accuracy
    avg = np.mean(data)

    # Create the heatmap
    fig = plt.figure(figsize=(10, 8))
    try:
        sns.heatmap(frame, annot=True, cmap="YlGnBu", fmt=".1f", annot_kws={'size': 8, 'rotation': 0}, vmin=0, vmax=100)

        # Customize the plot
        plt.title(f"Accuracy - percentage, rounded to 1dp : {title}, Avg acc: {avg}")
        plt.ylabel("Maximum n-digit number (1-n)")
        plt.xlabel("Length of array to sort")
        # one tick per cell centre, labelled 1..n on each axis
        plt.xticks(np.arange(0.5, num_cols + 0.5, 1), labels=np.arange(1, num_cols + 1, 1))
        plt.yticks(np.arange(0.5, num_rows + 0.5, 1), labels=np.arange(1, num_rows + 1, 1))

        plt.savefig(f"{path}", bbox_inches='tight')
    finally:
        # this function is called in a loop; close the figure so open figures do not pile up
        plt.close(fig)


def _read_accuracy_files(results_dir):
    """Parse the accuracy text files in ``results_dir``.

    Every file whose name contains ``.txt`` is expected to end in
    ``..._<size_1>_<size_2>.txt`` and to contain a single float.

    Returns:
        ``(acc, top_1_acc, max_size)``: two dicts keyed by ``(size_1, size_2)``
        (files with ``top_1_acc`` in their name go to the second dict) and the
        largest size seen in any file name (0 when there are no files).
    """
    acc_by_size = {}
    top_1_acc_by_size = {}
    max_size = 0
    for file_name in os.listdir(results_dir):
        if ".txt" not in file_name:
            continue
        name_parts = file_name.split(".")[0].split("_")
        data_size_1 = int(name_parts[-2])
        data_size_2 = int(name_parts[-1])
        max_size = max(max_size, data_size_1, data_size_2)

        # get the accuracy
        with open(os.path.join(results_dir, file_name), "r") as f:
            acc = float(f.read())
        if "top_1_acc" in file_name:
            top_1_acc_by_size[(data_size_1, data_size_2)] = acc
        else:
            acc_by_size[(data_size_1, data_size_2)] = acc
    return acc_by_size, top_1_acc_by_size, max_size


def _accuracy_grid(acc_by_size, max_size):
    """Place accuracies keyed by 1-based ``(size_1, size_2)`` into a zero-filled square grid."""
    grid = np.zeros((max_size, max_size))
    for (size_1, size_2), acc in acc_by_size.items():
        grid[size_1 - 1][size_2 - 1] = acc
    return grid


def run(names, short_hand, base_dir, sort_plots_path):
    """Plot accuracy and top-1 accuracy grids for every recurrence setting of each run.

    For each run, every sub-directory of ``<base_dir>/<name>/downstream`` whose
    name contains ``all_outputs`` and whose last ``_``-separated token contains
    ``recurrence`` is processed: two heatmaps are written to ``sort_plots_path``
    and then concatenated side by side. The concatenated images are written to
    ``<sort_plots_path>/final`` (directly when a run has exactly one image,
    otherwise into a per-run sub-directory).

    Args:
        names: Run directory names below ``base_dir``.
        short_hand: Per-run ``(label, version)`` pairs, in the same order as ``names``.
        base_dir: Directory containing the run directories.
        sort_plots_path: Output directory for the plots; created if missing.
    """
    os.makedirs(sort_plots_path, exist_ok=True)
    # all output paths are joined onto sort_plots_path itself so that relative and
    # absolute locations both end up in the directory created above
    final_dir = os.path.join(sort_plots_path, "final")

    for i, name in enumerate(names):
        extra_name = short_hand[i][0] + "_" + short_hand[i][1]
        all_data_path = os.path.join(base_dir, name, "downstream")

        all_images = []
        for dir_name in os.listdir(all_data_path):
            dir_path = os.path.join(all_data_path, dir_name)
            # only directories that start with/contain all_outputs hold results
            if not os.path.isdir(dir_path) or "all_outputs" not in dir_name:
                continue
            # get the recurrence
            recurrence = dir_name.split("_")[-1]
            if "recurrence" not in recurrence:
                continue

            print(extra_name)
            print("dir", dir_name)

            acc_by_size, top_1_acc_by_size, max_size = _read_accuracy_files(dir_path)

            # create the grid plots
            acc_plot_path = os.path.join(sort_plots_path, f"{extra_name}_{recurrence}_acc.png")
            top_1_plot_path = os.path.join(sort_plots_path, f"{extra_name}_{recurrence}_top_1_acc.png")
            grid_plotter(_accuracy_grid(acc_by_size, max_size),
                         title=f"{extra_name} {recurrence} acc",
                         path=acc_plot_path)
            grid_plotter(_accuracy_grid(top_1_acc_by_size, max_size),
                         title=f"{extra_name} {recurrence} top_1_acc",
                         path=top_1_plot_path)

            # re-read both plots and put them side by side
            side_by_side = cv2.hconcat([cv2.imread(acc_plot_path), cv2.imread(top_1_plot_path)])
            all_images.append((side_by_side, f"{extra_name}_{recurrence}.png"))

        os.makedirs(final_dir, exist_ok=True)
        if len(all_images) == 1:
            image, image_name = all_images[0]
            cv2.imwrite(os.path.join(final_dir, image_name), image)
        else:
            run_dir = os.path.join(final_dir, extra_name)
            os.makedirs(run_dir, exist_ok=True)
            for image, image_name in all_images:
                cv2.imwrite(os.path.join(run_dir, image_name), image)

if __name__ == "__main__":
    # run directory names (looked up under base_dir) whose sort results should be plotted
    names = ["sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_with_fire_8x1_1_24_run_1"]
    short_hand = [("rev_abacus_fire_8x1", "v1")] # the shorthand names for the runs you want to plot, in the same order as `names`

    base_dir = "cramming-data/"
    sort_plots_path = "./sort_plots/"
    run(names, short_hand, base_dir, sort_plots_path)

================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.black]
line-length = 140


================================================
FILE: setup.cfg
================================================


[metadata]
name = cramming
version = 0.1.0
author = Sean McLeish
author_email = smcleish@umd.edu
url = https://github.com/mcleish7/arithmetic
description = Fork of cramming for next token prediction
long_description = file: README.md, LICENSE.md
long_description_content_type = text/markdown
license = MIT
license_file = LICENSE.md
platform = any
keywords = Machine Learning, Language Modeling
classifiers =
    License :: OSI Approved :: MIT License
    Operating System :: OS Independent
    Programming Language :: Python
homepage = "https://github.com/mcleish7/arithmetic"
repository = "https://github.com/mcleish7/arithmetic"
documentation = """

[options]
zip_safe = False
include_package_data = True
python_requires = >= 3.10
packages = find:

setup_requires =
    setuptools

install_requires =
    torch >= 2.0.0
    hydra-core >= 1.1
    datasets
    tokenizers
    transformers
    evaluate
    scipy
    scikit-learn # for metrics
    pynvml
    psutil
    einops
    safetensors
    apache-beam  # only used for wikipedia ...
    zstandard    # only used for the Pile
    wandb # if you want to use it
    matplotlib==3.8.3 # the versions of plt and sns are fixed for annotating the heatmaps
    seaborn==0.13.2
    opencv-python

scripts =
  pretrain.py
  arithmetic_eval_quicker.py

[options.package_data]
# list values in setup.cfg are comma-separated and unquoted; quotes would become part of the glob
* = *.yaml, *.txt


[check-manifest]
ignore =
    .ipynb
    .sh


#basically the pytorch flake8 setting from https://github.com/pytorch/pytorch/blob/master/.flake8
[flake8]
select = B,C,E,F,P,T4,W,B9
max-line-length = 140
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
    E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
per-file-ignores = __init__.py: F401 torch/utils/cpp_extension.py: B950
optional-ascii-coding = True
exclude =
    .git,
    __pycache__,
    scripts,
    tables,
    outputs,
    *.pyi


================================================
FILE: shells/addition_ff.sh
================================================
## FF
# nope
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None

# fire
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_fire_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" 

# abacus
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus

## FF w/ II
# nope
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.forward_only_model_with_skip=True
# fire
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_fire_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"  arch.forward_only_model_with_skip=True
# abacus
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus arch.forward_only_model_with_skip=True


## FF w/ II
# Abacus + FIRE
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_abacus_attn_emb_fire_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" arch.forward_only_model_with_skip=True arch.embedding.pos_embedding=abacus 
# Abacus + RoPE
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_abacus_attn_emb_rope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus arch.forward_only_model_with_skip=True arch.attention.type="self-attention" arch.attention.rotary_embedding=true

================================================
FILE: shells/addition_lt.sh
================================================
### Looped Transformer experiments
# vary number of layers in recurrent_block: arch.layers_in_recurrent_block
# vary number of recurrences: arch.maximal_recurrence

# NOPE
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None
# FIRE
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_fire_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" 
# ABACUS
python pretrain.py name=add_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/+_bucket_method_n_20_m_20_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus


================================================
FILE: shells/bitwise_or.sh
================================================
# bitwise or is sometimes referred to as pos_arth in the code

## LT
# NOPE
python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None
#  FIRE
python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_nope_attn_emb_fire_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"
# abacus
python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_1_16_TBPTT_1024_batch_size_512_mask_before_equals_true_start_emb_abacus_attn_emb_nope_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=16 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus

## FF
#nope
python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_nope_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.forward_only_model_with_skip=True
# fire
python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_nope_attn_emb_fire_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" arch.forward_only_model_with_skip=True
# abacus
python pretrain.py name=pos_or_one_vec_zeros_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_abacus_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=1 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/or_one_vec_zeros/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus arch.forward_only_model_with_skip=True

## FF w/ II
# nope
python pretrain.py name=pos_or_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_nope_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/pos_arith_add_20_20_p_00/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.forward_only_model_with_skip=True
# fire
python pretrain.py name=pos_or_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_nope_attn_emb_fire_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/pos_arith_add_20_20_p_00/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=None arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"  arch.forward_only_model_with_skip=True
# abacus
python pretrain.py name=pos_or_bucket_20_20_reverse_all_pad_00_depthrec_16_1_TBPTT_1024_batch_size_256_mask_before_equals_true_start_emb_abacus_attn_emb_nope_with_skip_connections_run_1 wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=256 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=16 arch.maximal_recurrence=1 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/pos_arith_add_20_20_p_00/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True arch.embedding.pos_embedding=abacus arch.forward_only_model_with_skip=True


================================================
FILE: shells/evaluation.sh
================================================
# Evaluation command templates, one per task.
# NOTE: tokens written as <...> or {...} are placeholders, not shell syntax. Substitute real
# values before running; left as-is, the shell would parse e.g. `<name>` as an input redirection.
# There is an automated helper in gen_eval_script.py for generating these evaluation scripts.

# Addition
# <STEP_NUM> selects one big_eval_step_* flag; <EVEN/ODD> selects the checkerboard half.
# NOTE(review): sort_eval.py's grid_logic defines steps 1-10 and only accepts the lowercase
# values 'even'/'odd' -- presumably arithmetic_eval_quicker.py behaves the same; confirm.
python arithmetic_eval_quicker.py name=<name> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> token_limit=105 big_eval_step_<STEP_NUM>=True reverse_inputs=True checkerboard=<EVEN/ODD> remove_padding=True data.sources.arithmetic.tokenizer_type="pad"

# Extended Addition Eval, i.e. 100
# Same as the addition command with extended_eval=True added (step and checkerboard are fixed here).
python arithmetic_eval_quicker.py name=<name> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_Rec> token_limit=105 big_eval_step_5=True reverse_inputs=True checkerboard=even remove_padding=True extended_eval=True data.sources.arithmetic.tokenizer_type="pad"

# Multiplication
python arithmetic_eval_quicker.py name=<NAME> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> token_limit=30 mul=True data.sources.arithmetic.tokenizer_type="pad"

# Sorting
# max_size_given = end of the grid, start_ind_1_given / start_ind_2_given = start of the grid,
# i.e. this evaluates from (1, 1) to (final_size, final_size); note max_size_given is final_size + 1.
python sort_eval.py name=<name> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> sort_reverse=True data.sources.arithmetic.tokenizer_type='sort' max_size_given={final_size + 1} start_ind_1_given={1} start_ind_2_given={1}

# Bitwise OR
# Unlike the addition command, this passes pos_arth_ood=True and remove_padding=False and omits reverse_inputs.
python arithmetic_eval_quicker.py name=<name> base_dir=$cramming_base_dir data=arithmetic max_rec=<max_rec> token_limit=105 big_eval_step_<STEP_NUM>=True checkerboard=<EVEN/ODD> pos_arth_ood=True data.sources.arithmetic.tokenizer_type="pad" remove_padding=False

================================================
FILE: shells/generate_and_tokenize_data.sh
================================================
# Data generation and tokenization command templates.
# NOTE: <NAME>, <i> and <j> are placeholders, not shell syntax; substitute real values before
# running. Most tasks are two steps: generate the raw data, then tokenize it with --tokenize.

## Training Data -- these commands approximately correspond to the zipped data we provide

# bitwise or
python create_pos_or_variants.py --n 20 --m 20 --dir_name <NAME> --max 100
python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.01

# addition
python create_data_split.py --bucket --op + --n 20 --m 20 --limit 20000000 --p 0.0 --dir_name <NAME> --reverse_all
python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.01

# addition with index hints
# NOTE(review): unlike the other tokenize steps this one passes no --test_split_ratio,
# so it relies on the script's default -- confirm that is intended.
python create_data_split.py --bucket --op + --n 20 --m 20 --limit 20000000 --p 0.0 --dir_name <NAME> --reverse_all --index_hints
python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type index

# multiplication
python create_data_split.py --bucket --op x --n 15 --m 15 --limit 20000000 --dir_name <NAME>  --reverse_all --p 0.0
python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.01

# sorting (generation and tokenization happen in a single invocation)
# NOTE(review): this passes --dir where every other command passes --dir_name; presumably it
# relies on the argument parser accepting the abbreviation -- confirm.
python create_data_split.py --uniform_distribution_sort_data --continue_to_tokenize --tokenize --tokenizer_type sort --test_split_ratio 0.01 --n 10 --m 10 --limit 20000000 --dir <NAME> --sort_generation_method bucket_uniform_distribution --reverse_all

## Evaluation Data -- run the generation line and the tokenize line once for each operand length pair (<i>, <j>)
# bitwise or
python create_pos_or_variants.py --n <i> --m <j> --dir_name <NAME> --exact --eval --max 100
python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.0

# addition
python create_data_split.py --op + --n <i> --m <j> --num_samples 100 --dir_name <NAME> --exact
python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.0

# multiplication
python create_data_split.py --op x --n <i> --m <j> --num_samples 100 --dir_name <NAME> --exact
python create_data_split.py --tokenize --dir_name <NAME> --tokenizer_type pad --test_split_ratio 0.0

# sorting
# NOTE(review): uses --test_split_ratio 0.01 while the other evaluation sets use 0.0 -- confirm.
python create_data_split.py --uniform_distribution_sort_data --continue_to_tokenize --tokenize --tokenizer_type sort --test_split_ratio 0.01 --n <i> --m <j> --limit 100 --dir <NAME> --sort_generation_method bucket_uniform_distribution --reverse_all --exact

================================================
FILE: shells/multiplication.sh
================================================
## only Looped Transformer experiments for multiplication
# All three runs launch 8 processes per node (--nproc_per_node=8) on the same 15x15 multiplication
# dataset with 4 layers in the recurrent block and 4 recurrences; they differ only in the
# positional-embedding overrides and the learning rate. Requires $cramming_base_dir to be set.

# FIRE attention embedding + abacus positional embedding, lr=0.00006
torchrun --nproc_per_node=8 --standalone pretrain.py name=mul_bucket_15_15_reverse_all_pad_00_depthrec_4_4_TBPTT_1024_nope_mask_before_equals_batch_512_fire_abacus_8_gpu wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=4 arch.maximal_recurrence=4 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/x_bucket_method_n_15_m_15_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.00006 data.sources.arithmetic.tokenizer_type="pad" arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" arch.mask_before_equals=True impl.fullgraph=false arch.loss_reduction=none arch.throttle=True arch.embedding.pos_embedding="abacus"

# FIRE attention embedding only (arch.embedding.pos_embedding=None), lr=0.00006
torchrun --nproc_per_node=8 --standalone pretrain.py name=mul_bucket_15_15_reverse_all_pad_00_depthrec_4_4_TBPTT_1024_nope_mask_before_equals_batch_512_fire_nope_8_gpu wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=4 arch.maximal_recurrence=4 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/x_bucket_method_n_15_m_15_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.00006 data.sources.arithmetic.tokenizer_type="pad" arch.attention.type="self-attention" arch.attention.rotary_embedding="fire" arch.mask_before_equals=True impl.fullgraph=false arch.loss_reduction=none arch.throttle=True arch.embedding.pos_embedding=None

# abacus positional embedding only (no arch.attention.* overrides), lr=0.0001
torchrun --nproc_per_node=8 --standalone pretrain.py name=mul_bucket_15_15_reverse_all_pad_00_depthrec_4_4_TBPTT_1024_nope_mask_before_equals_batch_512_abacus_8_gpu wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir impl.microbatch_size=512 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=4 arch.maximal_recurrence=4 arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True data.sources.arithmetic.tokenized_dataset_path="arithmetic_data/x_bucket_method_n_15_m_15_20000000_p_00_reverse_all/hf_tokenized_dataset" train.optim.lr=0.0001 data.sources.arithmetic.tokenizer_type="pad" arch.mask_before_equals=True impl.fullgraph=false arch.loss_reduction=none arch.throttle=True arch.embedding.pos_embedding="abacus"

================================================
FILE: shells/sorting.sh
================================================
# REMINDER: set $cramming_base_dir before running; every command below passes base_dir=$cramming_base_dir


## FIRE attention embedding runs on the reversed sorting dataset (arch.embedding.pos_embedding=None).
## In the run names, AxB = arch.layers_in_recurrent_block x arch.maximal_recurrence.
## Each run uses a single process (--nproc_per_node=1) and saves an intermediate model named 'last' every 60 minutes.

## fire reverse: 8 layers in the recurrent block, 1 recurrence
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_fire_8x1_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.attention.type='self-attention' \
	arch.attention.rotary_embedding='fire' impl.fullgraph=false impl.save_every_n_minutes=60 impl.save_intermediate_model_name='last'

## fire reverse recall: same 8x1 layout plus arch.forward_only_model_with_skip=True
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_fire_recall_8x1_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.attention.type='self-attention' \
	arch.attention.rotary_embedding='fire' impl.fullgraph=false impl.save_every_n_minutes=60 impl.save_intermediate_model_name='last' arch.forward_only_model_with_skip=True

## fire reverse recurrence: 1 layer in the recurrent block, 8 recurrences
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_fire_1x8_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=8 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.attention.type='self-attention' \
	arch.attention.rotary_embedding='fire' impl.fullgraph=false impl.save_every_n_minutes=60 impl.save_intermediate_model_name='last'

## Abacus positional embedding runs on the reversed sorting dataset (no arch.attention.* overrides).
## In the run names, AxB = arch.layers_in_recurrent_block x arch.maximal_recurrence.
## NOTE(review): each command passes arch.embedding.pos_embedding twice (first None, then "abacus").
## Presumably the later override wins and the first is redundant -- confirm against Hydra's override handling.

## abacus reverse: 8 layers in the recurrent block, 1 recurrence
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_8x1_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus"

## abacus reverse recall: same 8x1 layout plus arch.forward_only_model_with_skip=True ("skip" in the run name)
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_8x1_skip_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus" arch.forward_only_model_with_skip=True

## abacus reverse recurrence: 1 layer in the recurrent block, 8 recurrences
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_1x8_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=8 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus"


## Abacus positional embedding combined with the FIRE attention embedding, on the reversed sorting dataset.
## In the run names, AxB = arch.layers_in_recurrent_block x arch.maximal_recurrence.
## NOTE(review): each command passes arch.embedding.pos_embedding twice (first None, then "abacus").
## Presumably the later override wins and the first is redundant -- confirm against Hydra's override handling.

## abacus fire reverse: 8 layers in the recurrent block, 1 recurrence
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_with_fire_8x1_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus" \
	arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"

## abacus fire reverse recall: same 8x1 layout plus arch.forward_only_model_with_skip=True ("skip" in the run name)
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_with_fire_8x1_skip_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=8 arch.maximal_recurrence=1 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus" \
	arch.forward_only_model_with_skip=True arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"

## abacus fire reverse recurrence: 1 layer in the recurrent block, 8 recurrences
torchrun --nproc_per_node=1 --standalone pretrain.py name=sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all_abacus_with_fire_1x8_1_24_run_1 \
	wandb=none arch=crammed-depthrecurrent data=arithmetic base_dir=$cramming_base_dir \
	impl.microbatch_size=32 budget=24 impl.compile_torch=False arch.objective_layout=TBPTT arch.layers_in_recurrent_block=1 arch.maximal_recurrence=8 \
	arch.hidden_size=1024 arch.intermed_size=2048 impl.forbid_dataset_preprocessing=False impl.save_intermediate_checkpoints=True impl.save_final_model=True \
	data.sources.arithmetic.tokenized_dataset_path='arithmetic_data/sort_bucket_uniform_distribution_max_digits_n_10_max_length_m_10_20000000_p_00_reverse_all/hf_tokenized_dataset' \
	train.optim.lr=0.0001 arch.embedding.pos_embedding=None data.sources.arithmetic.tokenizer_type='sort' arch.mask_before_equals=True arch.embedding.pos_embedding="abacus" \
	arch.attention.type="self-attention" arch.attention.rotary_embedding="fire"

================================================
FILE: sort_eval.py
================================================
import logging
import hydra
from omegaconf import OmegaConf
import cramming
import torch
from safetensors.torch import load_file
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np
import re
import pandas as pd
import datasets
import os
from typing import List, Dict
from cramming.data.tokenizer_preparation import get_tokenizer
import random

log = logging.getLogger(__name__)

def grid_plotter(data, type="accs", name='_large', extra_path=None):
    """Plot a 2D accuracy grid as an annotated heatmap and save it to disk.

    Args:
        data: grid of accuracies (anything ``np.array`` accepts). Every value is
            multiplied by 100 before plotting, i.e. presumably fractions in [0, 1].
            Rows are labelled "1st Number Length", columns "2nd Number Length",
            and both axes are ticked 1, 2, 3, ...
        type: first part of the output file name. (The parameter shadows the
            builtin ``type``; the name is kept so existing keyword callers work.)
        name: second part of the output file name.
        extra_path: optional prefix prepended verbatim to the file name; no path
            separator is inserted, so include a trailing one if a directory is meant.

    The figure is written to ``f"{extra_path}{type}{name}_grid_plot"`` (without the
    prefix when ``extra_path`` is None).
    """
    data = np.array(data) * 100
    df = pd.DataFrame(data)
    # Tick each axis by its own length so non-square grids are labelled correctly.
    n_rows, n_cols = df.shape

    # Create the heatmap
    fig = plt.figure(figsize=(10, 8))
    try:
        sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".1f", annot_kws={'size': 8, 'rotation': 0})

        # Customize the plot
        plt.title("Accuracy - percentage, rounded to 1dp")
        plt.ylabel("1st Number Length")
        plt.xlabel("2nd Number Length")
        # Heatmap cells are centred on x.5, hence the 0.5 offset; labels are 1-based.
        plt.xticks(np.arange(0.5, n_cols + 0.5, 1), labels=np.arange(1, n_cols + 1, 1))
        plt.yticks(np.arange(0.5, n_rows + 0.5, 1), labels=np.arange(1, n_rows + 1, 1))

        prefix = "" if extra_path is None else extra_path
        plt.savefig(f"{prefix}{type}{name}_grid_plot", bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed; clearing it is not
        # enough, so repeated calls would otherwise accumulate open figures.
        plt.close(fig)

def grid_logic(cfg):
    """Select which cells of the 2D (size_1, size_2) grid this run is responsible for evaluating.

    The flags on ``cfg`` are checked in a fixed order and the LAST truthy one wins:
    the default "large" grid, then ``ood_only``, ``up_to_40``, ``up_to_50`` and
    finally ``big_eval_step_1`` ... ``big_eval_step_10``. ``cfg.checkerboard``
    ('even', 'odd' or None) additionally restricts the big-eval steps -- and only
    those -- to cells whose coordinate sum has that parity.

    Args:
        cfg: config object exposing ``large``, ``ood_only``, ``up_to_40``,
            ``up_to_50``, ``checkerboard`` and ``big_eval_step_1`` ...
            ``big_eval_step_10`` as attributes.

    Returns:
        A tuple ``(logic_func, name, max_size)``:
        ``logic_func(data_size_1, data_size_2)`` is truthy for cells this run evaluates;
        ``name`` is a suffix identifying the selection (e.g. ``'_large'``,
        ``'_big_eval_3_even'``);
        ``max_size`` is the largest size of the selected grid plus one (24, 31, 41,
        51 or 101) -- presumably used by the caller as an exclusive loop bound.

    Raises:
        SystemExit: if ``cfg.checkerboard`` is not None, 'even' or 'odd'.
    """
    import sys  # local import: only needed to report an invalid checkerboard value

    def make_window(lower, upper):
        """Cells where some side is >= lower and some side is <= upper."""
        def in_window(data_size_1, data_size_2):
            return (data_size_1 >= lower or data_size_2 >= lower) and (data_size_1 <= upper or data_size_2 <= upper)
        return in_window

    # original testing: the default selection when no other flag is set
    def logic_func_large(data_size_1, data_size_2):
        return (data_size_1 <= 23 or data_size_2 <= 23)

    logic_func, name, max_size = logic_func_large, '_large', 23 + 1

    # (flag value, log label, name suffix, lower bound, upper bound), in precedence order
    windows = (
        (cfg.ood_only, "ood only", '_ood_only', 24, 30),
        (cfg.up_to_40, "up to 40", '_up_to_40', 31, 40),
        (cfg.up_to_50, "up to 50", '_up_to_50', 41, 50),
    )
    for enabled, _, suffix, lower, upper in windows:
        if enabled:
            logic_func, name, max_size = make_window(lower, upper), suffix, upper + 1

    # checkerboarding: for the large eval we can checkerboard
    if cfg.checkerboard is None:
        def checkerboard_func(data_size_1, data_size_2):
            return True
        checkerboard_str = ""
    elif cfg.checkerboard == 'even':
        def checkerboard_func(data_size_1, data_size_2):
            return ((data_size_1 + data_size_2) % 2 == 0)
        checkerboard_str = "_even"
    elif cfg.checkerboard == 'odd':
        def checkerboard_func(data_size_1, data_size_2):
            return ((data_size_1 + data_size_2) % 2 == 1)
        checkerboard_str = "_odd"
    else:
        # sys.exit with a message reports the problem on stderr and exits with a
        # non-zero status; the interactive-only exit() helper exited with status 0.
        sys.exit(f"checkerboard config not allowed: {cfg.checkerboard!r} (expected 'even', 'odd' or None)")

    # if we are testing up to 100, split into 10 steps each of approximately equal number of
    # forward passes required. Step k covers cells with both sides <= upper and, except for
    # step 1 (lower is None), at least one side >= lower.
    big_eval_bounds = (
        (None, 46), (47, 58), (59, 67), (68, 74), (75, 80),
        (81, 85), (86, 90), (91, 94), (95, 97), (98, 100),
    )

    def make_big_eval_step(lower, upper):
        """Cells inside one big-eval band that also pass the checkerboard filter."""
        def in_step(data_size_1, data_size_2):
            reaches_band = lower is None or data_size_1 >= lower or data_size_2 >= lower
            return (
                reaches_band
                and (data_size_1 <= upper and data_size_2 <= upper)
                and checkerboard_func(data_size_1, data_size_2)
            )
        return in_step

    big_eval_flags = []
    for step, (lower, upper) in enumerate(big_eval_bounds, start=1):
        enabled = getattr(cfg, f"big_eval_step_{step}")
        big_eval_flags.append(enabled)
        if enabled:
            logic_func = make_big_eval_step(lower, upper)
            name = f'_big_eval_{step}' + checkerboard_str
            max_size = 100 + 1

    # Log every flag in precedence order so the winning selection can be read off the log.
    log.info(f"large = {cfg.large}")
    for enabled, label, _, _, _ in windows:
        log.info(f"{label} = {enabled}")
    for step, enabled in enumerate(big_eval_flags, start=1):
        log.info(f"big eval {step} = {enabled}")
    log.info("the last true value in the above list will be run, mul and pos arith can take control after this")

    return logic_func, name, max_size

def main(cfg):
    """Evaluate a pretrained checkpoint on the reversed-number sorting grid.

    Loads the checkpoint named by ``cfg.eval.checkpoint`` and, for every
    ``(data_size_1, data_size_2)`` cell accepted by the predicate returned from
    ``grid_logic(cfg)``, generates an answer for each test example of that cell.
    ``data_size_1`` is the maximum number of digits per number and
    ``data_size_2`` the array length (see the dataset path below). For each cell
    the exact-match accuracy and the first-token ("top 1") accuracy are logged
    and written to disk in percent, so an interrupted run can skip finished
    cells when restarted. Finally the whole grid is dumped to JSON and plotted.

    Args:
        cfg: Evaluation config. Fields read directly here: ``base_dir``,
            ``name``, ``eval.checkpoint``, ``eval.arch_modifications``,
            ``max_rec``, ``data.sources``, ``greedy``, ``temp``,
            ``max_size_given``, ``start_ind_1_given``, ``start_ind_2_given``
            and ``extended_eval``.

    Raises:
        SystemExit: If the first data source is not an ``"arithmetic"`` provider.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    local_checkpoint_folder = os.path.join(cfg.base_dir, cfg.name, "checkpoints")
    tokenizer, cfg_arch, model_file = cramming.utils.find_pretrained_checkpoint(cfg.eval.checkpoint,
                                                                                local_checkpoint_folder,
                                                                                cfg.eval.arch_modifications)
    if cfg.max_rec is not None: # can have more/less recurrences for eval
        cfg_arch.maximal_recurrence_in_eval = cfg.max_rec
    else:
        cfg_arch.maximal_recurrence_in_eval = cfg_arch.maximal_recurrence
    log.info(f"cfg_arch.maximal_recurrence_in_eval changed to {cfg_arch.maximal_recurrence_in_eval}")
    cfg_arch.throttle = False # turn throttle off

    logic_func, name, max_size = grid_logic(cfg)

    # import tokeniser (replaces the one returned with the checkpoint)
    first_data_source = list(cfg.data.sources.values())[0]
    if first_data_source["provider"] != "arithmetic":
        log.info("exiting as this is only for arithmetic")
        # Same effect as the `exit()` helper, without relying on the `site` module.
        raise SystemExit
    tokenizer = get_tokenizer(first_data_source["tokenizer_type"])

    # Token ids are fixed for the whole run, so look them up once.
    eos_token = tokenizer._convert_token_to_id(tokenizer.eos_token)
    equals_token = tokenizer._convert_token_to_id("=")
    PAD_token = tokenizer._convert_token_to_id(tokenizer.pad_token)
    assert PAD_token == 0, "PAD token must be token zero for our code to work"

    # Load model
    if 'alpha' not in cfg_arch:
        cfg_arch['alpha'] = 1.0

    model = cramming.construct_model(cfg_arch, tokenizer).to(device)
    model = cramming.backend.load_model_checkpoint(model, model_file)
    model.to(device)
    model.eval()

    log.info(f"greedy = {cfg.greedy}, note: if greedy = True this overrides any temperature arguments")
    ## Greedy decoding will overide any temperature arguments

    if cfg.max_size_given is not None: # allows unique splits for eval
        max_size = cfg.max_size_given

    # Grid search over sizes 1 .. max_size-1 on both axes.
    # acc_grid[i, j] holds the accuracy (in percent) for data_size_1 = i+1, data_size_2 = j+1.
    data_sizes = list(range(1, max_size))
    acc_grid = np.zeros((len(data_sizes),len(data_sizes)))
    start_ind_1 = 0
    start_ind_2 = 0
    if "big_eval" in name:
        # go up two layers and search for a grid saved by a previous run
        try:
            with open(f"../../accs_grid_quick{name}.json", 'r') as file:
                data = json.load(file)
            saved_grid, saved_ind_1, saved_ind_2 = np.array(data[0]), data[1], data[2]
        except (OSError, ValueError, LookupError, TypeError) as err:
            # Missing, unreadable or malformed file: this is best-effort, start from scratch.
            # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
            log.info(f"no usable saved grid for {name}, starting from scratch ({err!r})")
        else:
            # Only commit the restored state once every piece was read successfully.
            acc_grid = saved_grid
            start_ind_1 = saved_ind_1
            start_ind_2 = saved_ind_2
            log.info("loaded grid from previous run")

    if cfg.start_ind_1_given is not None: # allows unique splits for eval
        start_ind_1 = cfg.start_ind_1_given
    if cfg.start_ind_2_given is not None:
        start_ind_2 = cfg.start_ind_2_given
    log.info(f"start_ind_1 = {start_ind_1}, start_ind_2 = {start_ind_2}")

    os.makedirs("outputs", exist_ok=True)

    all_outputs_folder_path = f"../../all_outputs_max_recurrence={cfg_arch.maximal_recurrence_in_eval}"
    os.makedirs(all_outputs_folder_path, exist_ok=True)

    if not cfg.extended_eval:
        # main 2d loop
        for data_size_1 in data_sizes:
            for data_size_2 in data_sizes:
                # A cell is skipped only when it lies below BOTH start indices.
                if not (data_size_1 >= start_ind_1 or data_size_2 >= start_ind_2):
                    continue

                acc_file_path = f"{all_outputs_folder_path}/acc_for_{data_size_1}_{data_size_2}.txt"
                top_1_file_path = f"{all_outputs_folder_path}/top_1_acc_for_{data_size_1}_{data_size_2}.txt"

                # check if done: a finished cell has its accuracy saved in acc_file_path
                if os.path.exists(acc_file_path):
                    with open(acc_file_path, 'r') as file:
                        cached_text = file.read()
                    try:
                        cached_acc = float(cached_text)
                    except ValueError:
                        # e.g. an empty file left behind by a crash while writing; evaluate again
                        log.warning(f"ignoring unreadable result file {acc_file_path}")
                    else:
                        acc_grid[data_size_1-1, data_size_2-1] = cached_acc
                        continue

                if not logic_func(data_size_1, data_size_2):
                    continue

                log.info(f"Starting iteration in grid eval for size: {data_size_1} and {data_size_2}")
                # only one option -- sorting with reversed numbers
                file_path = f"../../../../data/arithmetic_data/sort_reverse/sort_uniform_distribution_sort_basic_max_digits_n_{data_size_1}_max_length_m_{data_size_2}_200_p_00_reverse_all/hf_tokenized_dataset"

                tokenized_dataset = datasets.load_from_disk(file_path)["test"]
                data_loader = torch.utils.data.DataLoader(tokenized_dataset, batch_size=100, shuffle=False)

                # running totals over the whole cell; examples are evaluated one at a time
                correct_total = 0
                all_total = 0
                top_1_total = 0
                for batch in data_loader:
                    # batch["input_ids"] is stacked and transposed so that each row is one example
                    input_ids = torch.stack(batch["input_ids"]).to(device)
                    input_ids = torch.transpose(input_ids, 0, 1)

                    for example in input_ids:
                        # .item() requires exactly one "=" in the example
                        equals_index = torch.where(example == equals_token)[0].item()
                        question = example[:equals_index + 1].unsqueeze(0)
                        answer = example[equals_index + 1:]

                        # the limit is based on the untrimmed tail after "="
                        local_token_limit = int(len(answer) * 2)
                        predicted_ids = model._generate(question,
                                                        token_limit=local_token_limit,
                                                        temperature=cfg.temp,
                                                        steps_at_generation_time=cfg_arch.maximal_recurrence_in_eval,
                                                        greedy=cfg.greedy, quick=True)
                        predicted_ids = predicted_ids.squeeze()

                        # get the answer: everything before the EOS token
                        eos_index = torch.where(answer == eos_token)[0].item()
                        answer = answer[:eos_index]

                        # compare only as many predicted tokens as the answer has
                        predicted_ids = predicted_ids[:len(answer)]
                        if torch.equal(predicted_ids, answer):
                            correct_total += 1

                        if torch.equal(answer[0], predicted_ids[0]):
                            top_1_total += 1

                        all_total += 1

                if all_total == 0:
                    # Nothing was evaluated; avoid dividing by zero and leave the cell unsaved.
                    log.warning(f"no test examples found for size: {data_size_1} and {data_size_2}")
                    continue

                acc = correct_total / all_total
                acc_top_1 = top_1_total / all_total

                log.info(f"accuracy for data that has numbers "
                         f"with maximum number of digits as {data_size_1} , "
                         f"and the array of length {data_size_2} is {acc * 100}")
                log.info(f"Top 1 accuracy for data that has numbers "
                         f"with maximum number of digits as {data_size_1} , "
                         f"and the array of length {data_size_2} is {acc_top_1 * 100}")

                # log the last evaluated example of this cell as an illustration
                question_text = tokenizer.decode(question.squeeze().tolist())
                answer_text = tokenizer.decode(answer.tolist())
                predicted_text = tokenizer.decode(predicted_ids.tolist())
                log.info(f"For example : sort {question_text} for which the answer is {answer_text} , "
                         f"and the predicted is {predicted_text}")
                acc_grid[(data_size_1-1), (data_size_2-1)] = acc * 100

                # save all in case of crash
                with open(acc_file_path, "w") as file:
                    file.write(f"{acc * 100}")
                with open(top_1_file_path, "w") as file:
                    file.write(f"{acc_top_1 * 100}")

        log.info(f"acc grid: {acc_grid}")

        with open(f"accs_grid_quick_{start_ind_1}_{start_ind_2}_{max_size}.json", "w") as file:
            json.dump(acc_grid.tolist(), file)

        # Grid plots - one in the working directory, one next to the per-cell outputs
        grid_plotter(acc_grid, name=f"{start_ind_1}_{start_ind_2}_{max_size}")
        grid_plotter(acc_grid, name=f"{start_ind_1}_{start_ind_2}_{max_size}", extra_path=all_outputs_folder_path)

    log.info("Eval complete")

@hydra.main(config_path="cramming/config", config_name="cfg_eval", version_base="1.3")
def launch(cfg):
    """Hydra entry point: pass the config through ``cramming.utils.pathfinder``, log it as YAML and run ``main``."""
    log.info("calling main launch")
    prepared_cfg = cramming.utils.pathfinder(cfg)
    rendered_cfg = OmegaConf.to_yaml(prepared_cfg, resolve=True)
    log.info(rendered_cfg)
    main(prepared_cfg)

# Run the Hydra-decorated entry point only when this file is executed as a script.
if __name__ == "__main__":
    launch()

================================================
FILE: upload_processed_dataset.py
================================================
"""Script to upload a processed dataset to the huggingface hub. You probably don't need this :)"""


import hydra
import logging
from omegaconf import OmegaConf
import tempfile
import os

from datasets import load_dataset

import cramming


log = logging.getLogger(__name__)


def upload(cfg, setup):
    """Upload the preprocessed pretraining corpus and its tokenizer files to the Hugging Face hub.

    The corpus is loaded with ``cramming.load_pretraining_corpus``, rewritten as
    parquet chunks that keep only the ``input_ids`` column, re-assembled with
    ``load_dataset`` and pushed as a private dataset named
    ``{cfg.data.name}_{checksum}``. Every file in the local ``tokenizer`` folder
    of that processed dataset is then uploaded into the same dataset repo.

    Args:
        cfg: Pretraining config; ``cfg.data`` and ``cfg.impl`` are read here.
        setup: Unused; present because ``cramming.utils.main_launcher`` is given
            this function as a callback (presumably called as ``(cfg, setup)`` — verify).
    """
    dataset, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl)
    checksum = cramming.data.utils.checksum_config(cfg.data)
    processed_dataset_name = f"{cfg.data.name}_{checksum}"

    use_own_chunking = True
    chunk_size = 8192 * 32
    # Ceiling division: an exact multiple of chunk_size must not schedule an extra, empty
    # chunk. An empty dataset still yields one (empty) file, as before.
    num_files = max(1, (len(dataset) + chunk_size - 1) // chunk_size)
    target_types = ["input_ids"]

    files = []
    # Split dataset in parquet files
    with tempfile.TemporaryDirectory() as tmpdirname:
        if use_own_chunking:
            # Loop through the dataset and write each chunk to a Parquet file
            # This is not really necessary, but nice to save only target_types and to match chunk sizes to target batch sizes
            for idx in range(num_files):
                chunk = dataset.select(range(idx * chunk_size, min(len(dataset), (idx + 1) * chunk_size)))
                filename = f"{tmpdirname}/train_{idx}.parquet"
                chunk.to_pandas()[target_types].to_parquet(filename, index=False)
                files.append(filename)
                log.info(f"Chunk {idx} written to file {filename}.")

            # Re-assemble parqueted dataset
            dataset = load_dataset("parquet", data_files=files)

        # Define the dataset info
        description = f"""This is a preprocessed dataset for the cramming-project.

                                Use only with the tokenizer prescribed here.
                                This version is {processed_dataset_name}, which corresponds to the following setup:
                                {OmegaConf.to_yaml(cfg, resolve=True)}

                                Limitations and bias:
                                This training data was further filtered and sorted beyond the normal preprocessing.
                                These modifications were not tested for unintended consequences.

                              """
        dataset["train"].info.description = description
        # dataset_tags = ["cramming", "English", "preprocessed"]

        # Launch upload (still inside the temporary directory, so the parquet files exist)
        log.info("Preparing for dataset upload ...")
        dataset.push_to_hub(processed_dataset_name, private=True)

        # Upload tokenizer to same address - this is annoying because by default tokenizers are pushed to model directories
        # tokenizer.push_to_hub(processed_dataset_name) -> this will push to a new directory in HF models
        from huggingface_hub import HfApi

        api = HfApi()
        log.info("Preparing for tokenizer upload ...")
        tokenizer_loc = os.path.join(cfg.impl.path, processed_dataset_name, "tokenizer")
        # whoami() is a network request; resolve the target repo once instead of once per file.
        repo_id = f"{api.whoami()['name']}/{processed_dataset_name}"
        for file in os.listdir(tokenizer_loc):
            api.upload_file(
                path_or_fileobj=os.path.join(tokenizer_loc, file),
                # Paths inside a hub repo use "/" regardless of the local OS separator.
                path_in_repo=f"tokenizer/{file}",
                repo_id=repo_id,
                repo_type="dataset",
            )
        log.info("Upload completed succesfully.")


@hydra.main(config_path="cramming/config", config_name="cfg_pretrain", version_base="1.3")
def launch(cfg):
    """Hydra entry point: hand the config and ``upload`` to ``cramming.utils.main_launcher`` under the job name "upload"."""
    cramming.utils.main_launcher(cfg, upload, job_name="upload")


# Run the Hydra-decorated entry point only when this file is executed as a script.
if __name__ == "__main__":
    launch()