main 56463f385355 cached
337 files
1.3 MB
371.3k tokens
723 symbols
1 request
Download .txt
Showing preview only (1,467K chars total). Download the full file or copy to clipboard to get everything.
Repository: kmccleary3301/nested_learning
Branch: main
Commit: 56463f385355
Files: 337
Total size: 1.3 MB

Directory structure:
gitextract_wq324oq_/

├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── config.yml
│   │   ├── eval_request.md
│   │   ├── faithfulness_gap.md
│   │   └── perf_regression.md
│   └── workflows/
│       ├── ci.yml
│       ├── packages.yml
│       ├── release.yml
│       └── security.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── TODO.md
├── configs/
│   ├── ablations/
│   │   ├── cms_sparse.yaml
│   │   ├── selfmod_chunked_8_64.yaml
│   │   ├── selfmod_momentum_off.yaml
│   │   ├── selfmod_momentum_on.yaml
│   │   ├── selfmod_no_alpha.yaml
│   │   ├── selfmod_no_cms.yaml
│   │   └── selfmod_rank1_precond_off.yaml
│   ├── data/
│   │   ├── continual_segments_sample.yaml
│   │   ├── fineweb_edu_longdoc_filtered_sample.yaml
│   │   ├── fineweb_edu_mixture_full.yaml
│   │   ├── fineweb_edu_mixture_sample.yaml
│   │   ├── refinedweb_mixture.yaml
│   │   ├── refinedweb_mixture_filtered.yaml
│   │   ├── refinedweb_mixture_full.yaml
│   │   └── refinedweb_mixture_sample.yaml
│   ├── deepspeed/
│   │   └── zero3.json
│   ├── hope/
│   │   ├── mid.yaml
│   │   ├── mid_fsdp.yaml
│   │   ├── pilot.yaml
│   │   ├── pilot_attention.yaml
│   │   ├── pilot_selfmod.yaml
│   │   ├── pilot_transformer.yaml
│   │   ├── target.yaml
│   │   └── target_fsdp.yaml
│   ├── mid_smoke.yaml
│   ├── mid_stage2.yaml
│   ├── mid_stage2_smoke.yaml
│   ├── mid_titan_baseline.yaml
│   ├── pilot.yaml
│   ├── pilot_paper_faithful.yaml
│   ├── pilot_selfmod_paper_faithful.yaml
│   ├── pilot_smoke.yaml
│   └── resolved/
│       ├── cms_sparse_eval.yaml
│       ├── phase2_pilot_attention_eval.yaml
│       └── phase2_pilot_transformer_eval.yaml
├── docker/
│   └── Dockerfile.dist
├── docs/
│   ├── BUG_REPORT_CHECKLIST.md
│   ├── COMPATIBILITY_MATRIX.md
│   ├── FSDP_SCALING_GUIDE.md
│   ├── IMPLEMENTATION_STATUS.md
│   ├── P4_REMEDIATION_PLAN.md
│   ├── PACKAGE_RELEASE_CHECKLIST.md
│   ├── PAPER_COMPLIANCE.md
│   ├── PHASE2_LONG_CONTEXT_COMPARISON.md
│   ├── PHASE_2_PLAN.md
│   ├── PYPI_TRUSTED_PUBLISHING.md
│   ├── STREAMING_CONTRACT.md
│   ├── VERSIONING_POLICY.md
│   ├── compute_plan.md
│   ├── continual_classification_eval.md
│   ├── continual_eval.md
│   ├── data_pipeline.md
│   ├── env_matrix.md
│   ├── experiments_report.md
│   ├── future_directions.md
│   ├── phase2_comparison.md
│   ├── release_checklist.md
│   ├── scaling_guidance.md
│   ├── spec_interfaces.md
│   ├── sprint_next_plan.md
│   ├── stage2_plan.md
│   ├── stage2_progress.md
│   ├── templates/
│   │   └── checkpoint_report.md
│   └── zeroshot_eval.md
├── eval/
│   ├── continual_dummy.json
│   ├── continual_mid_stage2.json
│   ├── continual_mid_stage2_smoke.json
│   ├── continual_mid_stage2_ts10.json
│   ├── continual_mid_stage2_ts10_single120_clip.json
│   ├── continual_mid_stage2_ts10_single140_schedC.json
│   ├── continual_mid_stage2_ts10_single220_schedD.json
│   ├── continual_mid_stage2_ts10_single80.json
│   ├── continual_mid_stage2_ts10_single80lr2e5.json
│   ├── continual_mid_stage2_ts20.json
│   ├── continual_mid_titan_baseline.json
│   ├── continual_pilot.json
│   ├── continual_pilot_cms_nochunk_step5000.json
│   ├── continual_pilot_cms_sparse_step5000.json
│   ├── continual_pilot_multi.json
│   ├── continual_pilot_opt_adamw_step5000.json
│   ├── continual_pilot_opt_muon_step5000.json
│   ├── continual_pilot_selfmod_off_step5000.json
│   ├── continual_pilot_step22000.json
│   ├── continual_pilot_step230000.json
│   ├── continual_pilot_teach05_long_step25000.json
│   ├── continual_pilot_teach05_step2000.json
│   ├── continual_pilot_teach15_long_step25000.json
│   ├── continual_pilot_teach15_step2000.json
│   ├── continual_smoke.json
│   ├── continual_titan.json
│   ├── continual_titan_relaunch_step001000.json
│   ├── continual_titan_step25000.json
│   ├── niah_dummy.json
│   ├── niah_mid_stage2.json
│   ├── niah_mid_stage2_smoke.json
│   ├── niah_mid_stage2_ts10.json
│   ├── niah_mid_stage2_ts10_single120_clip.json
│   ├── niah_mid_stage2_ts10_single140_schedC.json
│   ├── niah_mid_stage2_ts10_single220_schedD.json
│   ├── niah_mid_stage2_ts10_single80.json
│   ├── niah_mid_stage2_ts10_single80lr2e5.json
│   ├── niah_mid_stage2_ts20.json
│   ├── niah_mid_titan_baseline.json
│   ├── niah_pilot.json
│   ├── niah_pilot_cms_nochunk_step5000.json
│   ├── niah_pilot_cms_sparse_step5000.json
│   ├── niah_pilot_opt_adamw_step5000.json
│   ├── niah_pilot_opt_muon_step5000.json
│   ├── niah_pilot_selfmod_off_step5000.json
│   ├── niah_pilot_step22000.json
│   ├── niah_pilot_step230000.json
│   ├── niah_pilot_teach05_long_step25000.json
│   ├── niah_pilot_teach05_step2000.json
│   ├── niah_pilot_teach15_long_step25000.json
│   ├── niah_pilot_teach15_step2000.json
│   ├── niah_smoke.json
│   ├── niah_titan.json
│   ├── niah_titan_relaunch_step001000.json
│   ├── niah_titan_step25000.json
│   ├── passkey_pilot.json
│   ├── passkey_pilot_step230000.json
│   ├── passkey_titan.json
│   ├── passkey_titan_relaunch_step001000.json
│   ├── passkey_titan_step25000.json
│   ├── pg19_pilot.json
│   ├── pg19_pilot_step230000.json
│   ├── pg19_titan.json
│   ├── pg19_titan_relaunch_step001000.json
│   ├── pg19_titan_step25000.json
│   ├── phase2_compare_smoke_lastlayer_metrics.json
│   ├── zeroshot_full_smoke.json
│   ├── zeroshot_mid_stage2.json
│   ├── zeroshot_mid_stage2_smoke.json
│   ├── zeroshot_mid_stage2_smoke_piqa_baseline.json
│   ├── zeroshot_mid_stage2_smoke_piqa_mem.json
│   ├── zeroshot_mid_stage2_ts10.json
│   ├── zeroshot_mid_stage2_ts10_single120_clip.json
│   ├── zeroshot_mid_stage2_ts10_single140_schedC.json
│   ├── zeroshot_mid_stage2_ts10_single220_schedD.json
│   ├── zeroshot_mid_stage2_ts10_single80.json
│   ├── zeroshot_mid_stage2_ts10_single80lr2e5.json
│   ├── zeroshot_mid_stage2_ts20.json
│   ├── zeroshot_mid_titan_baseline.json
│   ├── zeroshot_pilot.json
│   ├── zeroshot_pilot_cms_nochunk_step5000.json
│   ├── zeroshot_pilot_cms_sparse_step5000.json
│   ├── zeroshot_pilot_debug.json
│   ├── zeroshot_pilot_dummy_piqa.json
│   ├── zeroshot_pilot_opt_adamw_step5000.json
│   ├── zeroshot_pilot_opt_muon_step5000.json
│   ├── zeroshot_pilot_selfmod_off_step5000.json
│   ├── zeroshot_pilot_step22000.json
│   ├── zeroshot_pilot_step230000.json
│   ├── zeroshot_pilot_teach05_long_step25000.json
│   ├── zeroshot_pilot_teach05_step2000.json
│   ├── zeroshot_pilot_teach15_long_step25000.json
│   ├── zeroshot_pilot_teach15_step2000.json
│   ├── zeroshot_smoke.json
│   ├── zeroshot_titan.json
│   ├── zeroshot_titan_relaunch_step001000.json
│   └── zeroshot_titan_step25000.json
├── google_papers/
│   ├── Nested_Learning/
│   │   ├── Nested_Learning.json
│   │   └── Nested_Learning.md
│   └── TITANs/
│       ├── TITANs.json
│       └── TITANs.md
├── pyproject.toml
├── reports/
│   ├── ablations.md
│   ├── cadence_mechanism_audit_smoke.json
│   ├── compliance_mechanism_audit_smoke.json
│   ├── compliance_summary_pilot.json
│   ├── compliance_summary_pilot_paper_faithful.json
│   ├── next_backlog_scoped.md
│   ├── security_release_gate.md
│   ├── sprint_completion_report.md
│   └── stage2_smoke.md
├── scripts/
│   ├── __init__.py
│   ├── checkpoint/
│   │   └── verify.py
│   ├── checks/
│   │   ├── check_data_script_help.sh
│   │   ├── check_git_tracked_sizes.sh
│   │   ├── check_readme_commands.sh
│   │   ├── compliance_report.py
│   │   ├── run_fidelity_ci_subset.sh
│   │   ├── tokenizer_coverage_guard.py
│   │   ├── verify_docs_refs.py
│   │   └── verify_update_cadence.py
│   ├── compute/
│   │   └── create_reservations.sh
│   ├── data/
│   │   ├── __init__.py
│   │   ├── check_tokenizer.py
│   │   ├── check_tokenizer_coverage.py
│   │   ├── filter_corpus.py
│   │   ├── process_mixture.py
│   │   ├── run_full.sh
│   │   ├── run_sample.sh
│   │   ├── shard_corpus.py
│   │   ├── train_tokenizer.py
│   │   └── validate_mixture.py
│   ├── eval/
│   │   ├── __init__.py
│   │   ├── compare_variants.py
│   │   ├── continual.py
│   │   ├── continual_classification.py
│   │   ├── niah.py
│   │   ├── niah_suite.py
│   │   ├── passkey.py
│   │   ├── pg19_perplexity.py
│   │   ├── phase2_memorization_delta_smoke.py
│   │   ├── plot_continual_classification.py
│   │   ├── plot_forgetting.py
│   │   ├── plot_niah_suite.py
│   │   ├── run_pilot_suite.sh
│   │   ├── summarize_eval.py
│   │   └── zeroshot.py
│   ├── package_pilot_release.sh
│   ├── run_cpu_ddp_smoke.sh
│   ├── run_e2e_smoke.sh
│   ├── run_mechanism_audit_smoke.sh
│   ├── run_smoke.sh
│   └── tests/
│       └── run_passkey_smoke.sh
├── src/
│   └── nested_learning/
│       ├── __init__.py
│       ├── __main__.py
│       ├── assoc_memory.py
│       ├── backbones.py
│       ├── capabilities.py
│       ├── cli.py
│       ├── cms.py
│       ├── config_utils.py
│       ├── continual_classification.py
│       ├── continual_streaming.py
│       ├── data.py
│       ├── device.py
│       ├── eval_state.py
│       ├── fast_state.py
│       ├── functional.py
│       ├── hope/
│       │   ├── __init__.py
│       │   ├── block.py
│       │   └── self_mod.py
│       ├── instrumentation.py
│       ├── levels.py
│       ├── logging_utils.py
│       ├── memorize.py
│       ├── model.py
│       ├── optim/
│       │   ├── __init__.py
│       │   ├── deep.py
│       │   ├── factory.py
│       │   ├── m3.py
│       │   └── manager.py
│       ├── titan/
│       │   ├── __init__.py
│       │   ├── memory.py
│       │   ├── model.py
│       │   └── self_modifying.py
│       ├── tokenizer.py
│       ├── tokenizer_coverage.py
│       ├── training.py
│       └── transformer.py
├── tests/
│   ├── conftest.py
│   ├── data/
│   │   ├── passkey_corpus.txt
│   │   ├── tiny_tokenizer.model
│   │   └── tiny_tokenizer.vocab
│   ├── test_algorithm_mode_grad.py
│   ├── test_attention_cache.py
│   ├── test_attention_features.py
│   ├── test_boundary_state_mode.py
│   ├── test_boundary_state_training_loop.py
│   ├── test_build_model_from_cfg_selfmod.py
│   ├── test_checkpoint_metadata_and_eval_loaders.py
│   ├── test_cli_tooling.py
│   ├── test_cms.py
│   ├── test_cms_cross_call.py
│   ├── test_cms_delta_rule.py
│   ├── test_cms_flush_partial.py
│   ├── test_compare_variants_cli.py
│   ├── test_compile_toggle.py
│   ├── test_compliance_report.py
│   ├── test_continual_classification.py
│   ├── test_continual_eval_state_mode.py
│   ├── test_data_scripts_help.py
│   ├── test_data_split_fallbacks.py
│   ├── test_determinism_seed.py
│   ├── test_device_resolution.py
│   ├── test_distributed_fail_fast.py
│   ├── test_eval_builders.py
│   ├── test_eval_state.py
│   ├── test_eval_state_cli.py
│   ├── test_faithfulness_harness.py
│   ├── test_fast_state_batch_semantics.py
│   ├── test_fast_state_forward_equivalence.py
│   ├── test_fast_state_meta_grads.py
│   ├── test_fast_state_selfmod_meta_grads.py
│   ├── test_git_tracked_sizes_check.py
│   ├── test_hope_block.py
│   ├── test_hope_selfmod_fast_state_meta_unchanged.py
│   ├── test_hope_selfmod_integration.py
│   ├── test_hope_selfmod_update_pass.py
│   ├── test_levels.py
│   ├── test_m3.py
│   ├── test_m3_slow_timing.py
│   ├── test_memorization.py
│   ├── test_model.py
│   ├── test_model_streaming_cadence.py
│   ├── test_online_chunking.py
│   ├── test_optim.py
│   ├── test_optimizer_param_policy.py
│   ├── test_package_release_script.py
│   ├── test_paper_faithful_configs.py
│   ├── test_phase2_memorization_delta.py
│   ├── test_residual_mlp_memory.py
│   ├── test_run_features.py
│   ├── test_self_modifying_titans.py
│   ├── test_selfmod_adaptive_q.py
│   ├── test_selfmod_dgd_linear.py
│   ├── test_selfmod_grad_flow.py
│   ├── test_selfmod_local_conv.py
│   ├── test_selfmod_online.py
│   ├── test_strict_streaming_contract.py
│   ├── test_surprise_metric.py
│   ├── test_surprise_override.py
│   ├── test_teach_signal.py
│   ├── test_tied_weight_guard.py
│   ├── test_variants.py
│   ├── test_verify_docs_refs.py
│   └── test_verify_update_cadence.py
├── train.py
├── train_deepspeed.py
├── train_dist.py
└── train_fsdp.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links: []


================================================
FILE: .github/ISSUE_TEMPLATE/eval_request.md
================================================
---
name: Evaluation request
about: Propose a new benchmark or diagnostic to add
title: "[Eval] "
labels: ["evaluation", "needs-triage"]
assignees: []
---

## Motivation
Why is this evaluation important for HOPE/TITAN reproduction?

## Task details
- Dataset / benchmark:
- Metric(s):
- Expected runtime / hardware:

## Environment target
- OS:
- Python:
- Torch:
- Preferred backend (`cpu` / `cuda` / `mps` / `rocm`):

## Implementation sketch
Outline scripts/flags needed (e.g., extend `scripts/eval/zeroshot.py`).

## Acceptance criteria
Describe what needs to be captured (JSON fields, plots, etc.).


================================================
FILE: .github/ISSUE_TEMPLATE/faithfulness_gap.md
================================================
---
name: Faithfulness gap
about: Report deviations vs. the Nested Learning / HOPE specs
title: "[Faithfulness] "
labels: ["faithfulness", "needs-triage"]
assignees: []
---

## Summary
Describe the suspected deviation (cite paper section/equation).

## Evidence
- Config(s) / checkpoints affected
- Logs / screenshots / metrics
- Steps to reproduce

## Environment
- OS:
- Python:
- Torch:
- Backend (`cpu` / `cuda` / `mps` / `rocm`):
- GPU/accelerator model (if any):

If using ROCm: this project currently treats ROCm support as best-effort. Include HIP/ROCm version and exact torch build.

## Expected behavior
What should happen according to the paper?

## Additional context
Add any extra notes, e.g., suggested fix or related PRs.


================================================
FILE: .github/ISSUE_TEMPLATE/perf_regression.md
================================================
---
name: Performance regression
about: Report a training / eval performance drop vs. baseline
title: "[Perf] "
labels: ["performance", "needs-triage"]
assignees: []
---

## Summary
Describe the regression and the baseline you’re comparing against.

## Baseline
- Config / checkpoint:
- Metrics (loss / ppl / eval scores):

## Repro steps
Exact commands with overrides, plus hardware details.

## Environment
- OS:
- Python:
- Torch:
- Backend (`cpu` / `cuda` / `mps` / `rocm`):
- GPU/accelerator model (if any):

If using ROCm: this project currently treats ROCm support as best-effort. Include HIP/ROCm version and exact torch build.

## Logs / artifacts
Attach relevant logs, W&B links, or JSON eval files.

## Suspected cause
Optional theory or related commits/PRs.


================================================
FILE: .github/workflows/ci.yml
================================================
name: CI

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]

jobs:
  lint-and-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Sync dependencies
        run: uv sync --all-extras --dev

      - name: Ruff
        run: uv run ruff check .

      - name: Mypy
        run: uv run mypy src

      - name: Verify docs path references
        run: uv run python scripts/checks/verify_docs_refs.py

      - name: Verify README critical commands
        run: bash scripts/checks/check_readme_commands.sh

      - name: Guard tracked file sizes / artifact extensions
        run: bash scripts/checks/check_git_tracked_sizes.sh

      - name: Verify scripts/data help exits cleanly
        run: bash scripts/checks/check_data_script_help.sh

      - name: Pytest
        run: uv run pytest

  cross-platform-smoke:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: ["3.10", "3.12"]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Sync dependencies
        run: uv sync --dev

      - name: CLI help + doctor + smoke
        run: |
          uv run nl --help
          uv run nl doctor --json
          uv run nl smoke --config-name pilot_smoke --device cpu --batch-size 1 --seq-len 8
          uv run python -m nested_learning --help

  wheel-install-smoke:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Build wheel
        run: uv build

      - name: Install wheel in isolated venv
        run: |
          python -m venv /tmp/wheel-smoke
          /tmp/wheel-smoke/bin/python -m pip install --upgrade pip
          /tmp/wheel-smoke/bin/python -m pip install dist/*.whl

      - name: Verify wheel entrypoints outside repo configs
        run: |
          /tmp/wheel-smoke/bin/python -m nested_learning --help
          /tmp/wheel-smoke/bin/python -m nested_learning doctor --json
          /tmp/wheel-smoke/bin/python - <<'PY'
          import subprocess
          import sys
          import tempfile

          tmp = tempfile.mkdtemp(prefix="nl-wheel-smoke-")
          cmd = [
              sys.executable,
              "-m",
              "nested_learning",
              "smoke",
              "--config-name",
              "pilot_smoke",
              "--device",
              "cpu",
              "--batch-size",
              "1",
              "--seq-len",
              "8",
          ]
          subprocess.run(cmd, cwd=tmp, check=True)
          PY

  cpu-ddp-smoke:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Sync dependencies
        run: uv sync --all-extras --dev

      - name: CPU DDP smoke (gloo backend)
        run: bash scripts/run_cpu_ddp_smoke.sh

  passkey-smoke:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Sync dependencies
        run: uv sync --all-extras --dev

      - name: Run synthetic passkey memorization test
        run: bash scripts/tests/run_passkey_smoke.sh

  fidelity-subset:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Sync dependencies
        run: uv sync --all-extras --dev

      - name: Run fidelity subset + compliance report
        run: bash scripts/checks/run_fidelity_ci_subset.sh


================================================
FILE: .github/workflows/packages.yml
================================================
name: Packages

on:
  push:
    tags:
      - "v*"
  workflow_dispatch:

permissions:
  contents: read
  packages: write

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository_owner }}/nested-learning-dist

jobs:
  publish-ghcr:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Build source and wheel distributions
        run: uv build

      - name: Generate checksums
        run: |
          cd dist
          sha256sum * > SHA256SUMS.txt

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract OCI metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=raw,value=${{ github.ref_name }},enable=${{ github.event_name == 'push' }}
            type=raw,value=latest,enable=${{ github.event_name == 'push' && !contains(github.ref_name, 'rc') }}
            type=raw,value=edge,enable=${{ github.event_name == 'workflow_dispatch' }}
            type=sha,format=short,enable=${{ github.event_name == 'workflow_dispatch' }}

      - name: Build and publish GHCR package image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./docker/Dockerfile.dist
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}


================================================
FILE: .github/workflows/release.yml
================================================
name: Release

on:
  push:
    tags:
      - "v*"

permissions:
  contents: write
  id-token: write

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Build source and wheel distributions
        run: uv build

      - name: Twine check
        run: uvx twine check dist/*

      - name: Upload dist artifacts
        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/*

  publish-testpypi:
    if: contains(github.ref_name, 'rc')
    needs: build
    runs-on: ubuntu-latest
    environment:
      name: testpypi
      url: https://test.pypi.org/p/nested-learning
    steps:
      - name: Download dist artifacts
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist

      - name: Publish to TestPyPI via Trusted Publishing
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/
          packages-dir: dist/

  publish-pypi:
    if: ${{ !contains(github.ref_name, 'rc') }}
    needs: build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/nested-learning
    steps:
      - name: Download dist artifacts
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist

      - name: Publish to PyPI via Trusted Publishing
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/

  publish-github-release:
    needs:
      - build
      - publish-testpypi
      - publish-pypi
    if: |
      always() &&
      needs.build.result == 'success' &&
      (needs.publish-testpypi.result == 'success' || needs.publish-testpypi.result == 'skipped') &&
      (needs.publish-pypi.result == 'success' || needs.publish-pypi.result == 'skipped')
    runs-on: ubuntu-latest
    steps:
      - name: Download dist artifacts
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist

      - name: Generate checksums
        run: |
          cd dist
          sha256sum * > SHA256SUMS.txt

      - name: Build release preamble
        # The heredoc delimiter (EOF) is unquoted so ${GITHUB_REF_NAME} /
        # ${GITHUB_REPOSITORY} expand as intended — but that also means every
        # literal backtick must be escaped (\`), or bash treats it as command
        # substitution. Previously `.tar.gz` / `.whl` were unescaped and were
        # executed as (nonexistent) commands, leaving empty release-note lines.
        run: |
          cat > release_preamble.md <<EOF
          Package release for \`${GITHUB_REF_NAME}\`.

          Install:
          \`\`\`bash
          pip install nested-learning==${GITHUB_REF_NAME#v}
          \`\`\`

          Included assets:
          - source distribution (\`.tar.gz\`)
          - wheel (\`.whl\`)
          - \`SHA256SUMS.txt\` checksums

          For compatibility/support details:
          - https://github.com/${GITHUB_REPOSITORY}/blob/main/docs/COMPATIBILITY_MATRIX.md
          - https://github.com/${GITHUB_REPOSITORY}/blob/main/docs/VERSIONING_POLICY.md
          EOF

      - name: Publish GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          prerelease: ${{ contains(github.ref_name, 'rc') }}
          generate_release_notes: true
          body_path: release_preamble.md
          files: |
            dist/*


================================================
FILE: .github/workflows/security.yml
================================================
name: Security

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
  schedule:
    - cron: "0 6 * * 1"

jobs:
  dependency-audit:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "0.9.8"

      - name: Export requirements
        run: uv export --all-extras --dev --format requirements-txt --output-file /tmp/requirements.txt

      - name: pip-audit
        run: uvx pip-audit -r /tmp/requirements.txt
        continue-on-error: true

  secret-scan:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Gitleaks scan
        uses: gitleaks/gitleaks-action@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}



================================================
FILE: .gitignore
================================================
# Environment / tooling
.venv/
__pycache__/
*.pyc
.pytest_cache/
.ruff_cache/
.mypy_cache/

# Local artifacts
logs/
artifacts/
/data/
outputs/
checkpoints/
*.pt
train.log
train_dist.log
ref_repos/
configs/_tmp*
git.env
docs/POSTS.md
docs/EX_*.md
docs/CHECK_2_PLANNING_MODEL_REQUEST.md
docs/CHECK_2_PLANNING_MODEL_RESPONSE.md
docs/planner_check2_attachments.zip
docs/tmp/
docs_tmp/
wandb/
eval/*_ci.json

# Local paper scans / scratch references (keep tracked references separate)
google_papers/*_arXiv_v1.pdf
google_papers/*_arXiv_v1/
google_papers/Nested_Learning_Full_Paper.pdf
google_papers/Nested_Learning_Full_Paper/

# Editors
.DS_Store
.idea/
.vscode/


================================================
FILE: CHANGELOG.md
================================================
# Changelog

All notable changes to this project will be documented here. The format loosely follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and uses semantic versioning once tagged releases begin.

## [Unreleased]
### Added
- Optional attention KV-cache path for continuous streaming inference (`init_attention_cache`, `attention_cache`, `return_attention_cache`) across HOPE/TITAN/Transformer blocks.
- Boundary-target online chunking mode (`train.online_boundary_targets`) and optional training-time attention-cache carry (`train.online_carry_attention_cache`) for stronger chunk-boundary semantics.
- Evaluation streaming-state utilities (`src/nested_learning/eval_state.py`) plus continual-eval controls (`--eval-state-mode`, `--eval-use-fast-state`, `--eval-use-attention-cache`).
- Compliance report automation (`scripts/checks/compliance_report.py`) with CI subset + mechanism smoke integration.
- Flash/SDPA-backed self-attention path with safe fallbacks, unlocking PyTorch 2.9 SDPA kernels by default.
- Hydra toggles for bf16 autocast (`train.mixed_precision.*`), `torch.compile` (`train.compile.*`), and fused optimizers.
- Muon + AdamW hybrid optimizer option exposed via `optim.type=muon`, routing ≥2D matrices through `torch.optim.Muon`.
- Test-time memorization flags (`--memorize*`) documented in README + `docs/guide.md`, matching TITAN eval behavior.
- Automation helpers: `scripts/run_e2e_smoke.sh` documented in Quickstart, plus new `scripts/run_cpu_ddp_smoke.sh` for CPU-only DDP/gloo smoke coverage.
- Streaming contract doc (`docs/STREAMING_CONTRACT.md`) defining sequence/segment/chunk semantics and fast-state lifecycle.
- Cadence verification utility (`scripts/checks/verify_update_cadence.py`) with synthetic tests and release-checklist integration.
- Fidelity CI subset runner (`scripts/checks/run_fidelity_ci_subset.sh`) and mechanism-auditing smoke runner (`scripts/run_mechanism_audit_smoke.sh`).
- Progress/status docs for P7 execution (`docs/PLAN_PROGRESS_P7.md`, `docs/IMPLEMENTATION_STATUS.md`).
- Bug-report reproducibility checklist (`docs/BUG_REPORT_CHECKLIST.md`).
- Boundary-state training-loop regression coverage (`tests/test_boundary_state_training_loop.py`) plus eval-loader/metadata roundtrip coverage (`tests/test_checkpoint_metadata_and_eval_loaders.py`).
- `scripts/checks/check_data_script_help.sh` to guarantee `scripts/data/* --help` exits cleanly; wired into CI.
- Markdown anchor verification in `scripts/checks/verify_docs_refs.py` with dedicated unit coverage.
- Tag release automation now creates GitHub Release entries with attached wheel/sdist artifacts plus `SHA256SUMS.txt`.
- Added GHCR package publishing workflow (`.github/workflows/packages.yml`) so the Packages tab contains a versioned `nested-learning-dist` OCI bundle.

### Changed
- README / compliance / streaming docs now reflect boundary-target mode, optional KV-cache carry, and explicit scope boundaries.
- CPU DDP smoke now includes strict-mode fail-fast verification.
- Repository license metadata now matches the shipped Apache-2.0 text; badges updated accordingly.
- README and guide refreshed with performance knobs, optimizer guidance, and memorization instructions so release consumers have a single source of truth.
- Release checklist tracks the new CPU DDP smoke script to keep packaging instructions aligned with available tooling.
- Training loop strict-mode guardrails: `train.strict_streaming_contract` now fails fast on known semantics violations (DDP feature downgrades, shared-batch fast-state, non paper-defined variant in strict mode).
- CMS telemetry now includes cadence metrics (`updates_applied`, `tokens_flushed`, `pending_tokens`, `gate_hits`) to make update-frequency behavior auditable.
- Paper-auditing preset now explicitly enables strict streaming contract checks.
- `configs/pilot_paper_faithful.yaml` now explicitly sets `train.online_updates=true` and tests verify no implicit algorithm-mode fallback.
- Boundary-state mode now emits an explicit startup warning code (`experimental_boundary_state_mode`) and validates cache/chunk constraints early.
- Checkpoint metadata now records algorithm/online flags (`algorithm_mode`, `online_updates`, `online_boundary_targets`, `online_carry_attention_cache`, `use_fast_state`), and release manifest includes those flags.
- Data split fallback policy is deterministic across data scripts (`train -> validation -> test -> first available`) with explicit available-splits logging.

### Upcoming
- GitHub Actions workflow covering `ruff`, `mypy`, and `pytest`.
- End-to-end release dry-run ahead of the `v0.1.0` tag.

## [0.1.0] - 2025-11-09
### Added
- PyTorch **2.9.0** / torchvision **0.24.0** environment managed via `uv` with reproducible `pyproject.toml` + `uv.lock`.
- HOPE block implementation (attention → TITAN memory → CMS + deep optimizers) with configurable level clocks and self-modifier wiring.
- Hydrated Hydra config tree for pilot, mid, target, and CPU-only smoke runs plus DDP/FSDP/DeepSpeed entrypoints.
- Data tooling: tokenizer trainer, corpus filtering, mixture processing, and `scripts/data/run_sample.sh` shortcut emitting stats under `data/mixtures/`.
- Evaluation suite: zero-shot benchmark CLI (PIQA/HellaSwag/WinoGrande/ARC/BoolQ/SIQA), Needle-in-a-Haystack generator, continual-learning forgetting analyzer.
- Sample artifacts (`artifacts/examples/pilot_dummy.pt`, `logs/pilot_smoke.json`, `logs/mid_smoke.json`) for reproducing eval commands without lengthy training.
- Documentation set (`docs/stage1_plan.md`, `docs/stage2_plan.md`, `docs/data_pipeline.md`, `docs/guide.md`) outlining architecture, scaling strategy, and onboarding.

### Changed
- README rewritten with badges, quickstart commands, and references to the new guide + release checklist.
- Logging defaults clarified (`logging.backend=json|wandb`), with instructions for saving structured metrics under `logs/`.

### Known gaps
- Release automation and CI are tracked in `docs/release_plan.md`.
- Scaling guidance for >100 B token corpora pending additional storage + GPU availability.


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# Nested Learning Reproduction

![CI](https://github.com/kmccleary3301/nested_learning/actions/workflows/ci.yml/badge.svg)
![Security](https://github.com/kmccleary3301/nested_learning/actions/workflows/security.yml/badge.svg)
![Python](https://img.shields.io/badge/python-3.10%20to%203.12-blue)
![PyTorch](https://img.shields.io/badge/pytorch-2.9.0-red)
![License](https://img.shields.io/badge/license-Apache--2.0-green)
![Status](https://img.shields.io/badge/tests-smoke--ready-lightgrey)

Mechanism-level reproduction of Google's Nested Learning (HOPE) architecture (HOPE blocks, CMS, and Self‑Modifying TITANs), matching the quality bar set by lucidrains' TITAN reference while remaining fully open-source and `uv` managed.

Faithfulness scope (high level):
- ✅ HOPE / CMS / Self‑Modifying Titans update rules + wiring (mechanism-level)
- ✅ Tensor-level invariants covered by unit tests (teach-signal, δℓ, CMS chunking, causality)
- ✅ Boundary-target online chunking + optional attention-cache carry path are implemented
- ⚠️ Stable default uses stop-grad online writes; an experimental single-process boundary-state mode supports differentiable write paths
- ⚠️ Multi‑GPU mechanism-auditing online updates are not supported in this repo (DDP disables some features)

Paper reference pin:
- Source: `google_papers/Nested_Learning_Full_Paper/Nested_Learning_Full_Paper.md`
- SHA-256: `7524af0724ac8e3bad9163bf0e79c85b490a26bc30b92d96b0bdf17a27f9febc`

## Quickstart
```bash
uv python install 3.12
uv sync --all-extras
uv run nl doctor --json > logs/runtime_doctor.json
uv run bash scripts/data/run_sample.sh
uv run nl smoke --config-name pilot_smoke --device cpu
uv run bash scripts/run_smoke.sh pilot  # CPU-friendly HOPE block smoke test
uv run bash scripts/run_e2e_smoke.sh    # sync + sample data + smoke train + zeroshot eval
uv run bash scripts/run_mechanism_audit_smoke.sh
uv run python scripts/eval/zeroshot.py \
  --config configs/hope/pilot.yaml \
  --checkpoint artifacts/examples/pilot_dummy.pt \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --tasks piqa --max-samples 32 --device cpu
```

## Requirements
- Python 3.10-3.12
- PyTorch 2.9.x+ (golden environment in this repo uses 2.9.x)
- `uv` (recommended for development) or `pip` for package-style usage

## Compatibility
- Support tiers and OS/runtime matrix: `docs/COMPATIBILITY_MATRIX.md`
- Versioning/stability policy: `docs/VERSIONING_POLICY.md`
- Golden repro environment: Python 3.12 + `uv lock` + PyTorch 2.9.x

macOS / Apple Silicon expectations:
- Mac users can run install + CLI + eval/smoke workflows.
- `train.device=mps` is supported for small/local runs.
- Linux + CUDA remains the only Tier 1 full-training path in this repo.
- Cross-backend numerical parity (CUDA vs MPS) is not guaranteed.
- If MPS is unavailable, device selection falls back to CPU (`nl doctor --json` shows this clearly).

## Installation (pip-first)
1. Create and activate a virtual environment.
2. Install Torch first (CPU/CUDA wheel selection is backend-specific).
3. Install this project.

CPU example:
```bash
python -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip
python -m pip install "torch>=2.9,<3" --index-url https://download.pytorch.org/whl/cpu
python -m pip install -e .
```

CUDA example (adjust index URL to your CUDA runtime):
```bash
python -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip
python -m pip install "torch>=2.9,<3" --index-url https://download.pytorch.org/whl/cu128
python -m pip install -e .
```

## Setup (uv dev workflow)
```bash
uv python install 3.12
uv sync --all-extras
```

Developer checks:
- `uv run ruff check .`
- `uv run mypy src`
- `uv run pytest`
- `uv run bash scripts/checks/run_fidelity_ci_subset.sh`
- `uv run python scripts/checks/compliance_report.py --config configs/pilot.yaml --output eval/compliance_report.json`

## CLI
The package ships with `nl` for portable workflows across local/dev/prod environments.

```bash
# runtime compatibility snapshot
uv run nl doctor --json

# architecture/config smoke on chosen device
uv run nl smoke --config-name pilot_smoke --device cpu --batch-size 1 --seq-len 8

# static fidelity checks for a config
uv run nl audit --config-name pilot_paper_faithful

# train with Hydra overrides
uv run nl train --config-name pilot --override train.device=cuda:1 --override train.steps=100
```

`python -m nested_learning ...` is also supported.

## First 30 Minutes
Use this path for a fast first success on CPU:

```bash
uv sync --all-extras
uv run bash scripts/data/run_sample.sh
uv run bash scripts/run_smoke.sh pilot
uv run bash scripts/run_mechanism_audit_smoke.sh
```

This confirms:
- data/tokenizer pipeline is operational,
- model/training loop runs end-to-end,
- cadence checks pass for a mechanism-auditing smoke run.

## Data Pipeline
1. **Tokenizer training**
   ```bash
   uv run python scripts/data/train_tokenizer.py \
     --manifest configs/data/refinedweb_mixture.yaml \
     --vocab-size 32000 \
     --output-dir artifacts/tokenizer/refinedweb_mix \
     --log-file data/mixtures/refinedweb_mix_tokenizer.json
   ```
2. **Corpus filtering + sharding**
   ```bash
   uv run python scripts/data/process_mixture.py \
     configs/data/refinedweb_mixture_filtered.yaml \
     --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
     --log-file data/mixtures/refinedweb_mix_filtered_shards.json
   ```
3. **Sample pipeline** (downloads/licensed datasets, filters, shards, records stats)
   ```bash
   uv run bash scripts/data/run_sample.sh
   ```
4. **Full pipeline** (set env vars like `RW_LIMIT`, `WIKI_LIMIT`, etc. to scale ingestion)
   ```bash
   uv run bash scripts/data/run_full.sh  # default ~50k docs per corpus; increase limits as needed
   ```

### Data Troubleshooting
- If `scripts/data/run_sample.sh` cannot find `artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model`, rerun:
  ```bash
  uv run bash scripts/data/run_sample.sh
  ```
  The script auto-trains the tokenizer when missing.
- If `scripts/data/run_full.sh` fails with `Bad split: train. Available splits: ['test']`, use split fallback:
  ```bash
  FALLBACK_SPLIT=test uv run bash scripts/data/run_full.sh
  ```
  You can also override per-corpus splits (for example `RW_SPLIT=test`).

## Training
- Single GPU / CPU:
  ```bash
  uv run nl train --config-name pilot_smoke
  ```
- Apple Silicon (MPS, if available):
  ```bash
  uv run nl train --config-name pilot_smoke --override train.device=mps
  ```
  Use this path for smoke and small local runs; long/full-scale paper-regime runs are not a supported Mac target in this repository.
- Script-based entrypoint (legacy-compatible):
  ```bash
  uv run python train.py --config-name pilot_smoke
  ```
- DDP (torchrun):
  ```bash
  torchrun --nproc_per_node=2 train_dist.py --config-name mid
  ```
- CPU-only DDP smoke (verifies `gloo` backend and deterministic seeding):
  ```bash
  uv run bash scripts/run_cpu_ddp_smoke.sh
  ```
- FSDP (see `docs/FSDP_SCALING_GUIDE.md` for VRAM/batch sizing):
  ```bash
  # 760M run
  torchrun --nproc_per_node=2 train_fsdp.py --config-name hope/mid_fsdp
  # 1.3B run
  torchrun --nproc_per_node=2 train_fsdp.py --config-name hope/target_fsdp
  ```
- DeepSpeed (requires `deepspeed` installed separately):
  ```bash
  deepspeed --num_gpus=2 train_deepspeed.py --config-name target \
    deepspeed.config=configs/deepspeed/zero3.json
  ```

### Mechanism-auditing presets (HOPE / Nested Learning)

Use the mechanism-auditing preset configs (single GPU):

```bash
uv run python train.py --config-name pilot_paper_faithful
# HOPE self-mod variant:
uv run python train.py --config-name pilot_selfmod_paper_faithful
```

Notes:
- These presets set `data.batch_size=1` to avoid cross-sample fast-memory sharing.
- Online chunking supports one-token overlap **or** explicit boundary-target mode (`train.online_boundary_targets=true`).
- Optional attention-state carry across chunks is available in training via `train.online_carry_attention_cache=true`.
- The exact sequence/segment/chunk/buffer semantics are documented in `docs/STREAMING_CONTRACT.md`.

Overrides:
- `optim.type=m3` (paper optimizer option)
- `train.steps=...` / `train.device=...`

See `docs/PAPER_COMPLIANCE.md` for full fidelity notes.
See `docs/STREAMING_CONTRACT.md` for the precise streaming/update contract used by this repo.

## Scope Boundaries (Current)
- This repo targets mechanism-auditing fidelity, not full paper-scale results parity.
- Boundary-state gradient-through-write exists as an experimental constrained path; it is not yet treated as production/full-scale paper reproduction.
- Distributed mechanism-auditing path for boundary-target + attention-cache carry is not implemented.

### Pilot (3 B tokens) workflow
1. Ensure TMUX session:
   ```bash
   tmux new -s pilot_train
   ```
2. Launch the long run on `cuda:1` (≈52 h wall clock):
   ```bash
   set -a && source git.env && set +a
   export UV_CACHE_DIR=/tmp/uv-cache UV_LINK_MODE=copy
   uv run python train.py --config-name pilot \
     logging.enabled=true logging.backend=wandb \
     logging.project=nested-learning logging.run_name=pilot-main-$(date +%Y%m%d%H%M%S) \
     train.device=cuda:1
   ```
3. Checkpoints appear in `artifacts/checkpoints/pilot/step_*.pt` every 1 000 steps; the accompanying W&B run captures full telemetry.
4. Copy the final checkpoint, config, logs, and eval JSON/CSV into `artifacts/pilot_release/` for distribution.

## Logging
Set `logging.enabled=true` in Hydra configs (or override via CLI) to send metrics to W&B (default). For local JSON logs, use `logging.backend=json logging.path=logs/run.json`. Sample outputs reside in `logs/` and `artifacts/examples/`.

## Evaluation
- Zero-shot:
  ```bash
  uv run python scripts/eval/zeroshot.py \
    --config configs/hope/mid.yaml \
    --checkpoint checkpoints/mid/step_000100.pt \
    --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
    --tasks all --max-samples 200 --device cuda:0
  ```
  Use `uv run python scripts/eval/zeroshot.py --list-tasks` to display the full benchmark roster (PIQA, HellaSwag, WinoGrande, ARC-E/C, BoolQ, SIQA, CommonsenseQA, OpenBookQA). See `docs/zeroshot_eval.md` for details.
- Needle-in-a-Haystack:
  ```bash
  uv run python scripts/eval/niah.py \
    --config configs/hope/mid.yaml \
    --checkpoint checkpoints/mid/step_000100.pt \
    --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
    --context-lengths 2048 4096 8192 --samples-per-length 20
  ```
- Continual-learning forgetting:
  ```bash
  uv run python scripts/eval/continual.py \
    --config configs/hope/mid.yaml \
    --checkpoints checkpoints/mid/step_000050.pt checkpoints/mid/step_000100.pt \
    --segments-yaml configs/data/continual_segments_sample.yaml \
    --batch-size 4 --max-batches 10 --memorize --memorize-steps 2
  ```
  Plot forgetting curves via `uv run python scripts/eval/plot_forgetting.py --continual-json eval/continual_mid.json`.
- Long-context diagnostics:
  ```bash
  uv run python scripts/eval/passkey.py --config configs/hope/pilot.yaml --checkpoint artifacts/checkpoints/pilot/step_230000.pt \
    --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model --samples 64 --memorize

  uv run python scripts/eval/pg19_perplexity.py --config configs/hope/pilot.yaml --checkpoint artifacts/checkpoints/pilot/step_230000.pt \
    --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model --max-samples 64
  ```

Evaluation summaries are written to `eval/` alongside per-task JSON metrics.

### Test-time memorization toggles
Every evaluator supports TITAN-style memorization so you can reproduce test-time adaptation:
```bash
uv run python scripts/eval/zeroshot.py \
  ... \
  --memorize \
  --memorize-steps 2 \
  --memorize-use-correct-answer \
  --memorize-no-reset  # optional: retain updates across samples
  --memorize-paths titan,cms_fast \
  --memorize-surprise-threshold 0.01
```
- `--memorize` turns on the learner with one LMS step per example by default.
- `--memorize-steps` controls the number of adaptation passes per prompt.
- `--memorize-use-correct-answer` injects ground-truth text during memorization for ablations.
- `--memorize-no-reset` carries memories across samples; omit it to reset every question.
- `--memorize-paths` restricts which levels receive teach-signal updates (`titan`, `cms_fast`, or `all`).
- `--memorize-surprise-threshold` gates updates on average teach-signal norm, matching the paper’s surprise trigger.

Memorization metrics (baseline vs adaptive) are emitted alongside task accuracy for easy comparisons.

## Architecture variants
Select the paper-defined variant via `model.block_variant` in Hydra configs:
- `hope_attention` (paper HOPE-Attention): `Attention → CMS` (paper-defined).
- `hope_selfmod` (paper HOPE scaffold): `Self-modifying Titans (Eqs. 83–93; Eq. 91 residual MLP memories) → CMS` with (by default) **fixed q** and **local conv window=4**, plus chunked updates via `model.self_mod_chunk_size` (others) and `model.self_mod_chunk_size_memory` (M_memory). See `docs/PAPER_COMPLIANCE.md` for the “differentiable read / update-pass writes” semantics.
- `hope_hybrid` (legacy): `Attention + TitanMemory + CMS` (exploratory; not paper-defined).
- `transformer` (baseline): `Attention → MLP` (no TITAN/CMS learning updates; useful for Phase 2 comparisons).

Self-modifying Titans knobs (ablation-friendly, paper-aligned):
- `model.self_mod_objective` (`l2` vs `dot`), `model.self_mod_use_rank1_precond` (DGD-like preconditioner), `model.self_mod_use_alpha` (weight-decay/retention gate), `model.self_mod_stopgrad_vhat`, `model.self_mod_momentum`, `model.self_mod_adaptive_q`, `model.self_mod_local_conv_window`.

## Fast state (Nested Learning semantics)
In-context updates can run against a per-context fast state so meta parameters never change:
- `HOPEModel.init_fast_state()` / `TitanOnlyModel.init_fast_state()` returns a `ModelFastState`.
- `MemorizeConfig.use_fast_state=true` (default) requires passing `fast_state` into `memorize_tokens()` / `memorize_sequence()`; evaluation scripts handle this automatically.
- Training can also run update passes against a per-batch fast state via `train.use_fast_state=true` (meta+delta fast state: meta params are learnable; online updates write deltas only). If `data.batch_size>1`, CMS/TITAN fast state is shared across the batch; use `data.batch_size=1` for strict per-context semantics. See `docs/PAPER_COMPLIANCE.md`.

## Releases
Before tagging or announcing a new checkpoint, work through:
- `docs/release_checklist.md` (model/eval artifact release bundle)
- `docs/PACKAGE_RELEASE_CHECKLIST.md` (package/GitHub/PyPI release flow)
- `docs/PYPI_TRUSTED_PUBLISHING.md` (one-time OIDC setup for TestPyPI/PyPI)

Tag pushes (`v*`) automatically publish:
- PyPI/TestPyPI package artifacts (via Trusted Publishing), and
- a GitHub Release entry with wheel, sdist, and `SHA256SUMS.txt` in the Releases tab.
- a GitHub Packages (GHCR) OCI bundle (`nested-learning-dist`) containing `dist/*`.

GitHub Packages note:
- The repo publishes an OCI artifact bundle to GHCR (shown under the Packages tab), not a Python package registry endpoint.
- Python installs should still use PyPI (`pip install nested-learning`).

Example (pull/extract dist artifacts from GHCR):
```bash
docker pull ghcr.io/kmccleary3301/nested-learning-dist:latest
cid=$(docker create ghcr.io/kmccleary3301/nested-learning-dist:latest)
docker cp "$cid:/dist" ./dist_from_ghcr
docker rm "$cid"
```

For versioning semantics and breaking-change expectations, see `docs/VERSIONING_POLICY.md`.

For reproducibility bug reports, use `docs/BUG_REPORT_CHECKLIST.md`.

## Performance & optimizer options
- **Mixed precision:** enable bf16 autocast via `train.mixed_precision.enabled=true train.mixed_precision.dtype=bf16` (already enabled in pilot/mid/target configs).
- **`torch.compile`:** accelerate attention/core loops by toggling `train.compile.enable=true train.compile.mode=max-autotune`; failure falls back to eager unless `train.compile.strict=true`.
- **Muon hybrid (default):** all HOPE configs now set `optim.type=muon`, routing ≥2D tensors through PyTorch 2.9's Muon optimizer while embeddings/norms stay on AdamW. Training logs emit `optim.muon_param_elems` / `optim.adamw_param_elems` so you can confirm the split.
- **Fused AdamW fallback:** override with `optim.type=adamw optim.fused=auto` if Muon is unavailable or if you want to compare against the AdamW ablation in `reports/ablations.md`.
- **Surprise gating:** set `model.surprise_threshold=<float>` to gate all inner updates. By default the surprise metric is the average L2 norm of the (scaled/clipped) teach signal (`model.surprise_metric=l2`); you can also use `loss` or `logit_entropy` for ablations. Evaluation CLIs expose `--memorize-surprise-threshold` for ad-hoc gating.

All Hydra knobs can be overridden from the CLI or composed via config groups (`configs/hope/*.yaml`). Use these flags in tandem with `scripts/run_e2e_smoke.sh` (automation) or `scripts/run_cpu_ddp_smoke.sh` (CPU-only determinism check) to validate releases quickly.

## Documentation & References
- `docs/IMPLEMENTATION_STATUS.md` – current mechanism-level status matrix.
- `docs/PAPER_COMPLIANCE.md` – equation-to-code fidelity notes and explicit boundaries.
- `docs/STREAMING_CONTRACT.md` – exact sequence/segment/chunk/update semantics.
- `docs/release_checklist.md` – release readiness checklist.
- `docs/data_pipeline.md` – large-scale sharding/tokenizer workflow.
- `docs/scaling_guidance.md` – roadmap for expanding data + compute footprints.
- `docs/stage2_plan.md` – Stage 2 architecture + experiment roadmap.
- `docs/PHASE_2_PLAN.md` – detailed Phase 2 execution plan.
- `docs/stage2_progress.md` – progress tracker for the latest faithfulness remediation sprint.
- `docs/experiments_report.md` – draft paper covering completed experiments.
- `docs/future_directions.md` – prioritized roadmap after the initial release.
- `reports/stage2_smoke.md` – exact commands/artifacts for the release-ready smoke workflow.
- `docs/FSDP_SCALING_GUIDE.md` – dual-RTX 6000 Ada instructions for the mid/target FSDP configs.
- `google_papers/` – PDFs/markdown of Nested Learning & TITAN papers.
- `CHANGELOG.md` – user-facing changes per release.

## Contributing
1. Run formatting/tests (`uv run ruff check .`, `uv run pytest`).
2. Document new configs or scripts in the relevant docs under `docs/` and update `CHANGELOG.md`.
3. Open a PR referencing the relevant NL/TITAN spec sections and tests.


================================================
FILE: TODO.md
================================================
# Project TODOs

## Planner Finalization – P0 Foundation
- [x] Add first-class package CLI (`nl`) with `doctor`, `smoke`, `train`, and `audit` commands.
- [x] Support module entrypoint (`python -m nested_learning`).
- [x] Register CLI script in `pyproject.toml` for pip/uv installs.
- [x] Implement runtime capability detection and JSON doctor output.
- [x] Add cross-platform smoke tests for CLI/config composition.
- [x] Validate with lint + mypy + full pytest.

## Planner Finalization – P1 Distribution/CI
- [x] Relax package compatibility ranges (`python>=3.10`, `torch>=2.9,<3`) while keeping lockfile golden env.
- [x] Split optional dependencies into extras (`gpu`, `logging`, `viz`) for lighter base installs.
- [x] Add compatibility/support-tier documentation (`docs/COMPATIBILITY_MATRIX.md`).
- [x] Add versioning/stability policy (`docs/VERSIONING_POLICY.md`).
- [x] Add package release checklist (`docs/PACKAGE_RELEASE_CHECKLIST.md`).
- [x] Expand CI with cross-platform smoke and wheel-install smoke lanes.
- [x] Add release automation workflow (`.github/workflows/release.yml`) for tag-based TestPyPI/PyPI publish.
- [x] Update README to pip-first install + compatibility/versioning links + CLI usage.

## Stage 2 – Results Reproduction
- [ ] **Data Engineering**
  - [ ] Acquire RefinedWeb + supplement corpora under `data/raw/`.
  - [x] Implement filtering/dedup scripts (language ID, length bounds).
  - [x] Run `scripts/data/train_tokenizer.py` on combined corpus and store tokenizer artifacts.
  - [x] Shard each corpus component with `scripts/data/process_mixture.py`; log mixture stats.
  - [x] Automate `sample` and `full` pipelines via `scripts/data/run_sample.sh` / `scripts/data/run_full.sh`.
- [ ] **Infrastructure & Configs**
  - [x] Build Hydra config tree (`configs/hope/`) for pilot/mid/target, including optimizer + level schedules.
  - [x] Integrate logging (W&B/MLflow) hooks into training loop and configs.
  - [x] Provide DeepSpeed + FSDP launcher scripts with resume support.
  - [x] Add CI workflow (`.github/workflows/ci.yml`) for lint/type/tests via `uv`.
- [ ] **Scaling Training**
  - [x] Run pilot (160 M, 3 B tokens) to validate pipeline + self-mod updates. *(Step 230 k packaged 13 Nov; resume after TITAN baseline catches up.)*
  - [ ] Scale to 760 M / 30 B tokens; capture checkpoints + metrics. *(100-step mid run stable; longer runs waiting on teach-scale tuning + compute.)*
  - [ ] Execute 1.3 B / 100 B training with long-context curriculum.
- [ ] **Evaluation Harness**
  - [x] Implement `scripts/eval/zeroshot.py` scaffolding (PIQA baseline).
  - [x] Extend zero-shot harness to cover PIQA/HellaSwag/WinoGrande/ARC-E/C/BoolQ/SIQA/CommonsenseQA/OpenBookQA and document usage.
  - [x] Build NIAH long-context scaffolding script (`scripts/eval/niah.py`).
  - [x] Add continual-learning scripts measuring forgetting over streaming domains.
  - [x] Capture Stage 2 eval packs (zeroshot/NIAH/continual) from pilot checkpoints once stable (step 230 k release).
- [ ] **Ablations & Analysis**
  - [x] Run teach-scale sweep (0.05/0.10/0.15) on pilot checkpoints. *(0.05 & 0.15 short + 25 k long runs logged; see `logs/pilot-teach05-20251114010549.json` and `logs/pilot-teach15-long-20251114185448.json`.)*
  - [x] Run self-modifier off/on comparison at pilot scale.
  - [ ] Test CMS depth variations and optimizer variants.
  - [ ] Compare attention backbones (full vs. sliding vs. DeltaNet).
- [ ] **Baseline Monitoring**
  - [x] Finish TITAN long run (25 k steps, `cuda:0`, TMPDIR `/mnt/drive_4/tmp_titan`) and mirror HOPE packaging/eval workflow.
- [ ] **Documentation & Release**
  - [ ] Maintain experiment logs under `reports/`.
  - [ ] Publish data pipeline instructions + provenance for each corpus.
  - [ ] Summarize final metrics vs. baselines in Stage 2 report.

## Immediate Sprint Focus (Nov 15)
- [x] Design CMS sparse-chunk ablation config that stays within 49 GB (dim 384, seq 1024, batch 2, update periods 8/32/128/512).
- [x] Run CMS sparse-chunk experiment, package checkpoint (`artifacts/checkpoints/pilot_cms_sparse/step_005000.pt`), and produce evals (`eval/*_pilot_cms_sparse_step5000.json`).
- [x] Launch optimizer ablation comparing Muon hybrid vs fused AdamW on pilot-scale smoke (5–10 k steps) and archive eval metrics.
- [x] Roll the new CMS + optimizer findings into `reports/ablations.md`, `docs/stage2_progress.md`, and outline the resulting Stage 2 training plan updates.

## Planner Follow-up (P2)
- [x] Manifest validation report (`scripts/data/validate_mixture.py`) + token overlap stats.
- [x] Tokenizer coverage JSON via `scripts/data/check_tokenizer_coverage.py` + regression guard (`scripts/checks/tokenizer_coverage_guard.py`).
- [x] Extend eval suite with passkey, PG-19, and continual forgetting plots (see `scripts/eval/run_pilot_suite.sh` + `reports/plots/` output).
- [x] Generate long-context/continual eval artifacts for pilot & TITAN checkpoints (`eval/passkey_*`, `eval/pg19_*`, `eval/continual_*`).
- [x] Fill checkpoint reports (`reports/checkpoints/pilot_step230000.md`, `.../titan_step25000.md`, `.../pilot_teach05_long.md`, CMS variants, optimizer ablations, self-mod off).
- [x] Run the same reporting workflow for future checkpoints (teach15 long, CMS sparse/no chunk, optimizer ablations) before publishing.

## Planner Follow-up (P1)
- [x] Make Muon the default outer optimizer (pilot/mid/target configs), log Muon vs AdamW param counts, and confirm bf16/SDPA/compile flags in training logs.
- [x] Finalize FSDP/ZeRO configs for 760 M / 1.3 B (with grad checkpointing + VRAM notes) and document usage.
- [x] Implement atomic checkpoint sidecars (SHA256, RNG state, tokenizer hash) plus a strict `scripts/checkpoint/verify.py`.
- [x] Extend CI with CPU DDP determinism smoke + synthetic passkey memorization test.

## Stage 2 – Execution Sprint (Nov 17)
- [x] Relaunch HOPE pilot run on `cuda:1` (Muon + surprise gating) and produce fresh checkpoints/logs.
  - Status (Jan 9): relaunch stopped at `artifacts/checkpoints/pilot_relaunch/step_477000.pt` and verified via `scripts/checkpoint/verify.py`.
- [x] Package the new pilot checkpoint via `scripts/package_pilot_release.sh` and rerun the full eval suite (zeroshot/NIAH/continual/passkey/PG19) with memorize path/threshold metadata.
  - Done: `reports/checkpoints/pilot_relaunch_step477000.md` + `eval/*_pilot.json` and refreshed `artifacts/pilot_release/`.
- [x] Restart TITAN long baseline, mirror the eval suite, and record surprise gating stats.
  - Status (Jan 9): packaged + evaluated `artifacts/checkpoints/mid_titan_long/step_032000.pt` (see `reports/checkpoints/titan_long_step32000.md` and `eval/*_titan.json`).
- [ ] Run the mid-scale FSDP config (`configs/hope/mid_fsdp.yaml`), monitor VRAM, and archive checkpoints/logs.
  - Status (Jan 10): 2×GPU FSDP smoke runs (synthetic) complete, including update pass and checkpoint saving (FSDP ranks now all participate in FULL_STATE_DICT gathering).
- [x] Update `reports/checkpoints/` + `reports/ablations.md` with the new HOPE/TITAN results (include memorize paths/surprise thresholds).
- [x] Refresh `docs/stage2_progress.md`, `docs/experiments_report.md`, and `docs/stage2_plan.md` with the latest execution status and next scaling steps.


================================================
FILE: configs/ablations/cms_sparse.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  dim: 384
  num_layers: 8
  heads: 6
  titan_level:
    name: titan
    update_period: 8
    optimizer_key: titan_opt
  cms_hidden_multiplier: 2
  cms_levels:
    - name: cms_fast
      update_period: 8
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 32
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 128
      optimizer_key: cms_opt
    - name: cms_ultra
      update_period: 512
      optimizer_key: cms_opt

data:
  seq_len: 1024
  batch_size: 2
  num_workers: 2

train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  steps: 5000
  device: "cuda:1"
  checkpoint:
    dir: artifacts/checkpoints/pilot_cms_sparse
    save_interval: 1000
  log_interval: 25

logging:
  # NOTE(review): unlike the sibling ablation configs, enabled/backend are not set here —
  # presumably inherited from the /pilot defaults; verify JSON logging is actually active.
  path: logs/pilot_cms_sparse_metrics.json
  run_name: pilot-cms-sparse


================================================
FILE: configs/ablations/selfmod_chunked_8_64.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: hope_selfmod
  self_mod_chunk_size: 8
  self_mod_chunk_size_memory: 64

train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  steps: 5000
  device: "cuda:1"
  checkpoint:
    dir: artifacts/checkpoints/pilot_selfmod_chunked_8_64
    save_interval: 1000

logging:
  enabled: true
  backend: json
  path: logs/pilot_selfmod_chunked_8_64_metrics.json
  run_name: pilot-selfmod-chunked-8-64


================================================
FILE: configs/ablations/selfmod_momentum_off.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: hope_selfmod
  self_mod_momentum: 0.0

train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  steps: 5000
  device: "cuda:1"
  checkpoint:
    dir: artifacts/checkpoints/pilot_selfmod_momentum_off
    save_interval: 1000

logging:
  enabled: true
  backend: json
  path: logs/pilot_selfmod_momentum_off_metrics.json
  run_name: pilot-selfmod-momentum-off


================================================
FILE: configs/ablations/selfmod_momentum_on.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: hope_selfmod
  self_mod_momentum: 0.9

train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  steps: 5000
  device: "cuda:1"
  checkpoint:
    dir: artifacts/checkpoints/pilot_selfmod_momentum_on
    save_interval: 1000

logging:
  enabled: true
  backend: json
  path: logs/pilot_selfmod_momentum_on_metrics.json
  run_name: pilot-selfmod-momentum-on


================================================
FILE: configs/ablations/selfmod_no_alpha.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: hope_selfmod
  self_mod_use_alpha: false

train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  steps: 5000
  device: "cuda:1"
  checkpoint:
    dir: artifacts/checkpoints/pilot_selfmod_no_alpha
    save_interval: 1000

logging:
  enabled: true
  backend: json
  path: logs/pilot_selfmod_no_alpha_metrics.json
  run_name: pilot-selfmod-no-alpha


================================================
FILE: configs/ablations/selfmod_no_cms.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: hope_selfmod
  cms_levels: []

train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  steps: 5000
  device: "cuda:1"
  checkpoint:
    dir: artifacts/checkpoints/pilot_selfmod_no_cms
    save_interval: 1000

logging:
  enabled: true
  backend: json
  path: logs/pilot_selfmod_no_cms_metrics.json
  run_name: pilot-selfmod-no-cms


================================================
FILE: configs/ablations/selfmod_rank1_precond_off.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: hope_selfmod
  self_mod_use_rank1_precond: false

train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  steps: 5000
  device: "cuda:1"
  checkpoint:
    dir: artifacts/checkpoints/pilot_selfmod_rank1_off
    save_interval: 1000

logging:
  enabled: true
  backend: json
  path: logs/pilot_selfmod_rank1_off_metrics.json
  run_name: pilot-selfmod-rank1-off


================================================
FILE: configs/data/continual_segments_sample.yaml
================================================
segments:
  - name: refinedweb_2018  # NOTE(review): name implies a 2018 slice, but shards_dir points at the generic sample shards — confirm
    shards_dir: data/shards/refinedweb_sample
  - name: wikipedia_sample
    shards_dir: data/shards/wikipedia_sample
  - name: c4_sample
    shards_dir: data/shards/c4_sample
  - name: redpajama_sample
    shards_dir: data/shards/redpajama_sample


================================================
FILE: configs/data/fineweb_edu_longdoc_filtered_sample.yaml
================================================
name: fineweb_edu_longdoc_filtered_sample
tokenizer_output_dir: artifacts/tokenizer/fineweb_edu_longdoc
datasets:
  - name: fineweb_edu_longdoc
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/fineweb_edu_longdoc_en_sample.txt
    sample_limit: 5000
    seq_len: 4096
    sequences_per_shard: 1024
    output_dir: data/shards/fineweb_edu_longdoc_sample
    max_records: null



================================================
FILE: configs/data/fineweb_edu_mixture_full.yaml
================================================
name: fineweb_edu_full
tokenizer_output_dir: artifacts/tokenizer/fineweb_edu
datasets:
  - name: fineweb_edu
    dataset: HuggingFaceFW/fineweb-edu
    subset: sample-100BT
    split: train
    text_column: text
    sample_limit: 100000
    seq_len: 4096
    sequences_per_shard: 1024
    output_dir: data/shards/fineweb_edu_full
    max_records: null



================================================
FILE: configs/data/fineweb_edu_mixture_sample.yaml
================================================
name: fineweb_edu_sample
tokenizer_output_dir: artifacts/tokenizer/fineweb_edu
datasets:
  - name: fineweb_edu
    dataset: HuggingFaceFW/fineweb-edu
    subset: sample-10BT
    split: train
    text_column: text
    sample_limit: 5000
    seq_len: 2048
    sequences_per_shard: 1024
    output_dir: data/shards/fineweb_edu_sample
    max_records: 10000



================================================
FILE: configs/data/refinedweb_mixture.yaml
================================================
name: refinedweb_mix_v1
tokenizer_output_dir: artifacts/tokenizer/refinedweb_mix
datasets:
  - name: refinedweb
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/refinedweb_en_full.txt
    seq_len: 2048
    sequences_per_shard: 2048
    output_dir: data/shards/refinedweb
    max_records: null
  - name: books  # NOTE(review): despite the name, this source reads Wikipedia text (see data_files below) — confirm the "books" label is intentional
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/wikipedia_en_full.txt
    seq_len: 2048
    sequences_per_shard: 2048
    output_dir: data/shards/wikipedia
    max_records: null
  - name: c4
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/c4_en_full.txt
    seq_len: 2048
    sequences_per_shard: 2048
    output_dir: data/shards/c4
    max_records: null
  - name: redpajama
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/redpajama_en_full.txt
    seq_len: 2048
    sequences_per_shard: 2048
    output_dir: data/shards/redpajama
    max_records: null
  - name: code
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/code_en_full.txt
    seq_len: 2048
    sequences_per_shard: 2048
    output_dir: data/shards/code
    max_records: null


================================================
FILE: configs/data/refinedweb_mixture_filtered.yaml
================================================
name: refinedweb_mix_filtered
tokenizer_output_dir: artifacts/tokenizer/refinedweb_mix
datasets:
  - name: refinedweb
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/refinedweb_en_sample.txt
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/refinedweb_filtered
    max_records: null
  - name: wikipedia
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/wikipedia_en_sample.txt
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/wikipedia_filtered
    max_records: null
  - name: c4
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/c4_en_sample.txt
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/c4_filtered
    max_records: null
  - name: redpajama
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/redpajama_en_sample.txt
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/redpajama_filtered
    max_records: null
  - name: code
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/code_en_sample.txt
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/code_filtered
    max_records: null


================================================
FILE: configs/data/refinedweb_mixture_full.yaml
================================================
name: refinedweb_mix_full
tokenizer_output_dir: artifacts/tokenizer/refinedweb_mix
datasets:
  - name: refinedweb
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/refinedweb_en_full.txt
    seq_len: 2048
    sequences_per_shard: 1024
    output_dir: data/shards/refinedweb_full
    max_records: null
  - name: wikipedia
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/wikipedia_en_full.txt
    seq_len: 2048
    sequences_per_shard: 1024
    output_dir: data/shards/wikipedia_full
    max_records: null
  - name: c4
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/c4_en_full.txt
    seq_len: 2048
    sequences_per_shard: 1024
    output_dir: data/shards/c4_full
    max_records: null
  - name: redpajama
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/redpajama_en_full.txt
    seq_len: 2048
    sequences_per_shard: 1024
    output_dir: data/shards/redpajama_full
    max_records: null
  - name: code
    dataset: text
    split: train
    text_column: text
    data_files: data/filtered/code_en_full.txt
    seq_len: 2048
    sequences_per_shard: 1024
    output_dir: data/shards/code_full
    max_records: null


================================================
FILE: configs/data/refinedweb_mixture_sample.yaml
================================================
name: refinedweb_mix_sample
tokenizer_output_dir: artifacts/tokenizer/refinedweb_mix
datasets:
  - name: refinedweb
    dataset: HuggingFaceFW/fineweb  # NOTE(review): FineWeb appears to stand in for RefinedWeb in the sample pipeline — confirm
    subset: sample-10BT
    split: train
    text_column: text
    sample_limit: 5000
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/refinedweb_sample
    max_records: 10000
  - name: books  # NOTE(review): despite the name, this sample source pulls wikimedia/wikipedia — confirm the "books" label is intentional
    dataset: wikimedia/wikipedia
    subset: 20231101.en
    split: train
    text_column: text
    sample_limit: 2000
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/wikipedia_sample
    max_records: 5000
  - name: c4
    dataset: allenai/c4
    subset: en
    split: train
    text_column: text
    sample_limit: 2000
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/c4_sample
    max_records: 4000
  - name: redpajama
    dataset: cerebras/SlimPajama-627B
    split: train
    text_column: text
    sample_limit: 2000
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/redpajama_sample
    max_records: 4000
  - name: code
    dataset: codeparrot/codeparrot-clean-train
    split: train
    text_column: content
    sample_limit: 2000
    seq_len: 512
    sequences_per_shard: 512
    output_dir: data/shards/code_sample
    max_records: 4000


================================================
FILE: configs/deepspeed/zero3.json
================================================
{
  "bf16": {
    "enabled": true
  },
  "train_batch_size": 64,
  "gradient_accumulation_steps": 1,
  "zero_optimization": {
    "stage": 3,
    "reduce_bucket_size": 50000000,
    "stage3_param_persistence_threshold": 100000,
    "stage3_prefetch_bucket_size": 50000000
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 0.0002,
      "betas": [
        0.9,
        0.95
      ],
      "eps": 1e-08,
      "weight_decay": 0.01
    }
  }
}


================================================
FILE: configs/hope/mid.yaml
================================================
defaults:
  - _self_

hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false

model:
  vocab_size: 32000
  dim: 1024
  num_layers: 24
  heads: 16
  surprise_threshold: null
  freeze_backbone: false
  titan_level:
    name: titan
    update_period: 16
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 32
      optimizer_key: cms_opt
    - name: cms_ultra
      update_period: 128
      optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 8.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond
    cms_opt:
      type: deep_momentum
      lr: 4.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond

data:
  source: mixture
  batch_size: 16
  num_workers: 4
  mixture:
    samples_per_epoch: 8192
    seed: 42
    sources:
      - name: refinedweb
        shards_dir: data/shards/refinedweb_full
        weight: 0.4
      - name: wikipedia
        shards_dir: data/shards/wikipedia_full
        weight: 0.2
      - name: c4
        shards_dir: data/shards/c4_full
        weight: 0.15
      - name: redpajama
        shards_dir: data/shards/redpajama_full
        weight: 0.15
      - name: code
        shards_dir: data/shards/code_full
        weight: 0.1

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 100
  log_interval: 10
  device: "cuda:1"
  seed: 808
  deterministic: false
  step_offset: 0
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: true
    mode: max-autotune
  fsdp:
    auto_wrap_min_params: 2000000
    cpu_offload: false
  checkpoint:
    enable: true
    dir: checkpoints/mid
    save_interval: 50
    resume_path: null
    resume_tag: null

optim:
  type: muon
  lr: 2.0e-4
  weight_decay: 0.02
  momentum: 0.95
  betas:
    - 0.9
    - 0.999

logging:
  enabled: false
  backend: wandb
  project: nested-learning
  run_name: mid-${now:%Y%m%d%H%M%S}
  path: logs/mid_metrics.json


================================================
FILE: configs/hope/mid_fsdp.yaml
================================================
defaults:
  - mid
  - _self_

model:
  gradient_checkpointing: true

data:
  batch_size: 8  # per-rank micro-batch for 2× RTX 6000 Ada
  num_workers: 6

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 250000
  log_interval: 20
  device: "cuda"
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: false
  fsdp:
    auto_wrap_min_params: 2000000
    cpu_offload: false
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/mid_fsdp
    save_interval: 1000
    resume_path: null
    resume_tag: null

optim:
  type: muon
  lr: 2.0e-4
  weight_decay: 0.01

logging:
  enabled: true
  backend: wandb
  project: nested-learning
  run_name: hope-mid-fsdp-${now:%Y%m%d%H%M%S}
  path: logs/mid_fsdp_metrics.json


================================================
FILE: configs/hope/pilot.yaml
================================================
defaults:
  - /pilot


================================================
FILE: configs/hope/pilot_attention.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: hope_attention
  qk_l2_norm: true
  local_conv_window: 4



================================================
FILE: configs/hope/pilot_selfmod.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: hope_selfmod
  # Chunk update cadence (paper §8.2): other memories update more often than M_memory.
  self_mod_chunk_size: 8
  self_mod_chunk_size_memory: 64

train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  checkpoint:
    dir: artifacts/checkpoints/pilot_selfmod

logging:
  run_name: pilot-selfmod
  path: logs/pilot_selfmod_metrics.json


================================================
FILE: configs/hope/pilot_transformer.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  block_variant: transformer
  qk_l2_norm: true
  local_conv_window: 4



================================================
FILE: configs/hope/target.yaml
================================================
defaults:
  - _self_

hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false

model:
  vocab_size: 32000
  dim: 1536
  num_layers: 32
  heads: 24
  surprise_threshold: null
  freeze_backbone: false
  titan_level:
    name: titan
    update_period: 32
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_fast_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_mid_opt
    - name: cms_slow
      update_period: 32
      optimizer_key: cms_slow_opt
    - name: cms_ultra
      update_period: 128
      optimizer_key: cms_slow_opt
    - name: cms_anchor
      update_period: 512
      optimizer_key: cms_anchor_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 6.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond
    cms_fast_opt:
      type: deep_momentum
      lr: 3.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond
    cms_mid_opt:
      type: deep_momentum
      lr: 2.5e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond
    cms_slow_opt:
      type: deep_momentum
      lr: 2.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond
    cms_anchor_opt:
      type: deep_momentum
      lr: 1.5e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond

data:
  source: mixture
  batch_size: 32
  num_workers: 8
  mixture:
    samples_per_epoch: 32768
    seed: 123
    sources:
      - name: refinedweb
        shards_dir: data/shards/refinedweb_filtered
        weight: 0.35
      - name: wikipedia
        shards_dir: data/shards/wikipedia_filtered
        weight: 0.2
      - name: c4
        shards_dir: data/shards/c4_filtered
        weight: 0.15
      - name: redpajama
        shards_dir: data/shards/redpajama_filtered
        weight: 0.2
      - name: code
        shards_dir: data/shards/code_filtered
        weight: 0.1

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 200
  log_interval: 10
  device: "cuda:1"
  seed: 9001
  deterministic: false
  step_offset: 0
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: true
    mode: max-autotune
  fsdp:
    auto_wrap_min_params: 2000000
    cpu_offload: false
  checkpoint:
    enable: true
    dir: checkpoints/target
    save_interval: 100
    resume_path: null
    resume_tag: null

optim:
  type: muon
  lr: 1.5e-4
  weight_decay: 0.02
  momentum: 0.95
  betas:
    - 0.9
    - 0.999

logging:
  enabled: false
  backend: wandb
  project: nested-learning
  run_name: target-${now:%Y%m%d%H%M%S}
  path: logs/target_metrics.json

deepspeed:
  config: configs/deepspeed/zero3.json


================================================
FILE: configs/hope/target_fsdp.yaml
================================================
defaults:
  - target
  - _self_

model:
  gradient_checkpointing: true

data:
  batch_size: 4  # per-rank micro-batch
  num_workers: 8

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 300000
  log_interval: 20
  device: "cuda"
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: false
  fsdp:
    auto_wrap_min_params: 2500000
    cpu_offload: false
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/target_fsdp
    save_interval: 1000
    resume_path: null
    resume_tag: null

optim:
  type: muon
  lr: 1.5e-4
  weight_decay: 0.01

logging:
  enabled: true
  backend: wandb
  project: nested-learning
  run_name: hope-target-fsdp-${now:%Y%m%d%H%M%S}
  path: logs/target_fsdp_metrics.json


================================================
FILE: configs/mid_smoke.yaml
================================================
hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false

model:
  vocab_size: 32000
  dim: 256
  num_layers: 4
  heads: 8
  titan_level:
    name: titan
    update_period: 16
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 16
      optimizer_key: cms_opt
    - name: cms_ultra
      update_period: 64
      optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 8.0e-4
      params:
        beta: 0.9
        beta2: 0.999
    cms_opt:
      type: deep_momentum
      lr: 4.0e-4
      params:
        beta: 0.9
        beta2: 0.999

data:
  source: mixture
  batch_size: 4
  num_workers: 0
  mixture:
    samples_per_epoch: 128
    seed: 0
    sources:
      - name: refinedweb
        shards_dir: data/shards/refinedweb_filtered
        weight: 0.4
      - name: wikipedia
        shards_dir: data/shards/wikipedia_filtered
        weight: 0.2
      - name: c4
        shards_dir: data/shards/c4_filtered
        weight: 0.15
      - name: redpajama
        shards_dir: data/shards/redpajama_filtered
        weight: 0.15
      - name: code
        shards_dir: data/shards/code_filtered
        weight: 0.1

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 10
  log_interval: 1
  device: "cpu"
  seed: 2024
  deterministic: true
  mixed_precision:
    enabled: false
    dtype: bf16
  compile:
    enable: false
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/mid_smoke
    save_interval: 10
    save_last: true

optim:
  type: adamw
  lr: 2.0e-4
  fused: false

logging:
  enabled: true
  backend: json
  path: logs/mid_smoke.json


================================================
FILE: configs/mid_stage2.yaml
================================================
hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false

model:
  vocab_size: 32000
  dim: 768
  num_layers: 18
  heads: 12
  teach_scale: 0.05
  teach_clip: 5.0
  teach_schedule:
    warmup_steps: 20
    decay_start: 80
    decay_duration: 40
  titan_level:
    name: titan
    update_period: 16
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 32
      optimizer_key: cms_opt
    - name: cms_ultra
      update_period: 128
      optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 8.0e-4
      params:
        beta: 0.9
        beta2: 0.999
    cms_opt:
      type: deep_momentum
      lr: 4.0e-4
      params:
        beta: 0.9
        beta2: 0.999

data:
  source: mixture
  batch_size: 8
  num_workers: 2
  mixture:
    samples_per_epoch: 1024
    seed: 42
    sources:
      - name: refinedweb
        shards_dir: data/shards/refinedweb_full
        weight: 0.4
      - name: wikipedia
        shards_dir: data/shards/wikipedia_full
        weight: 0.2
      - name: c4
        shards_dir: data/shards/c4_full
        weight: 0.15
      - name: redpajama
        shards_dir: data/shards/redpajama_full
        weight: 0.15
      - name: code
        shards_dir: data/shards/code_full
        weight: 0.1

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 100
  log_interval: 10
  device: "cuda"
  seed: 3401
  deterministic: false
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: true
    mode: max-autotune
  fsdp:
    auto_wrap_min_params: 2000000
    cpu_offload: false
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/mid_stage2
    save_interval: 100
    resume_path: null
    resume_tag: null

optim:
  type: adamw
  lr: 3.0e-5
  fused: auto

logging:
  enabled: true
  backend: json
  path: logs/mid_stage2.json


================================================
FILE: configs/mid_stage2_smoke.yaml
================================================
hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false

model:
  vocab_size: 32000
  dim: 512
  num_layers: 12
  heads: 8
  teach_scale: 0.2
  teach_clip: 2.0
  titan_level:
    name: titan
    update_period: 16
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 16
      optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 6.0e-4
      params:
        beta: 0.9
        beta2: 0.999
    cms_opt:
      type: deep_momentum
      lr: 3.0e-4
      params:
        beta: 0.9
        beta2: 0.999

data:
  source: mixture
  batch_size: 8
  num_workers: 2
  mixture:
    samples_per_epoch: 512
    seed: 0
    sources:
      - name: refinedweb
        shards_dir: data/shards/refinedweb_filtered
        weight: 0.4
      - name: wikipedia
        shards_dir: data/shards/wikipedia_filtered
        weight: 0.2
      - name: c4
        shards_dir: data/shards/c4_filtered
        weight: 0.15
      - name: redpajama
        shards_dir: data/shards/redpajama_filtered
        weight: 0.15
      - name: code
        shards_dir: data/shards/code_filtered
        weight: 0.1

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 60
  log_interval: 5
  device: "cuda"
  seed: 777
  deterministic: false
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: false
  fsdp:
    auto_wrap_min_params: 1500000
    cpu_offload: false
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/mid_stage2_smoke
    save_interval: 60
    resume_path: null
    resume_tag: null

optim:
  type: adamw
  lr: 1.0e-4
  fused: auto

logging:
  enabled: true
  backend: json
  path: logs/mid_stage2_smoke.json


================================================
FILE: configs/mid_titan_baseline.yaml
================================================
hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false

model:
  type: titan
  vocab_size: 32000
  dim: 768
  num_layers: 18
  heads: 12
  surprise_threshold: 0.02
  freeze_backbone: false
  titan_level:
    name: titan
    update_period: 16
    optimizer_key: titan_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 8.0e-4
      params:
        beta: 0.9
        beta2: 0.999
  teach_scale: 0.10
  teach_clip: 4.0
  teach_schedule:
    warmup_steps: 60
    decay_start: 140
    decay_duration: 80

data:
  source: mixture
  batch_size: 4
  num_workers: 2
  mixture:
    samples_per_epoch: 1024
    seed: 42
    sources:
      - name: refinedweb
        shards_dir: data/shards/refinedweb_full
        weight: 0.4
      - name: wikipedia
        shards_dir: data/shards/wikipedia_full
        weight: 0.2
      - name: c4
        shards_dir: data/shards/c4_full
        weight: 0.15
      - name: redpajama
        shards_dir: data/shards/redpajama_full
        weight: 0.15
      - name: code
        shards_dir: data/shards/code_full
        weight: 0.1

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 220
  log_interval: 20
  device: "cuda:1"
  seed: 451
  deterministic: false
  step_offset: 0
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: false
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/mid_titan_baseline
    save_interval: 100
    resume_path: null
    resume_tag: null

optim:
  type: adamw
  lr: 1.0e-5
  fused: auto

logging:
  enabled: true
  backend: json
  path: logs/mid_titan_baseline.json
  run_name: mid_titan_baseline


================================================
FILE: configs/pilot.yaml
================================================
defaults:
  - _self_

hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false

model:
  vocab_size: 32000
  dim: 512
  num_layers: 12
  heads: 8
  teach_scale: 0.10
  teach_clip: 5.0
  surprise_threshold: 0.02
  freeze_backbone: false
  self_mod_lr: 0.001
  teach_schedule:
    warmup_steps: 2000
    decay_start: 120000
    decay_duration: 20000
  titan_level:
    name: titan
    update_period: 8
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 32
      optimizer_key: cms_opt
    - name: cms_ultra
      update_period: 128
      optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 6.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        # Best-effort paper mapping: rank-1 context projection preconditioner.
        variant: nl_l2_precond
    cms_opt:
      type: deep_momentum
      lr: 3.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        # Best-effort paper mapping: rank-1 context projection preconditioner.
        variant: nl_l2_precond

data:
  source: mixture
  seq_len: 2048
  batch_size: 6
  num_workers: 4
  mixture:
    samples_per_epoch: 65536
    seed: 1337
    sources:
      - name: refinedweb
        shards_dir: data/shards/refinedweb_filtered
        weight: 0.4
      - name: wikipedia
        shards_dir: data/shards/wikipedia_filtered
        weight: 0.2
      - name: c4
        shards_dir: data/shards/c4_filtered
        weight: 0.15
      - name: redpajama
        shards_dir: data/shards/redpajama_filtered
        weight: 0.15
      - name: code
        shards_dir: data/shards/code_filtered
        weight: 0.1

train:
  algorithm_mode: two_pass_stopgrad_updates
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 246667
  log_interval: 50
  device: "cuda:1"
  seed: 1337
  deterministic: false
  step_offset: 0
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: false
    mode: max-autotune
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/pilot
    save_interval: 1000
    save_last: true
    resume_path: null
    resume_tag: null

optim:
  type: muon
  lr: 2.5e-4
  weight_decay: 0.02
  momentum: 0.95
  betas:
    - 0.9
    - 0.999

logging:
  enabled: true
  backend: json
  path: logs/pilot_metrics.json
  project: nested-learning
  run_name: pilot-main


================================================
FILE: configs/pilot_paper_faithful.yaml
================================================
defaults:
  - /pilot
  - _self_

model:
  # Explicit paper-defined variant (avoid inheriting repo default `hope_hybrid`).
  block_variant: hope_attention
  # Paper-faithful: treat "surprise" as the (scaled) teach signal itself, without threshold gating.
  surprise_threshold: null
  # Paper updates on the last (possibly partial) chunk; enable flush for non-multiple seq lengths.
  cms_flush_partial_at_end: true
  # Paper: q is non-adaptive and uses a fixed projection.
  self_mod_adaptive_q: false
  # Paper: local causal conv in the HOPE self-mod module.
  self_mod_local_conv_window: 4

data:
  # Paper-faithful semantics: CMS/TITAN fast state is per-context; this repo currently treats
  # each *batch* as a single shared context when batch_size>1.
  batch_size: 1

train:
  algorithm_mode: two_pass_stopgrad_updates
  # Keep this explicit (instead of inherited) so paper-faithful behavior is visible in one file.
  online_updates: true
  # Paper: re-initialize fast memories per context (sequence).
  use_fast_state: true
  strict_streaming_contract: true
  # Use explicit boundary-token supervision (no overlap approximation).
  online_boundary_targets: true
  # Carry attention state across chunks during online updates.
  online_carry_attention_cache: true
  # Fail fast if DDP would silently disable paper-critical features.
  fail_if_paper_faithful_disabled: true

optim:
  # Ensure meta-learning updates include memory module initial states (paper §8.2).
  param_policy: all

logging:
  run_name: pilot-paper-faithful
  path: logs/pilot_paper_faithful_metrics.json


================================================
FILE: configs/pilot_selfmod_paper_faithful.yaml
================================================
defaults:
  - /pilot_paper_faithful
  - _self_

model:
  block_variant: hope_selfmod
  # Chunk update cadence (paper §8.2): other memories update more often than M_memory.
  self_mod_chunk_size: 8
  self_mod_chunk_size_memory: 64
  self_mod_use_skip: false

train:
  checkpoint:
    dir: artifacts/checkpoints/pilot_selfmod_paper_faithful

logging:
  run_name: pilot-selfmod-paper-faithful
  path: logs/pilot_selfmod_paper_faithful_metrics.json


================================================
FILE: configs/pilot_smoke.yaml
================================================
hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false

model:
  vocab_size: 32000
  dim: 128
  num_layers: 2
  heads: 4
  titan_level:
    name: titan
    update_period: 8
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 16
      optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 1.0e-3
      params:
        beta: 0.9
        beta2: 0.999
    cms_opt:
      type: deep_momentum
      lr: 5.0e-4
      params:
        beta: 0.9
        beta2: 0.999

data:
  source: synthetic
  vocab_size: 32000
  seq_len: 64
  dataset_size: 1024
  batch_size: 4
  num_workers: 0

train:
  strict_streaming_contract: false
  online_updates: true
  online_chunk_size: 0
  online_boundary_targets: false
  online_carry_attention_cache: false
  per_layer_teach_signal: true
  steps: 10
  log_interval: 1
  device: "cpu"
  seed: 1234
  deterministic: true
  mixed_precision:
    enabled: false
    dtype: bf16
  compile:
    enable: false
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/pilot_smoke
    save_interval: 10
    save_last: true

optim:
  type: adamw
  lr: 3.0e-4
  fused: false

logging:
  enabled: true
  backend: json
  path: logs/pilot_smoke.json


================================================
FILE: configs/resolved/cms_sparse_eval.yaml
================================================
hydra:
  run:
    dir: .
  output_subdir: null
  job:
    chdir: false
model:
  vocab_size: 32000
  dim: 384
  num_layers: 8
  heads: 6
  teach_scale: 0.1
  teach_clip: 5.0
  self_mod_lr: 0.001
  teach_schedule:
    warmup_steps: 2000
    decay_start: 120000
    decay_duration: 20000
  titan_level:
    name: titan
    update_period: 8
    optimizer_key: titan_opt
  cms_levels:
  - name: cms_fast
    update_period: 8
    optimizer_key: cms_opt
  - name: cms_mid
    update_period: 32
    optimizer_key: cms_opt
  - name: cms_slow
    update_period: 128
    optimizer_key: cms_opt
  - name: cms_ultra
    update_period: 512
    optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 0.0006
      params:
        beta: 0.9
        beta2: 0.999
    cms_opt:
      type: deep_momentum
      lr: 0.0003
      params:
        beta: 0.9
        beta2: 0.999
  cms_hidden_multiplier: 2
data:
  source: mixture
  seq_len: 1024
  batch_size: 2
  num_workers: 2
  mixture:
    samples_per_epoch: 65536
    seed: 1337
    sources:
    - name: refinedweb
      shards_dir: data/shards/refinedweb_filtered
      weight: 0.4
    - name: wikipedia
      shards_dir: data/shards/wikipedia_filtered
      weight: 0.2
    - name: c4
      shards_dir: data/shards/c4_filtered
      weight: 0.15
    - name: redpajama
      shards_dir: data/shards/redpajama_filtered
      weight: 0.15
    - name: code
      shards_dir: data/shards/code_filtered
      weight: 0.1
train:
  online_updates: true
  online_chunk_size: 0
  per_layer_teach_signal: true
  steps: 5000
  log_interval: 25
  device: cuda:1
  seed: 1337
  deterministic: false
  mixed_precision:
    enabled: true
    dtype: bf16
  compile:
    enable: false
    mode: max-autotune
  checkpoint:
    enable: true
    dir: artifacts/checkpoints/pilot_cms_sparse
    save_interval: 1000
    save_last: true
    resume_path: null
    resume_tag: null
optim:
  type: adamw
  lr: 0.00025
  fused: auto
logging:
  enabled: true
  backend: json
  path: logs/pilot_cms_sparse_metrics.json
  project: nested-learning
  run_name: pilot-cms-sparse


================================================
FILE: configs/resolved/phase2_pilot_attention_eval.yaml
================================================
model:
  vocab_size: 32000
  dim: 512
  num_layers: 12
  heads: 8
  teach_scale: 0.10
  teach_clip: 5.0
  surprise_threshold: 0.02
  freeze_backbone: false
  qk_l2_norm: true
  local_conv_window: 4
  block_variant: hope_attention
  teach_schedule:
    warmup_steps: 2000
    decay_start: 120000
    decay_duration: 20000
  titan_level:
    name: titan
    update_period: 8
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 32
      optimizer_key: cms_opt
    - name: cms_ultra
      update_period: 128
      optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 6.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond
    cms_opt:
      type: deep_momentum
      lr: 3.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond



================================================
FILE: configs/resolved/phase2_pilot_transformer_eval.yaml
================================================
model:
  vocab_size: 32000
  dim: 512
  num_layers: 12
  heads: 8
  teach_scale: 0.10
  teach_clip: 5.0
  surprise_threshold: 0.02
  freeze_backbone: false
  qk_l2_norm: true
  local_conv_window: 4
  block_variant: transformer
  teach_schedule:
    warmup_steps: 2000
    decay_start: 120000
    decay_duration: 20000
  titan_level:
    name: titan
    update_period: 8
    optimizer_key: titan_opt
  cms_levels:
    - name: cms_fast
      update_period: 1
      optimizer_key: cms_opt
    - name: cms_mid
      update_period: 4
      optimizer_key: cms_opt
    - name: cms_slow
      update_period: 32
      optimizer_key: cms_opt
    - name: cms_ultra
      update_period: 128
      optimizer_key: cms_opt
  optimizers:
    titan_opt:
      type: deep_momentum
      lr: 6.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond
    cms_opt:
      type: deep_momentum
      lr: 3.0e-4
      params:
        beta: 0.9
        beta2: 0.999
        variant: nl_l2_precond



================================================
FILE: docker/Dockerfile.dist
================================================
FROM scratch

LABEL org.opencontainers.image.title="nested-learning-dist"
LABEL org.opencontainers.image.description="OCI bundle containing nested-learning wheel/sdist/checksums from CI release builds."

# Built artifacts generated in workflow before image build.
COPY dist/ /dist/


================================================
FILE: docs/BUG_REPORT_CHECKLIST.md
================================================
# Bug Report Checklist

Use this checklist when filing reproducibility or correctness issues.

## Required Context

- Commit SHA (`git rev-parse --short HEAD`)
- Exact command line used
- Config name and CLI overrides
- Device/runtime details (`python --version`, `uv --version`, `nvidia-smi` if CUDA)

## Required Artifacts

- JSON training log path (if training path involved)
- Full traceback/error output
- Minimal failing input or dataset pointer
- If streaming/cadence related: include `scripts/checks/verify_update_cadence.py` output
- Include `scripts/checks/compliance_report.py` output (or note why unavailable)

## Fast Reproduction Path

1. Run `uv run bash scripts/checks/run_fidelity_ci_subset.sh`.
2. Run `uv run bash scripts/run_mechanism_audit_smoke.sh`.
3. Attach outputs and note which step failed.

## Streaming/Cadence Issues

- Specify `train.strict_streaming_contract` value.
- Specify `train.online_updates`, `train.online_chunk_size`, `train.online_boundary_targets`, `train.online_carry_attention_cache`, `model.cms_flush_partial_at_end`.
- Include expected vs observed update counts per level.


================================================
FILE: docs/COMPATIBILITY_MATRIX.md
================================================
# Compatibility Matrix

This document defines the support contract for runtime/backends.

## Support Tiers

- **Tier 1 (Supported):** CI-tested on every PR; regressions treated as bugs.
- **Tier 2 (Supported with caveats):** tested periodically/partially; backend caveats apply.
- **Tier 3 (Best-effort):** community-supported; no guaranteed CI lane.
- **Unsupported:** intentionally out of scope; fail-fast when correctness is at risk.

## Matrix

| OS | Python | CPU | CUDA (NVIDIA) | MPS | ROCm |
|---|---|---|---|---|---|
| Linux x86_64 | 3.10-3.12 | Tier 1 (import/eval/smoke) | Tier 1 (import/eval/smoke/full training) | Unsupported | Tier 3 |
| macOS Apple Silicon | 3.10-3.12 | Tier 2 (import/eval/smoke) | Unsupported | Tier 2 (import/eval), Tier 3 (smoke) | Unsupported |
| macOS Intel | 3.10-3.12 | Tier 2 (import/eval), Tier 3 (smoke) | Unsupported | Unsupported | Unsupported |
| Windows | 3.10-3.12 | Tier 2 (import/eval), Tier 3 (smoke) | Tier 3 (user-managed) | Unsupported | Unsupported |

Notes:
- CPU full-scale training is not a supported target.
- Strict paper-faithful online-update semantics in distributed settings remain constrained by design.
- Numerical parity across backend families (CUDA/MPS/ROCm) is not guaranteed.

## Apple Silicon (MPS) practical expectations

On macOS Apple Silicon, this repo is intended to support:
- install/import,
- CLI diagnostics (`nl doctor`),
- smoke/eval workflows,
- small local runs with `train.device=mps`.

This repo does not currently treat macOS/MPS as a full paper-scale training target.
For full-size training and published artifact reproduction, prefer Linux + CUDA Tier 1 environments.

## Runtime Degradation Policy

At runtime, unsupported performance features should degrade gracefully:
- if flash/mem-efficient SDPA is unavailable, use math SDPA;
- if `torch.compile` is unavailable/disabled, continue without compile;
- if requested mixed precision is unsupported on the backend, degrade to fp32 and log it.

Use `nl doctor --json` to capture capability snapshots in machine-readable form.

## Golden Environment

For reproducibility of this repository’s published artifacts, prefer:
- Python 3.12
- PyTorch 2.9.x
- `uv lock` / `uv sync --all-extras --dev`

The package metadata allows broader install ranges for portability, while the lockfile remains the canonical dev/test environment.


================================================
FILE: docs/FSDP_SCALING_GUIDE.md
================================================
# FSDP/ZeRO Scaling Guide (RTX 6000 Ada Dual-GPU Rig)

This note captures the configuration we will use for the Stage 2 mid (≈760 M) and target (≈1.3 B) HOPE models when running on the dual RTX 6000 Ada workstation (2× 48 GB). It accompanies the Hydra configs `configs/hope/mid_fsdp.yaml` and `configs/hope/target_fsdp.yaml`.

## Hardware & Software Assumptions
- 2× NVIDIA RTX 6000 Ada (48 GB each)
- CUDA 12.4, PyTorch 2.9, `uv` environment
- NCCL backend, FSDP via `torch.distributed.fsdp`
- Checkpoints stored under `artifacts/checkpoints/{mid_fsdp,target_fsdp}`

## Config summary

| Model | Params | Config | Per-rank micro-batch | Global batch (nranks=2) | Expected VRAM | Notes |
|-------|--------|--------|----------------------|-------------------------|---------------|-------|
| HOPE mid | ~760 M (dim 1024, 24L) | `configs/hope/mid_fsdp.yaml` | 8 sequences × 2048 tokens | 16×2048 tokens | 43–45 GB | bf16 activations, Muon outer optimizer, NL inner optimizer, gradient checkpointing, FSDP auto-wrap ≥2 M params |
| HOPE target | ~1.3 B (dim 1536, 32L) | `configs/hope/target_fsdp.yaml` | 4 sequences × 2048 tokens | 8×2048 tokens | 46–48 GB | Slightly smaller per-rank batch to stay under 48 GB; Muon + checkpointing identical to mid config |

Both configs default to:
- `optim.type = muon` (outer optimizer) with `nl_l2_precond` inner updates already wired through model lvl optimizers.
- bf16 autocast (`train.mixed_precision.enabled = true, dtype = bf16`).
- Gradient checkpointing via `model.gradient_checkpointing = true` (saves ~3 GB per rank).
- `train.compile.enable = false` (`torch.compile` can be toggled on after validation).
- FSDP auto-wrap policy set via `train.fsdp.auto_wrap_min_params`.

## Launch commands

```bash
# Mid model, 2 GPUs
UV_CACHE_DIR=/tmp/uv-cache UV_LINK_MODE=copy \
uv run torchrun --nproc_per_node=2 train_fsdp.py \
  --config-name hope/mid_fsdp logging.run_name=mid-fsdp-${USER}

# Target model, 2 GPUs
UV_CACHE_DIR=/tmp/uv-cache UV_LINK_MODE=copy \
uv run torchrun --nproc_per_node=2 train_fsdp.py \
  --config-name hope/target_fsdp logging.run_name=target-fsdp-${USER}
```

To resume from a checkpoint, set `train.checkpoint.resume_path` (path to `step_xxxxxx.pt`). Checkpoints are saved using FSDP's full (gathered) state-dict format, with CPU offload applied on rank 0.

## ZeRO / DeepSpeed note

For multi-node runs or larger batch sizes, leverage `train_deepspeed.py` with `configs/deepspeed/zero3.json`. The per-model configs above can be reused by passing `--config-name hope/mid_fsdp` together with `DEEPSPEED_CONFIG=configs/deepspeed/zero3.json`.

## Logging & Monitoring

- JSON metrics live at `logs/mid_fsdp_metrics.json` or `logs/target_fsdp_metrics.json`.
- W&B logging is enabled by default (`project = nested-learning`).
- Additional telemetry (teach-signal norms, projector stats, CMS chunk samples) already flows through the model update metrics; ensure your W&B dashboard visualizes:
  - `layer*.titan.titan.grad_norm`
  - `layer*.titan.titan.ctx_norm` / `proj_norm`
  - `layer*.cms.cms_fast.chunk_samples`, etc.

## Checklist before starting a long run
1. `uv run pytest` (ensure faithfulness tests pass).
2. `nvidia-smi` — GPUs idle and temps normal.
3. Confirm dataset shards (`data/shards/*_full/`) available locally.
4. W&B credentials set (`WANDB_API_KEY`).
5. For target config, consider setting `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` to reduce fragmentation.

This guide should let collaborators pick up the FSDP configs immediately without reverse-engineering the Hydra hierarchy.


================================================
FILE: docs/IMPLEMENTATION_STATUS.md
================================================
# Implementation Status (Source of Truth)

This table is the canonical mechanism-status map for this repo.

| Mechanism | Status | Evidence |
|---|---|---|
| Teach-signal alignment (LM head tied to embeddings) | Implemented | `src/nested_learning/model.py`, `src/nested_learning/training.py`, `tests/test_teach_signal.py`, `tests/test_tied_weight_guard.py` |
| Per-layer local teach signals (`δ_l`) | Implemented (single-process path) | `src/nested_learning/model.py`, `src/nested_learning/training.py`, `tests/test_teach_signal.py` |
| CMS chunk accumulation + cross-call cadence | Implemented | `src/nested_learning/hope/block.py`, `tests/test_cms.py`, `tests/test_cms_cross_call.py`, `tests/test_model_streaming_cadence.py` |
| CMS finalize/partial flush semantics | Implemented | `src/nested_learning/hope/block.py`, `tests/test_cms_flush_partial.py`, `docs/STREAMING_CONTRACT.md` |
| Online chunking (overlap mode) | Implemented | `src/nested_learning/training.py`, `tests/test_online_chunking.py` |
| Online chunking (boundary-target mode) | Implemented | `src/nested_learning/training.py`, `configs/pilot_paper_faithful.yaml`, `tests/test_online_chunking.py` |
| Optional attention-cache carry across chunk calls | Implemented (single-process path) | `src/nested_learning/backbones.py`, `src/nested_learning/model.py`, `tests/test_attention_cache.py`, `tests/test_eval_state.py` |
| Strict runtime guardrails | Implemented | `src/nested_learning/training.py`, `tests/test_strict_streaming_contract.py`, `tests/test_distributed_fail_fast.py`, `tests/test_fast_state_batch_semantics.py` |
| Training algorithm mode banner/validation | Implemented (`two_pass_stopgrad_updates`, `boundary_state_grad_through_write`) | `src/nested_learning/training.py`, `tests/test_strict_streaming_contract.py` |
| Boundary-state gradient-through-write algorithm mode | Experimental (single-process constrained path) | `src/nested_learning/training.py`, `tests/test_boundary_state_mode.py`, `tests/test_algorithm_mode_grad.py`, `docs/PAPER_COMPLIANCE.md` |
| Online-updates fast-state invariant (`online_updates && !use_fast_state`) | Implemented (warn/error guard) | `src/nested_learning/training.py`, `tests/test_strict_streaming_contract.py` |
| Inner optimizer mapping (`nl_l2_precond`) | Implemented (best-effort mapping) | `src/nested_learning/optim/deep.py`, `tests/test_optim.py`, `docs/PAPER_COMPLIANCE.md` |
| Surprise-gated update flow | Implemented | `src/nested_learning/model.py`, `src/nested_learning/hope/block.py`, `src/nested_learning/titan/model.py` |
| Test-time memorization path in eval harnesses | Implemented | `src/nested_learning/memorize.py`, `scripts/eval/*.py`, `tests/test_memorization.py` |
| Compliance automation report | Implemented | `scripts/checks/compliance_report.py`, `scripts/checks/run_fidelity_ci_subset.sh`, `scripts/run_mechanism_audit_smoke.sh` |
| Doc-to-code reference guard (anti-overclaim drift) | Implemented | `scripts/checks/verify_docs_refs.py`, `.github/workflows/ci.yml`, `tests/test_verify_docs_refs.py` |
| Portable package/CLI entrypoints (`nl`, `python -m nested_learning`) | Implemented | `src/nested_learning/cli.py`, `src/nested_learning/__main__.py`, `tests/test_cli_tooling.py`, `pyproject.toml` |
| Cross-platform smoke + wheel install CI gates | Implemented | `.github/workflows/ci.yml` (`cross-platform-smoke`, `wheel-install-smoke`) |
| Package release automation (tag -> TestPyPI/PyPI) | Implemented | `.github/workflows/release.yml`, `docs/PACKAGE_RELEASE_CHECKLIST.md` |
| Full boundary-state gradient-through-write algorithm from paper | Partially implemented (experimental) | Constrained single-process mode exists; not yet treated as production/full-scale parity (`docs/PAPER_COMPLIANCE.md`) |
| Distributed mechanism-auditing parity for online/per-layer/boundary-cache path | Deferred | DDP strict fail-fast + documented limits (`src/nested_learning/training.py`, `scripts/run_cpu_ddp_smoke.sh`) |
| Paper-scale training/eval reproduction | Deferred | Explicitly out of sprint scope (`docs/PAPER_COMPLIANCE.md`) |

## Validation Entrypoints

- Fidelity subset: `scripts/checks/run_fidelity_ci_subset.sh`
- Mechanism-auditing smoke: `scripts/run_mechanism_audit_smoke.sh`
- Full tests: `uv run pytest`


================================================
FILE: docs/P4_REMEDIATION_PLAN.md
================================================
# P4 Remediation Plan — Status & Tracking (Paper-Faithful HOPE/Nested Learning)

This file started as an execution checklist for the P4 “paper faithfulness” sprint. It is now maintained as a **status page** so contributors can quickly see what’s implemented, what is verified by tests, and what follow‑ups remain.

For the canonical “how to run paper‑faithful mode” guide, see `docs/PAPER_COMPLIANCE.md`.

## Status (core remediation)

**P0/P1 core faithfulness items:** complete.

Implemented behaviors (with pointers):
- **Self‑modifying TITAN path always-on** during the inner/update pass (does not require an external teach signal).  
  Code: `src/nested_learning/hope/block.py`  
  Test: `tests/test_selfmod_online.py`
- **CMS update semantics** use per‑token δ targets and **sum‑over‑chunk** accumulation (no chunk‑mean broadcast).  
  Code: `src/nested_learning/hope/block.py` (`_chunk_loss`, `_CmsBuffer`, `_pop_buffer_chunk`)  
  Test: `tests/test_cms.py`
- **Online CMS read‑after‑write** behavior (later tokens can see updated CMS weights when using the online training path).  
  Code: `src/nested_learning/hope/block.py` (`_cms_forward_online`) + `src/nested_learning/training.py` (`train.online_updates`)  
  Test: `tests/test_cms.py` (`test_cms_online_updates_affect_later_tokens`)
- **Per‑layer local error signals (δℓ)** computed via autograd and routed into each block.  
  Code: `src/nested_learning/model.py` (`forward_with_block_outputs`, `teach_signals`) + `src/nested_learning/training.py` (`_compute_layer_teach_signals`)  
  Test: `tests/test_teach_signal.py`
- **Paper optimizer option (M3)** implemented and selectable via `optim.type=m3`.  
  Code: `src/nested_learning/optim/m3.py`  
  Test: `tests/test_m3.py`

Docs/telemetry added:
- Paper‑faithful run flags + code mapping: `docs/PAPER_COMPLIANCE.md`
- README “paper‑faithful mode” snippet: `README.md`
- Per‑layer update telemetry (e.g. `layerX.cms.*`) emitted via `HOPEModel._gather_block_stats()`.

## Remaining follow-ups (optional hardening, not required for “implemented correctly”)

These are improvements that strengthen the validation story or reduce ambiguity, but they are not required to claim the core mechanism is implemented:

- [ ] Add an explicit unit test that demonstrates **per‑token δ vs chunk‑mean broadcast** leads to different update directions (sanity test on a toy CMS).
- [ ] Add a “two chunks vs one chunk” regression test to lock in chunk boundary semantics in `train.online_updates` mode.
- [ ] Expose `cms_online_updates` / `cms_chunk_reduction` / `selfmod_online_updates` as Hydra config toggles (currently paper‑faithful defaults live in the HOPE block configs).
- [ ] Port the `train.py` online/per‑layer δℓ path to multi‑GPU (FSDP or custom DDP) so paper‑faithful mode scales beyond single GPU.



================================================
FILE: docs/PACKAGE_RELEASE_CHECKLIST.md
================================================
# Package Release Checklist (PyPI/GitHub)

Use this checklist for package distribution releases (separate from checkpoint/artifact releases).

## Pre-Release (RC)

- [ ] `uv run ruff check .`
- [ ] `uv run mypy src`
- [ ] `uv run pytest -q`
- [ ] `uv build`
- [ ] `uvx twine check dist/*`
- [ ] wheel install smoke works outside repo tree:
  - [ ] `python -m venv /tmp/nl-wheel`
  - [ ] `pip install dist/*.whl`
  - [ ] `python -m nested_learning --help`
  - [ ] `python -m nested_learning doctor --json`
  - [ ] `python -m nested_learning smoke --config-name pilot_smoke --device cpu --batch-size 1 --seq-len 8`
- [ ] `CHANGELOG.md` updated with:
  - [ ] release highlights
  - [ ] breaking changes (or explicit “none”)
- [ ] `README.md` reflects current compatibility tiers and install guidance.
- [ ] Trusted Publishing configured per `docs/PYPI_TRUSTED_PUBLISHING.md`.
- [ ] Tag created for RC (`vX.Y.ZrcN`) and TestPyPI publish succeeds.

## Final Release

- [ ] Re-run validation checks listed above.
- [ ] Promote release notes for `vX.Y.Z`.
- [ ] PyPI publish workflow succeeds via Trusted Publishing (OIDC).
- [ ] GitHub Release workflow creates/updates the tag release entry in the Releases tab.
- [ ] Release assets attached by automation:
  - [ ] wheel (`.whl`)
  - [ ] source tarball (`.tar.gz`)
  - [ ] `SHA256SUMS.txt`
- [ ] GitHub Packages (GHCR) workflow succeeds and publishes `nested-learning-dist` for the tag.
- [ ] Release notes include migration notes (if any) and links to compatibility/versioning docs.

## Post-Release

- [ ] Confirm install from PyPI in a clean environment.
- [ ] Confirm `nl doctor` and `nl smoke` on at least one non-maintainer machine or CI lane.
- [ ] Open follow-up issues for deferred release items.


================================================
FILE: docs/PAPER_COMPLIANCE.md
================================================
# Paper Compliance / Fidelity Guide (Nested Learning / HOPE)

This doc explains the **fidelity‑critical behaviors** (what the paper relies on) and how they map to this repo’s code, flags, and tests.

It is deliberately **mechanism‑focused**: you can use it to answer “did we implement the architecture/update rules correctly?” without requiring full‑scale training reproduction.

For exact chunk/segment/buffer semantics, see `docs/STREAMING_CONTRACT.md`.

## Paper Reference Pin

All compliance/equation references in this repo are pinned to:

- Source: `google_papers/Nested_Learning_Full_Paper/Nested_Learning_Full_Paper.md`
- SHA-256: `7524af0724ac8e3bad9163bf0e79c85b490a26bc30b92d96b0bdf17a27f9febc`

## Scope

**In scope**
- HOPE blocks (attention + CMS + TITAN/self‑mod paths) and the *nested/online* update mechanism.
- Correct teach‑signal alignment (LM head vs embedding), per‑layer local error signals (δℓ), and chunk‑accumulated CMS updates.
- A paper‑style optimizer option (M3) alongside practical defaults.

**Out of scope (today)**
- Full bi‑level meta‑learning experiments over explicit task episodes (outer objective over tasks + inner adaptation per task).
- Results parity at the original paper’s compute scale.

## Semantic contract (important)

This repo focuses on **mechanism-level fidelity** (update rules + dataflow) with explicit tests.

- **Differentiable reads:** the forward pass used to compute the outer LM loss is standard autograd.
- **Stop‑grad writes:** online memory updates are applied in an explicit update pass (typically under `torch.no_grad()`), so we do **not** backprop through online writes.
- **Algorithm mode:** `train.algorithm_mode=two_pass_stopgrad_updates` is the stable default.  
  `train.algorithm_mode=boundary_state_grad_through_write` is available as an **experimental single-process mechanism path** with strict runtime constraints (`online_updates=true`, `per_layer_teach_signal=true`, `use_fast_state=true`, non-DDP). It is not yet treated as full paper-training reproduction.
- **Boundary-target mode:** we support explicit boundary-token supervision (`train.online_boundary_targets=true`) and optional attention-state carry across chunks (`train.online_carry_attention_cache=true`) for stronger streaming equivalence, while keeping stop-grad write semantics.
- **Fast-state guardrail:** `train.online_updates=true` with `train.use_fast_state=false` now emits a warning in non-strict mode and raises in strict/paper-faithful mode.
- **Meta initialization (fast-state mode):** when `train.use_fast_state=true`, meta parameters are not mutated by online updates, but the *read-path* meta parameters still receive outer gradients:
  - CMS/TITAN fast state uses **meta+delta** (forward uses `meta + delta`; updates write deltas only).
  - HOPE‑SelfMod uses a detached per‑context state, but the read path uses a **straight‑through meta gradient** link so the meta initialization remains trainable.

## Quick start: mechanism-auditing presets (single GPU)

The highest-fidelity execution path in this repo is **single‑GPU** `train.py`, because it supports both:
1) **per‑layer δℓ** teach signals and  
2) **online chunked training** where later tokens’ loss/gradients can see earlier memory updates.

Minimal smoke:

```bash
uv run python train.py --config-name pilot_paper_faithful train.steps=5
```

Note: these presets set `data.batch_size=1` to avoid cross-sample memory sharing
when `train.use_fast_state=true`.

Optional: select the paper optimizer variant for the *outer* step:

```bash
uv run python train.py --config-name pilot_paper_faithful train.steps=5 optim.type=m3
```

Mechanism-auditing HOPE self-mod variant:

```bash
uv run python train.py --config-name pilot_selfmod_paper_faithful train.steps=5
```

Boundary-state experimental smoke (single process only):

```bash
uv run python train.py --config-name pilot_paper_faithful \
  train.algorithm_mode=boundary_state_grad_through_write \
  train.steps=5
```

Boundary-state mode tradeoffs:
- Keeps cross-chunk write paths differentiable, which increases activation retention and memory pressure.
- Usually runs slower than `two_pass_stopgrad_updates` due to larger autograd graphs.
- Intended for mechanism probing and diagnostics, not for long production runs in this repo yet.

## Mechanism-Auditing vs Practical Mode (Matrix)

This repo supports both mechanism-auditing presets (for correctness checks) and practical defaults (for running pilots quickly).

| Mechanism | Paper intent | This repo (single GPU) | Notes / Tests |
|---|---|---|---|
| Teach‑signal alignment | δ uses LM head weights | `compute_teach_signal()` matches autograd grad | `tests/test_teach_signal.py` |
| Per‑layer δℓ | block‑local error signals | `train.per_layer_teach_signal=true` | `tests/test_teach_signal.py` |
| Online chunked training | later tokens can “see” earlier inner updates | `train.online_updates=true` with either overlap mode or explicit boundary-target mode + end-of-sequence finalize | `src/nested_learning/training.py`, `tests/test_online_chunking.py` |
| CMS chunk accumulation | sum over token deltas per chunk | `cms_chunk_reduction="sum"` default | `tests/test_cms.py`, `tests/test_cms_delta_rule.py` |
| CMS partial-chunk flush | update on final partial chunk | `model.cms_flush_partial_at_end=true` | `tests/test_cms_flush_partial.py` |
| CMS cadence across chunked calls | `update_period` accumulation must survive multiple update-pass calls | fast-state CMS buffers persist until `finalize_updates=true` | `tests/test_cms_cross_call.py` |
| CMS LayerNorm | paper is architecture-light; norm is optional | `model.cms_use_layernorm=true` (default) | `tests/test_cms.py` |
| HOPE‑SelfMod local conv | local conv window=4 (paper HOPE module) | `SelfModifyingTitansConfig.local_conv_window=4` default (causal depthwise) | `tests/test_selfmod_local_conv.py` |
| HOPE‑SelfMod fixed q | paper: `q_t = x_t W_q` non‑adaptive | `SelfModifyingTitansConfig.adaptive_q=false` default | `tests/test_selfmod_adaptive_q.py` |
| HOPE‑SelfMod Eq. (91) skip | no projection skip term (`w_skip`) | `model.self_mod_use_skip=false` (mechanism-auditing presets) | `tests/test_residual_mlp_memory.py` |
| HOPE‑SelfMod read/write separation | differentiable read; stopgrad through writes | forward uses differentiable read; updates occur only in explicit update pass | `tests/test_selfmod_grad_flow.py`, `tests/test_hope_selfmod_update_pass.py` |
| Fast‑state isolation | per‑context inner updates without mutating meta params, while read‑path meta init remains learnable | `train.use_fast_state=true` | CMS/TITAN use **meta+delta**; HOPE‑SelfMod read path uses straight‑through meta gradients. Meta params remain unchanged during updates and still receive outer grads (`tests/test_hope_selfmod_fast_state_meta_unchanged.py`, `tests/test_fast_state_meta_grads.py`, `tests/test_fast_state_selfmod_meta_grads.py`, `tests/test_fast_state_forward_equivalence.py`, `tests/test_fast_state_batch_semantics.py`) |
| Surprise metric | paper “surprise” trigger | `model.surprise_metric=l2` (default); also `loss`, `logit_entropy` | `tests/test_surprise_metric.py`, `tests/test_faithfulness_harness.py` |
| Outer optimizer | M3 option exists | `optim.type=m3` | `tests/test_m3.py` |
| Outer param policy | include memory initial states in meta-update | `optim.param_policy=all` | `tests/test_optimizer_param_policy.py` |
| DDP fail-fast | avoid silent paper-divergent fallbacks | `train.fail_if_paper_faithful_disabled=true` | `tests/test_distributed_fail_fast.py` |
| Multi‑GPU | (not required by paper) | DDP disables `online_updates` + `per_layer_teach_signal`; FSDP uses offline updates | documented below |

Surprise-gating note: for `model.surprise_metric=l2`, the current implementation applies a
chunk-level gate from mean teach-signal norm, then applies token-level masking inside TITAN/CMS
updates. This behavior is intentionally tested (`tests/test_surprise_metric.py`).

## Claims Boundary (What We Claim vs What We Do Not)

| Claim category | Status | Notes |
|---|---|---|
| CMS/TITAN/self-mod mechanism wiring | Implemented | Unit tests cover teach-signal, chunking, cadence primitives, and update-path invariants. |
| Mechanism-auditing single-GPU path | Implemented | Uses per-layer teach signals + explicit stop-grad update pass. |
| Full paper boundary-state gradient training through online writes | Partially implemented (experimental) | `train.algorithm_mode=boundary_state_grad_through_write` enables a constrained single-process differentiable write path; still not treated as production/full-scale reproduction. |
| Cross-chunk attention-state continuity (KV cache) | Partially implemented | Optional cache-carry path is available in model APIs and training boundary-target mode; distributed faithful path remains deferred. |
| Full paper-scale result reproduction | Not implemented | Compute/data scale parity is intentionally deferred. |

## Implementation Fidelity vs Result Fidelity

- **Implementation fidelity (this repo target):** architecture/update-path correctness, teach-signal alignment, cadence, chunking semantics, and guardrails.
- **Result fidelity (deferred):** matching full-paper training scale, data budget, and final benchmark curves.
- This repo treats implementation fidelity as complete only when mechanism checks/tests pass; result parity is explicitly a separate track.

## Scale Statement (Current vs Paper)

- Current mechanism-auditing and pilot runs are intentionally below the full paper scale.
- This repo does **not** claim paper-scale result reproduction at current compute/data settings.
- Maintainer stance: prioritize faithful implementation and auditable behavior first; scale-up remains optional contributor work.

## Paper-Faithful Configs (Usage + Caveats)

| Config | Purpose | Default Algorithm Mode | Caveats |
|---|---|---|---|
| `configs/pilot_paper_faithful.yaml` | HOPE-attention mechanism-auditing baseline | `two_pass_stopgrad_updates` | Single-process intended; sets `data.batch_size=1`, `strict_streaming_contract=true`, boundary-target + cache-carry enabled |
| `configs/pilot_selfmod_paper_faithful.yaml` | HOPE self-mod mechanism-auditing baseline | `two_pass_stopgrad_updates` | Same constraints as above; self-mod paper knobs forced (`self_mod_use_skip=false`, fixed `q`, local conv) |

Boundary-state experimental override:
- `train.algorithm_mode=boundary_state_grad_through_write`
- Requires: `online_updates=true`, `per_layer_teach_signal=true`, `use_fast_state=true`, single-process (non-DDP).

## Equation / Mechanism Code Pointers (file:line)

| Paper mechanism | Code pointer |
|---|---|
| Teach-signal proxy `dL/dh` via LM head weights | `src/nested_learning/training.py:225` |
| Per-layer teach signals (`δℓ`) from block outputs | `src/nested_learning/training.py:295` |
| Online chunk iterators (overlap / boundary-target) | `src/nested_learning/training.py:352`, `src/nested_learning/training.py:369` |
| Algorithm-mode constraints (including boundary-state experimental mode) | `src/nested_learning/training.py:606` |
| Online cache/chunk constraint guards | `src/nested_learning/training.py:650` |
| Online chunked train loop + update pass wiring | `src/nested_learning/training.py:685` |
| Run-feature telemetry (algorithm + online flags) | `src/nested_learning/training.py:1418` |
| Checkpoint metadata with algorithm/online flags | `src/nested_learning/training.py:1492` |
| Tied embedding / LM head weight contract | `src/nested_learning/model.py:156` |
| Block output capture for δℓ path | `src/nested_learning/model.py:317` |
| Fast-state init + attention-cache init | `src/nested_learning/model.py:531`, `src/nested_learning/model.py:578` |
| CMS chunk accumulation + cadence telemetry | `src/nested_learning/hope/block.py:297`, `src/nested_learning/hope/block.py:341`, `src/nested_learning/hope/block.py:365` |
| CMS partial flush on final chunk | `src/nested_learning/hope/block.py:342`, `src/nested_learning/hope/block.py:941`, `src/nested_learning/hope/block.py:1493` |
| Surprise gating threshold logic | `src/nested_learning/hope/block.py:567`, `src/nested_learning/hope/block.py:1676` |
| Differentiable inner-update path toggle | `src/nested_learning/optim/manager.py:109`, `src/nested_learning/optim/manager.py:125` |
| Test-time memorization with path/threshold controls | `src/nested_learning/memorize.py:169`, `src/nested_learning/memorize.py:292`, `src/nested_learning/memorize.py:366` |

## Reproducibility Protocol (Mechanism Track)

1. Environment:
   - `uv sync --all-extras --dev`
   - PyTorch `2.9.0`
2. Determinism:
   - set `train.seed=<int>`
   - set `train.deterministic=true` for deterministic smoke runs
3. Minimal mechanism run:
   - `uv run python train.py --config-name pilot_paper_faithful train.steps=5`
4. Optional boundary-state mechanism probe:
   - `uv run python train.py --config-name pilot_paper_faithful train.algorithm_mode=boundary_state_grad_through_write train.steps=5`
5. Validation gates:
   - `uv run ruff check .`
   - `uv run mypy src`
   - `bash scripts/checks/run_fidelity_ci_subset.sh`
   - `uv run pytest -q`

## Community-Reported Remediation Map

- Data split fallback robustness: `docs/data_pipeline.md` + `scripts/data/{train_tokenizer,shard_corpus,filter_corpus}.py`
- Missing tokenizer/help ergonomics: `scripts/data/run_sample.sh`, `scripts/checks/check_data_script_help.sh`, CI workflow
- Boundary-state mode guardrails + visibility: `src/nested_learning/training.py` + `tests/test_strict_streaming_contract.py` + `tests/test_boundary_state_training_loop.py`
- Packaging metadata completeness: `src/nested_learning/training.py` + `scripts/package_pilot_release.sh` + `tests/test_package_release_script.py`

## Acceptance Checklist (Mechanism Fidelity)

- [x] Teach signal uses LM head weights with tied embedding head.
- [x] Per-layer teach signals (`δℓ`) are available and tested.
- [x] Online chunked updates support overlap + boundary-target semantics.
- [x] CMS chunk accumulation/cadence is audited with machine-readable reports.
- [x] Surprise gating behavior is tested (loss/entropy/l2 paths).
- [x] Test-time memorization path controls (`paths`, `surprise_threshold`) are implemented and tested.
- [x] Algorithm mode + online flags are emitted in run telemetry and checkpoint metadata.
- [x] Data scripts have deterministic split fallback and CI help-smoke coverage.
- [x] Security/release gates block large/binary artifact leakage.
- [ ] Full paper-scale result reproduction (explicitly out of current scope).

## Concepts → implementation mapping

### 1) Outer parameters vs inner (“fast”) procedure

In this codebase:
- **Outer update** = the standard optimizer step (`optimizer.step()`) on the model parameters after backprop.
- **Inner update** = memory/fast updates applied *outside* the gradient graph using teach signals (δ), e.g. CMS updates and self‑modifying TITAN updates.

Where:
- Outer loop: `src/nested_learning/training.py` (`run_training_loop`)
- Inner update calls: inside the training loop after backward:
  - `base_model(tokens, teach_signal=...)` or `base_model(tokens, teach_signals=[...])`
- The update logic lives in the block implementations:
  - `src/nested_learning/hope/block.py`

### 2) “Levels” and update frequencies

Levels are represented explicitly as `LevelSpec` entries with independent `update_period`s.

Where:
- Specs: `src/nested_learning/levels.py`
- Config surface (Hydra): `model.titan_level` and `model.cms_levels` in `configs/*.yaml`
- Enforcement:
  - Online CMS buffering + update‑period gating in `src/nested_learning/hope/block.py`
  - Level optimizer tick/step orchestration in `src/nested_learning/optim/manager.py`

### 3) Teach signal alignment (LM head gradient proxy)

The global teach signal is an approximation to **dL/dh**, where `h` is the hidden state **before** the LM head. This approximation must align to the LM head weights.

In this repo, `h` is explicitly the **post-LayerNorm hidden** (the exact input to `lm_head`), and tests pin this contract.

Where:
- Weight tying is explicit: `src/nested_learning/model.py` (`self.lm_head.weight = self.embed.weight`)
- Teach signal implementation: `src/nested_learning/training.py` (`compute_teach_signal`)
- Unit coverage: `tests/test_teach_signal.py`

### 4) Per‑layer local error signals (δℓ)

When enabled, we compute a teach signal **per block output** (δℓ) via autograd and route it into each block’s update path.

Where:
- Block output capture: `src/nested_learning/model.py` (`forward_with_block_outputs`)
- δℓ computation: `src/nested_learning/training.py` (`_compute_layer_teach_signals`)
- Routing to blocks: `src/nested_learning/model.py` (`teach_signals=[...]`)
- Unit coverage: `tests/test_teach_signal.py` (shape + matching expectations)

Flag:
- `train.per_layer_teach_signal=true`

### 5) Chunked online training (read‑after‑write for *loss*, not just updates)

This is the core “gradient propagation across frequencies” concern:

If you compute the LM loss on a full sequence **once**, and only apply memory updates after the backward pass, then later tokens’ loss does not reflect earlier inner updates.

To make later tokens “see” earlier inner updates during training, we support an **online chunked training** mode:
- Split the sequence into chunks.
- For each chunk:
  1) forward → loss  
  2) `loss.backward()` **accumulating** gradients across chunks (we do not zero grads per chunk)  
  3) apply inner updates in `torch.no_grad()`  
  4) proceed to the next chunk with updated memory
- At the end, we do a single outer `optimizer.step()`.
- Chunking supports **one-token overlap** mode and **explicit boundary-target** mode.
- In fast-state mode, CMS accumulation buffers persist across calls and are finalized (optional partial flush + reset) only when `finalize_updates=true` for the sequence end.

Where:
- `src/nested_learning/training.py` (search for `online_updates`)

Flags:
- `train.online_updates=true`
- `train.online_chunk_size=0` (auto‑infer a chunk size from the minimum CMS update period)

### 6) CMS update semantics (per‑token δ + sum‑over‑chunk accumulation)

CMS updates are applied using:
- **per‑token δ targets** (no chunk‑mean broadcast), and
- **sum‑over‑chunk reduction** for the CMS update loss (rather than mean), which preserves the “accumulate over C tokens” semantics.

We implement the CMS local objective via a **gradient-shaping construction**:
- `_chunk_loss()` chooses a target `t = stopgrad(prediction − δ)` so that `∂loss/∂prediction ∝ δ` under the chosen mask and reduction.
- This matches the paper’s δ-based local learning rule while letting us implement the update via standard autograd.
- Verified by `tests/test_cms_delta_rule.py`.

Where:
- Chunk loss reduction: `src/nested_learning/hope/block.py` (`_chunk_loss`, `cms_chunk_reduction="sum"`)
- Online buffering by update_period and “pop exactly C tokens”: `src/nested_learning/hope/block.py` (`_CmsBuffer`, `_pop_buffer_chunk`, `_cms_forward_online`)
- Unit coverage:
  - `tests/test_cms.py` (online updates affect later tokens; update_period gating)

Notes:
- In the Hydra configs, CMS chunk reduction / online toggles are mechanism-auditing defaults inside the HOPE block configs. They are not currently exposed as top-level YAML keys; changing them requires a small code change.
- `model.cms_flush_partial_at_end` is exposed because it affects correctness when sequence lengths are not exact multiples of update periods.

### 7) Self‑modifying TITAN path (always‑on)

Self‑modifying TITAN updates run in the update pass; they do not require the teach signal to be nonzero, but they **do** require an explicit update call (i.e., passing `teach_signal`/`teach_signals` to trigger the update pass).

Where:
- `src/nested_learning/hope/block.py` (self‑mod update path)
- Unit coverage: `tests/test_selfmod_online.py`

### 8) Outer optimizer options (including paper M3)

Default outer optimizer in configs is practical and reproducible (`optim.type=muon` hybrid with AdamW fallback for 1D params).

Paper option:
- `optim.type=m3` selects the M3 optimizer (multi‑scale momentum + Newton‑Schulz orthogonalization).

Where:
- `src/nested_learning/optim/m3.py`
- Unit coverage: `tests/test_m3.py`

### 8.1 `nl_l2_precond` mapping assumptions (best-effort)

The inner deep optimizer variant `nl_l2_precond` is implemented as a rank-1 projection-style preconditioner:

- Context vector `x_t`: repo uses the provided level context (typically mean hidden state over batch/sequence for that update event).
- Projector: update is projected orthogonal to context via `g - (g·u)u` where `u = x_t / ||x_t||`.
- This is a best-effort mechanism mapping, not a formal proof of exact paper-equation equivalence under all normalizations/objective variants.

Code + tests:
- `src/nested_learning/optim/deep.py` (`_nl_precondition`)
- `tests/test_optim.py`

## Distributed training caveats (important)

Mechanism-auditing mode is currently focused on `train.py` (single‑GPU).

- **DDP (`train_dist.py`)**: calls the shared training loop, but explicitly disables:
  - per‑layer teach signals (`train.per_layer_teach_signal`)
  - online chunked training (`train.online_updates`)
  because these require capturing block outputs and applying sequential inner updates in a way that is not yet DDP‑safe in this repo.
  - If you want to avoid silent fallback behavior, set `train.fail_if_paper_faithful_disabled=true` to raise instead of disabling.

- **FSDP (`train_fsdp.py`)**: currently uses a simpler “offline” update pass with a global teach signal after each outer step. It does not yet implement per‑layer δℓ or online chunked training.

If you need mechanism-auditing semantics at multi-GPU scale, the next engineering task is to port the `train.py` online/per-layer flow to FSDP (or a custom DDP scheme) while keeping correctness tests.

## Verification checklist (fast)

Run the fidelity tests:

```bash
uv run python scripts/checks/verify_docs_refs.py

uv run pytest \
  tests/test_teach_signal.py \
  tests/test_cms.py \
  tests/test_cms_cross_call.py \
  tests/test_cms_flush_partial.py \
  tests/test_online_chunking.py \
  tests/test_attention_cache.py \
  tests/test_eval_state.py \
  tests/test_selfmod_online.py \
  tests/test_m3.py \
  tests/test_residual_mlp_memory.py \
  tests/test_selfmod_local_conv.py \
  tests/test_selfmod_adaptive_q.py \
  tests/test_selfmod_grad_flow.py \
  tests/test_hope_selfmod_update_pass.py \
  tests/test_cms_delta_rule.py \
  tests/test_selfmod_dgd_linear.py \
  tests/test_optimizer_param_policy.py \
  tests/test_distributed_fail_fast.py \
  tests/test_strict_streaming_contract.py \
  tests/test_verify_docs_refs.py
```

Confirm you’re running with the intended features:
- startup `run_features` should include `train.algorithm_mode=two_pass_stopgrad_updates` and `train.backprop_through_online_writes=false`.
- training logs include `teach_signal_norm` and per‑layer update telemetry (e.g. `layer0.cms.cms_fast.grad_norm`) when an update pass runs.
- streaming semantics match `docs/STREAMING_CONTRACT.md` for the selected config mode.

## Known gaps / intentionally deferred work

- Full task‑episode meta‑learning evaluation loops are not implemented.
- Multi‑GPU mechanism-auditing training (online + per-layer δℓ) is not yet implemented.
- Full distributed mechanism-auditing path with boundary-target + attention-cache carry remains deferred.
- Large‑scale results reproduction is not a requirement for claiming mechanism fidelity in this repo.


================================================
FILE: docs/PHASE2_LONG_CONTEXT_COMPARISON.md
================================================
# Phase 2 – HOPE-Attention vs Transformer (Long-Context Sanity)

This repo includes a lightweight Phase‑2 sanity check that compares **HOPE-Attention** (Attention → CMS) against a **baseline Transformer** on synthetic long‑context retrieval prompts.

The goal is not to claim paper‑level results (that requires large‑scale training), but to provide a **reproducible, implementation-level signal** that:

- HOPE-Attention’s fast-state memorization path can **improve the margin/logprob** of the correct answer on long contexts.
- The baseline Transformer **cannot**, because it has no in‑context update path.

## What to run

This uses resolved, eval-friendly configs (no Hydra composition required):
- `configs/resolved/phase2_pilot_attention_eval.yaml`
- `configs/resolved/phase2_pilot_transformer_eval.yaml`

And uses the init checkpoints generated under `artifacts/checkpoints/phase2_init/` (gitignored).

Run (GPU recommended):

```bash
UV_LINK_MODE=copy UV_CACHE_DIR=/tmp/uv-cache \
uv run python scripts/eval/compare_variants.py \
  --a-config configs/resolved/phase2_pilot_attention_eval.yaml \
  --a-checkpoint artifacts/checkpoints/phase2_init/hope_attention_step000000.pt \
  --b-config configs/resolved/phase2_pilot_transformer_eval.yaml \
  --b-checkpoint artifacts/checkpoints/phase2_init/transformer_step000000.pt \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --device cuda:1 \
  --output eval/phase2_compare_smoke_lastlayer_metrics.json \
  --seed 0 \
  --smoke \
  --memorize \
  --memorize-use-correct-answer \
  --memorize-layers last \
  --memorize-paths cms_fast
```

## What to look at

Open `eval/phase2_compare_smoke_lastlayer_metrics.json` and compare:

- **HOPE-Attention (A)**:
  - `a.passkey.mean_margin_delta` > 0
  - `a.niah.niah_256_mean_margin_delta` > 0
- **Transformer (B)**:
  - corresponding `*_mean_margin_delta` fields are exactly `0.0`

This demonstrates a concrete Phase‑2 differentiator at pilot scale: **test‑time learning updates move the model in a direction that improves long‑context answer margins**, and the baseline cannot.



================================================
FILE: docs/PHASE_2_PLAN.md
================================================
# Phase 2 Plan – Execution & Results Packaging

## Immediate Remediation Tasks (from EX_PHASE_1_CRITIQUE)

Before resuming large-scale runs, we must land the following **P0 faithfulness fixes** plus high-priority engineering upgrades. Each item lists the concrete code touchpoints, validation criteria, and downstream dependencies.

### 1. Tie LM head weights + correct teach signal
- **Scope**: `src/nested_learning/model.py`, `src/nested_learning/titan/model.py`, `src/nested_learning/training.py`, unit tests under `tests/`.
- **Actions**:
  1. Tie `lm_head.weight` to `embed.weight` for HOPE + TITAN models.
  2. Update `compute_teach_signal` to:
     - Use `model.lm_head.weight.detach()` instead of embeddings.
     - Shift logits/targets to align with CE loss (`logits[:, :-1]` vs `tokens[:, 1:]`).
     - Pad the teach signal to maintain sequence length.
  3. Add `tests/test_teach_signal.py` performing a finite-difference gradient check.
- **Acceptance**: Unit test passing; manual verification on pilot smoke run logs (teach-signal norms logged).

### 2. Implement CMS chunk accumulation (Eq. 31)
- **Scope**: `src/nested_learning/cms.py` (or equivalent), `src/nested_learning/levels.py`, new telemetry structs, tests.
- **Actions**:
  1. Add per-level ring buffers sized to `update_period`.
  2. Accumulate gradients/error proxies each step; only trigger optimizer update when buffer is full, then clear.
  3. Emit `UpdateEvent` metrics (count, L2 norm) per level.
  4. Unit test verifying exactly one update per `update_period` ticks.
- **Acceptance**: Tests pass; pilot smoke shows stepped CMS updates in logs.

### 3. Add L2-regression inner update (Eq. 27–29)
- **Scope**: `src/nested_learning/optim/deep_momentum.py`, model forward hooks to pass `x_t`, tests.
- **Actions**:
  1. Introduce `variant="nl_l2_precond"` that computes the rank-1 projector from input activations.
  2. Route the relevant activations into the optimizer context.
  3. Config flag in `configs/hope/*.yaml` to enable this variant.
  4. Toy test: optimization reduces regression objective.
- **Acceptance**: Unit test + pilot smoke run with `variant` enabled (log preconditioner statistics).

### 4. Enable test-time memorization
- **Scope**: `scripts/eval/zeroshot.py`, `scripts/eval/niah.py`, `scripts/eval/continual.py`, model eval hooks.
- **Actions**:
  1. Add flags (`--memorize`, `--memorize-steps`, `--memory-lr`, `--surprise-threshold`).
  2. Implement TITAN memory updates (and optional CMS fast level) when `memorize=True`.
  3. Add synthetic integration test ensuring memorization improves accuracy on a constructed needle task.
- **Acceptance**: Tests pass; eval scripts produce separate `*_memorize.json` outputs with metrics > baseline on synthetic task.

### 5. PyTorch performance upgrades
- **Scope**: `src/nested_learning/*.py` (attention, training loop), optim factory.
- **Actions**:
  1. Replace `nn.MultiheadAttention` with manual QKV + `torch.nn.functional.scaled_dot_product_attention`, enabling FlashAttention where supported.
  2. Wrap training step in `torch.autocast(device_type, dtype=torch.bfloat16)`; add config switch.
  3. Add `torch.compile` (guarded) to model init.
  4. Use fused AdamW (`fused=True`) for outer optimizer.
- **Acceptance**: Pilot smoke runtime improves or stays stable; fallback path works on CPU.

### 6. Muon integration
- **Scope**: `src/nested_learning/optim/factory.py`, configs.
- **Actions**:
  1. Detect availability of `torch.optim.Muon`.
  2. Split param groups: matrices → Muon, embeddings/biases/LayerNorm → AdamW.
  3. Config knob `optim.outer.type = mixed_muon_adamw`.
  4. Benchmark vs AdamW and log results.
- **Acceptance**: Pilot smoke runs succeed with Muon; documentation updated.

### 7. Seeding & backend robustness
- **Scope**: training entrypoints (`train*.py`), `nested_learning/training.py`.
- **Actions**:
  1. Add `--seed` (Hydra config) and set Python/NumPy/Torch seeds + DataLoader worker init.
  2. Auto-select DDP backend (`nccl` for CUDA, `gloo` otherwise); expose override.
  3. Add CPU DDP smoke job in CI.
- **Acceptance**: Seed reproducibility test (two runs same seed → identical loss trace); CI job green.

### 8. Documentation & licensing polish
- **Scope**: `pyproject.toml`, README, release docs.
- **Actions**:
  1. Align license declaration with `LICENSE` (Apache-2.0).
  2. Ensure all referenced scripts are shipped; add `scripts/run_e2e_smoke.sh`.
  3. Update README with memorization instructions and Muon requirements.
- **Acceptance**: Lint job confirms license metadata; README diff reviewed.

These items are **blocking** for Stage 2 long runs. Only after P0 checklist completion do we resume the training/eval roadmap below.

## 1. Training Runs
1. **Pilot (160M / 3B tokens)**
   - Objective: confirm stability, log teach-scale findings, generate base checkpoints for eval harnesses.
   - Actions: run `configs/hope/pilot.yaml` with the full shard mixture; log to W&B and artifacts/.
2. **Mid-scale (760M / 30B tokens)**
   - Objective: produce the headline zero-shot/NIAH results.
   - Actions: run `configs/hope/mid.yaml` (FSDP or DeepSpeed), capture checkpoints every ~50k steps.
3. **Target (1.3B / 100B tokens)**
   - Objective: long-context + continual-learning showcase.
   - Actions: integrate 8k context curriculum, run with DeepSpeed ZeRO-3, checkpoint frequently.

## 2. Evaluation Campaign
1. **Zero-shot pack** – Use `scripts/eval/zeroshot.py --tasks all` on pilot/mid/target checkpoints; store JSON in `eval/zeroshot_*.json` and plot aggregated table in `docs/experiments_report.md`.
2. **NIAH curves** – Run `scripts/eval/niah.py` (2048→512k) for each major checkpoint and plot accuracy vs. context length.
3. **Continual-learning** – Run `scripts/eval/continual.py` across chronological segments; generate forgetting plots and correlate with level clocks.

## 3. Baseline Comparisons
- Reproduce lighter TITAN/Transformer baselines (reuse refs or simple adaptations) to evaluate on the same data/eval tasks.
- Log results alongside HOPE for direct comparison in `reports/ablations.md` and W&B dashboards.

## 4. Ablations
1. Self-modifier on/off.
2. CMS depth variations (1 vs. 3 vs. 5 levels).
3. Deep optimizer variants per level.
4. Attention swap (full vs. sliding-window/DeltaNet).
Record commands + metrics in `reports/ablations.md`.

## 5. Documentation & Release
1. Update `docs/experiments_report.md` with tables/plots.
2. Record stability tricks and teach-scale notes in `docs/stability_journal.md`.
3. Prepare a blog/paper draft summarizing architecture, training setup, and results.
4. Tag a release (`v0.2-stage2-prep`) with checkpoints, configs, eval JSONs.

## 6. Outreach & Community
- Share follow-up results posts (link to W&B dashboards, zero-shot tables, long-context plots).
- Invite collaborators for continual-learning and scaling experiments via README/Issues/Discussions.

## 7. Tracking
- Keep `TODO.md` updated per milestone.
- Use W&B projects for each run (pilot/mid/target) and link them in `docs/stage2_progress.md`.


================================================
FILE: docs/PYPI_TRUSTED_PUBLISHING.md
================================================
# PyPI Trusted Publishing Setup

This repository ships `.github/workflows/release.yml` for OIDC-based publishing.
Use this checklist once per repository to activate it.
The same workflow also publishes a GitHub Release entry (Releases tab) with wheel/sdist/checksum assets for each tag.

## 1) Configure TestPyPI trusted publisher

In TestPyPI project settings (`nested-learning`):
- Publisher: **GitHub**
- Owner: `kmccleary3301`
- Repository: `nested_learning`
- Workflow name: `release.yml`
- Environment: `testpypi`

## 2) Configure PyPI trusted publisher

In PyPI project settings (`nested-learning`):
- Publisher: **GitHub**
- Owner: `kmccleary3301`
- Repository: `nested_learning`
- Workflow name: `release.yml`
- Environment: `pypi`

## 3) Validate release tags

- RC tags (publish to TestPyPI): `vX.Y.ZrcN`
- Stable tags (publish to PyPI): `vX.Y.Z`

Example:
```bash
git tag v0.2.0rc1
git push origin v0.2.0rc1
```

## 4) Verify workflow permissions

`release.yml` requires:
- `id-token: write` (for OIDC)
- `contents: write`

No long-lived PyPI API tokens are required.

## 5) Recommended first dry-run

1. Create RC tag and publish to TestPyPI.
2. Create clean virtualenv.
3. Install package from TestPyPI.
4. Run:
   - `python -m nested_learning --help`
   - `python -m nested_learning doctor --json`
   - `python -m nested_learning smoke --config-name pilot_smoke --device cpu --batch-size 1 --seq-len 8`

## 6) Verify GitHub release assets

After the tag workflow completes, confirm the Releases tab entry for that tag contains:
- `nested_learning-<version>-py3-none-any.whl`
- `nested_learning-<version>.tar.gz`
- `SHA256SUMS.txt`

## 7) Verify GitHub Packages tab (GHCR)

The repository also ships `.github/workflows/packages.yml`, which publishes:
- `ghcr.io/<owner>/nested-learning-dist:<tag>`

This is an OCI artifact bundle for distribution files (`dist/*`) and appears in the GitHub Packages tab.
Use PyPI for normal `pip install` workflows.


================================================
FILE: docs/STREAMING_CONTRACT.md
================================================
# Streaming Contract (Mechanism-Auditing Mode)

This document defines the exact streaming semantics used by the single-GPU mechanism-auditing path.

## Terms

- `sequence`: one tokenized training example of length `T` used for next-token LM loss.
- `segment`: one externally provided slice of a longer document (used in eval/inference workflows).
- `chunk`: one training-time online slice used inside `train.online_updates=true`.
- `batch`: the `B` sequences processed together by the dataloader.
- `fast-state context`: per-context mutable memory state (CMS/TITAN/self-mod) used for online updates.

## State Scope and Lifetime

- Base model parameters (meta params): persistent across all steps.
- Fast-state: initialized per batch in training when `train.use_fast_state=true`.
- With `data.batch_size>1`, fast-state is currently shared across examples in the same batch.
- In mechanism-auditing presets we set `data.batch_size=1` to preserve per-context semantics.

## CMS Buffer Lifecycle

For each CMS level `level_name`:

1. `initialize`: create empty buffer with `inputs`, `teach`, `active`, `count=0`.
2. `accumulate`: append current tokens and increment `count`.
3. `boundary update`: while `count >= update_period`, pop exactly `update_period` tokens and apply one update.
4. `finalize`:
   - if `cms_flush_partial_at_end=true`, flush remaining partial tokens once.
   - clear buffer contents and reset count to zero.
5. `reset`: equivalent to finalize + clear, used at sequence end.

## `finalize_updates` Contract

- `finalize_updates=false`:
  - accumulate/update only full `update_period` boundaries.
  - do not partial-flush.
  - keep pending tokens for the next chunk call.
- `finalize_updates=true`:
  - apply normal boundary updates.
  - optional partial flush (`cms_flush_partial_at_end=true`).
  - clear per-level CMS buffers after finalize.

Training uses `finalize_updates=true` only on the last chunk of the sequence.

## Chunk-Boundary Objective Semantics

Two training modes are supported:

1. **Overlap mode (default)**: one-token overlap between neighboring chunks.
2. **Boundary-target mode**: no overlap; each chunk receives explicit `next_tokens` boundary targets.

Example for tokens `[t0 t1 t2 t3 t4]` and `chunk_size=2`:

- Overlap mode:
  - chunk 1: `[t0 t1]` contributes pair `t0->t1`
  - chunk 2: `[t1 t2 t3]` contributes pairs `t1->t2`, `t2->t3`
  - chunk 3: `[t3 t4]` contributes pair `t3->t4`
- Boundary-target mode:
  - chunk 1: `[t0 t1]` + boundary target `t2`
  - chunk 2: `[t2 t3]` + boundary target `t4`
  - chunk 3: `[t4]` (no boundary target)

Total supervised pairs remain `T-1`.

Boundary-target mode is enabled with:
- `train.online_boundary_targets=true`

In addition, `train.online_carry_attention_cache=true` is the canonical paper-faithful setting for
transformer-backed chunked runs in this repo.
## Segment Semantics for Long Documents

- A segment is external input partitioning, not the same as training chunking.
- Optional attention-state carry is available via model attention cache APIs:
  - `model.init_attention_cache()`
  - `model(..., attention_cache=..., return_attention_cache=True)`
- Training can carry attention state across chunk calls when:
  - `train.online_boundary_targets=true`
  - `train.online_carry_attention_cache=true`
- Fast-memory updates can persist across steps when the caller reuses fast-state.

## Strict Mode

Set `train.strict_streaming_contract=true` to fail fast on known semantics violations:

- distributed training with unsupported paper-auditing features,
- fast-state with `data.batch_size>1`,
- `train.online_updates=true` with `train.use_fast_state=false`,
- non paper-defined variant under strict paper-auditing expectations,
- invalid boundary/carry combinations for online chunking.

## Cadence Verification Example

After a run that emits JSON metrics, validate a CMS level cadence:

```bash
uv run python scripts/checks/verify_update_cadence.py \
  --log-path logs/mechanism_audit_smoke.json \
  --metric-prefix layer0.cms.cms_mid \
  --total-tokens 8 \
  --update-period 4 \
  --output reports/cadence_mechanism_audit_smoke.json
```

Expected report keys:
- `ok`
- `metric_prefix`
- `expected`
- `observed`
- `checks`


================================================
FILE: docs/VERSIONING_POLICY.md
================================================
# Versioning and Stability Policy

This repository follows SemVer-style versioning with explicit 0.x constraints.

## Current Phase: 0.x

Before `1.0.0`, stability guarantees are intentionally limited:
- `0.x.y` patch releases should be non-breaking for normal workflows.
- `0.X.0` minor releases may include breaking changes to config schema, defaults, CLI behavior, or checkpoint metadata.

## Public Surface

Stable-ish surfaces (prioritized for compatibility):
- `nl` CLI commands and flags
- Hydra config schema for primary shipped configs
- checkpoint sidecar metadata fields used by verification tooling

Explicitly unstable surfaces:
- internal Python module APIs
- experimental mechanism paths and ablation-only options
- ad hoc scripts under `scripts/` unless documented as stable entrypoints

## Breaking Change Handling

When a release introduces breakage:
1. call it out in `CHANGELOG.md`,
2. include migration notes,
3. keep old behavior behind compatibility flags where reasonable for at least one minor cycle.

## Golden Environment vs Supported Ranges

- Golden reproduction environment: lockfile-based (`uv lock`, Python 3.12, PyTorch 2.9.x).
- Package metadata supports broader compatibility ranges for portability.
- If range installs diverge from golden behavior, prefer golden env for paper-faithful runs.



================================================
FILE: docs/compute_plan.md
================================================
# Compute Reservation Plan (Stage 2)

## Hardware
- Cluster: 2× nodes with dual NVIDIA RTX 6000 Ada (48 GB VRAM) + 64-core CPU + 512 GB RAM.
- Scheduler: Slurm (partition `gpu-a6000`), 2 nodes available concurrently.

## Reservations
| Phase | Resources | Duration | Window | Purpose |
|-------|-----------|----------|--------|---------|
| Pilot run | 1 node (2× A6000) | 3 days | Week 1 (Mon–Wed) | 160 M param sanity run, tokenizer validation |
| Ablations | 1 node | 2 days | Week 1 (Thu–Fri) | Self-modifier/CMS toggles at pilot scale |
| Mid-scale | 2 nodes | 10 days | Weeks 2–3 | 760 M training to 30 B tokens + evals |
| Mid evals | 1 node | 2 days | Week 3 (end) | Zero-shot + NIAH scripts on mid checkpoint |
| Target warmup | 2 nodes | 3 days | Week 4 (start) | 1.3 B config dry run (short token budget) |
| Target full run | 2 nodes | 14 days | Weeks 4–6 | 1.3 B / 100 B tokens |
| Final evals | 1 node | 3 days | Week 6 | Long-context + continual learning |

## Actions
1. Submit Slurm reservations (`scripts/compute/create_reservations.sh`) for the windows above; tag jobs with `NL-Stage2`.
2. Pre-stage datasets/token shards on node-local NVMe before each run to avoid network bottlenecks.
3. Enable checkpoint mirroring to shared storage every 12 hours for resilience.
4. Maintain utilization log in `reports/compute_usage.md` (to be created after first run).


================================================
FILE: docs/continual_classification_eval.md
================================================
# Continual Classification Evaluation (CLINC / Banking77 / DBpedia14)

The Nested Learning paper highlights **class-incremental continual learning** in the text classification
domain (CLINC, Banking77, DBpedia). This repo provides a lightweight, implementation-first harness that
treats classification as **generative label selection**:

- Prompt: `Text: ... \nLabel:`
- Score each candidate label by log-probability of the label string
- Optionally apply HOPE/TITAN/CMS **test-time memorization** after each example (fast-state by default)

## Script

Use `scripts/eval/continual_classification.py`.

### Smoke run (CPU)

```bash
uv run python scripts/eval/continual_classification.py \
  --config configs/pilot_smoke.yaml \
  --checkpoint artifacts/checkpoints/pilot_smoke/step_000010.pt \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --dataset clinc \
  --smoke \
  --device cpu \
  --output eval/continual_cls_smoke.json
```

### Memorization-enabled run

```bash
uv run python scripts/eval/continual_classification.py \
  --config configs/hope/pilot_attention.yaml \
  --checkpoint artifacts/checkpoints/pilot/step_230000.pt \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --dataset banking77 \
  --task-size 10 --train-per-label 25 --eval-per-label 25 \
  --memorize --memorize-steps 1 \
  --memorize-paths titan,cms_fast \
  --memorize-surprise-threshold 0.02 \
  --device cuda:0 \
  --output eval/continual_cls_banking77.json
```

Notes:
- `--task-size` controls class increments (how many labels per task).
- `--memorize-no-reset` (default) keeps the fast-state across examples/tasks, matching a continual setting.
- For “pure baseline” continual evaluation, omit `--memorize`.

### Offline / local JSONL

If you don’t want to rely on HuggingFace downloads, supply a JSONL file:

```bash
uv run python scripts/eval/continual_classification.py \
  --config configs/pilot_smoke.yaml \
  --checkpoint artifacts/checkpoints/pilot_smoke/step_000010.pt \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --local-jsonl data/local_continual_fixture.jsonl \
  --task-size 3 --train-per-label 2 --eval-per-label 2 \
  --smoke --device cpu \
  --output eval/continual_cls_local.json
```

Each line must be: `{"text": "...", "label": "..."}`.

## Output

The JSON contains:
- `task_accuracy_matrix[i][j]`: accuracy on task `i` evaluated after finishing task `j`
- `avg_accuracy_final`: average accuracy after the last task
- `avg_forgetting`: average (`max_acc_i - final_acc_i`) across tasks

This harness is intentionally lightweight so the community can refine the exact protocol to match the
paper’s class-incremental schedules and reporting conventions.

## Plotting

```bash
uv run python scripts/eval/plot_continual_classification.py \
  --continual-json eval/continual_cls_banking77.json \
  --output reports/plots/continual_cls_banking77.png
```


================================================
FILE: docs/continual_eval.md
================================================
# Continual-Learning Evaluation Guide

Use `scripts/eval/continual.py` to quantify forgetting across streaming segments. Supply:

- `--config`: Hydra config for the HOPE model.
- `--checkpoints`: ordered list of checkpoint paths (chronological training steps).
- `--segments-yaml`: YAML describing segment names + shard directories (see `configs/data/continual_segments_sample.yaml`).
- `--batch-size`, `--max-batches`: evaluation throughput controls (0 = entire shard).
- `--eval-state-mode`: `reset_per_sample` (default) or `carry_across_samples`.
- `--eval-use-fast-state` / `--eval-use-attention-cache`: enable inference-time streaming state carry semantics.

Example:
```bash
uv run python scripts/eval/continual.py \
  --config configs/hope/mid.yaml \
  --checkpoints checkpoints/mid/step_000050.pt checkpoints/mid/step_000100.pt \
  --segments-yaml configs/data/continual_segments_sample.yaml \
  --batch-size 4 --max-batches 20 \
  --eval-state-mode carry_across_samples \
  --eval-use-attention-cache \
  --memorize --memorize-steps 2 \
  --memorize-paths titan,cms_fast \
  --memorize-surprise-threshold 0.02 \
  --output eval/continual_mid.json
```

With memorization enabled the output includes baseline vs. memorize cross-entropy, Titan/CMS update stats per segment, the active memory paths, and the surprise threshold used. Adjust `--memorize-paths` (comma-separated) to restrict which levels update (e.g., `titan` only, or `titan,cms_fast`) and `--memorize-surprise-threshold` to replicate the paper’s surprise gating.

Note: memorization uses a per-context fast state by default, so evaluation does not mutate checkpoint weights.

To visualize forgetting curves:

```bash
uv run python scripts/eval/plot_forgetting.py \
  --continual-json eval/continual_mid.json \
  --segment refinedweb_2018 \
  --output reports/plots/continual_mid_refinedweb.png
```

The plot overlays baseline vs. memorize loss across checkpoints for the chosen segment. For full-scale runs, replace the sample YAML with the production segment list (e.g., chronological Wikipedia shards, MAWI sequences, etc.) and archive both the JSON and plot in your checkpoint report.


================================================
FILE: docs/data_pipeline.md
================================================
# Data Pipeline (Stage 2)

This document explains how to generate tokenizer artifacts and token shards for Stage 2 training.

## Prerequisites
- Ensure the `uv` environment is synced (`uv sync --all-extras`).
- Large storage mounted at `data/raw/` and `data/shards/`.
- HF datasets cache configured with valid credentials if accessing gated sets.

## Dataset acquisition & licensing
The Stage 2 mixture mimics RefinedWeb + supplements. Download each source into `data/raw/<source>/` and document provenance before filtering.

| Source | License / Terms | Acquisition Command(s) | Notes |
|--------|-----------------|------------------------|-------|
| RefinedWeb / FineWeb proxy | CC BY 4.0 (FineWeb) | `uv run python scripts/data/shard_corpus.py --dataset HuggingFaceFW/fineweb --subset sample-10BT --split train --output data/raw/refinedweb.ndjsonl --limit 20000000` | Keep a copy of the HF dataset card; respect scraping policies. |
| FineWeb-Edu | CC BY 4.0 (FineWeb) | Use `HuggingFaceFW/fineweb-edu` (e.g., `subset=sample-10BT`) via `scripts/data/filter_corpus.py` + `scripts/data/process_mixture.py`. | Paper-aligned option; prefer long-doc filtering if matching the paper’s setup. |
| Wikipedia dump (HF `20220301.en` snapshot) | CC BY-SA 3.0 | Download `https://huggingface.co/datasets/wikipedia/20220301.en` via HF CLI or mirror the XML dump. | Use HF `datasets load_dataset` inside the filtering script to avoid storing raw XML; update the snapshot reference here if a newer dump is substituted. |
| C4 (en) | ODC-By | `uv run python scripts/data/shard_corpus.py --dataset allenai/c4 --subset en --split train --output data/raw/c4_en.ndjsonl --limit 8000000` | Heavy dataset; ensure disk quota before streaming. |
| RedPajama CC subset | CC BY | Use `togethercomputer/RedPajama-Data-1T-Sample` or the CC subset tarballs. | Store gzipped JSONL files under `data/raw/redpajama/*.jsonl.gz`. |
| Code (Stack/Python mix) | Mostly MIT/Apache | Pull from `bigcode/starcoderdata` shards or permissively licensed repos. | Preserve LICENSE metadata per shard (`data/raw/code/LICENSES.md`). |

Every corpus contribution is tracked in `data/manifest/refinedweb_full_manifest.json`. Regenerate or edit this manifest whenever the mixture changes so downstream runs can validate shard presence and licensing.

To verify the manifest against local shards:

```bash
uv run python scripts/data/validate_mixture.py \
  --manifest data/manifest/refinedweb_full_manifest.json \
  --output data/mixtures/refinedweb_mix_manifest_report.json
```

All raw pulls should include a short README describing the source URL, date retrieved, and any filters applied. Update `docs/data_pipeline.md` whenever the mix changes so downstream users know which corpora are safe to redistribute.

## 1. Train tokenizer (multi-corpus manifest)

```bash
uv run python scripts/data/train_tokenizer.py \
  --manifest configs/data/refinedweb_mixture.yaml \
  --vocab-size 32000 \
  --output-dir artifacts/tokenizer/refinedweb_mix \
  --log-file data/mixtures/refinedweb_mix_tokenizer.json
```

The manifest pulls small samples from FineWeb (RefinedWeb proxy), Wikimedia/Wikipedia, AllenAI C4, SlimPajama, and codeparrot code datasets. Outputs live in `artifacts/tokenizer/refinedweb_mix/`.

### Sample pipeline note (hard vocab limit)
When training a tokenizer on **tiny local samples**, SentencePiece can fail if it cannot reach the requested `--vocab-size` (default `hard_vocab_limit=true`).

For the repo’s sample pipeline (`scripts/data/run_sample.sh`), we disable this check:

```bash
uv run python scripts/data/train_tokenizer.py ... --no-hard-vocab-limit
```

For “paper-faithful” runs, prefer training on a sufficiently large corpus and keep the default `--hard-vocab-limit`.

### Tokenizer checksum
Record the checksum of every published tokenizer so collaborators can verify integrity before launching runs.

```bash
uv run python scripts/data/check_tokenizer.py \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --expected-sha256 f8871517ca968839bf6b9595a6e7891e6b8c6a70fd4df788696bce35be62d6c2 \
  --metadata-json artifacts/tokenizer/refinedweb_mix/checksum.json
```

The command prints the SHA-256 digest and writes a JSON record (optional). Keep the expected hash in this doc so CI/scripts can assert integrity. Update the hash whenever the tokenizer is retrained.

### Coverage sanity check
Before publishing a tokenizer, capture coverage metrics on a representative sample:

```bash
uv run python scripts/data/check_tokenizer_coverage.py \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --sample-file data/filtered/refinedweb_en_sample.txt \
  --max-lines 5000 \
  --output data/mixtures/refinedweb_mix_tokenizer_coverage.json
```

The script reports tokens/word, proportion of single-token words, and a histogram of piece lengths. Add the JSON to your release bundle so collaborators can verify coverage.

#### Automated regression guard
Add a regression check to CI or pre-release automation to ensure coverage does not drift:

```bash
uv run python scripts/checks/tokenizer_coverage_guard.py \
  --baseline data/mixtures/refinedweb_mix_tokenizer_coverage.json \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --sample-file data/filtered/refinedweb_en_sample.txt \
  --max-lines 5000 \
  --output data/mixtures/refinedweb_mix_tokenizer_coverage_latest.json
```

The guard fails if `avg_tokens_per_word` increases by more than `0.05` or if the single/two-token coverage drops by more than `2 %`. Adjust tolerances via CLI flags if a new tokenizer intentionally changes segmentation. Include the generated JSON in release bundles alongside the manifest validation report.

## 2. Shard mixture components

```bash
uv run python scripts/data/process_mixture.py \
  configs/data/refinedweb_mixture_filtered.yaml \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --log-file data/mixtures/refinedweb_mix_filtered_shards.json
```

This iterates over each dataset entry (either streamed from HF or the filtered local files), tokenizes at sequence length 2048, and writes NumPy shards to `data/shards/<dataset>`. Stats (records, sequences, shards, total tokens) are recorded in the path passed via `--log-file` (here `data/mixtures/refinedweb_mix_filtered_shards.json`; the full-scale run uses `data/mixtures/refinedweb_mix_shards_full.json`).

## 3. Legacy pilot data
- `data/shards/tinystories_train/` retains 1,718 shards for unit tests and smoke runs.

## Troubleshooting Matrix

| Symptom | Likely cause | Deterministic fix |
|---|---|---|
| `run_sample.sh` cannot find `artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model` | Tokenizer has not been trained yet | Re-run `uv run bash scripts/data/run_sample.sh`; it auto-trains tokenizer when missing |
| `Bad split: train. Available splits: ['test']` | Dataset exposes a non-`train` split | Use fallback (`FALLBACK_SPLIT=test uv run bash scripts/data/run_full.sh`) or set per-source split env vars like `RW_SPLIT=test` |
| `Bad split` in tokenizer/shard/filter scripts | Requested split absent in source dataset | Built-in fallback now resolves in order `train -> validation -> test -> first available` and logs available splits |
| SentencePiece fails to hit requested vocab size on tiny corpora | `hard_vocab_limit=true` with too little data | Use `--no-hard-vocab-limit` for sample runs; keep hard limit for large production corpora |
| Tokenizer coverage regresses between runs | Different corpus sample or tokenizer settings | Run `scripts/data/check_tokenizer_coverage.py` and `scripts/checks/tokenizer_coverage_guard.py` against baseline JSON |

## 4. Filtering & deduplication
Before sharding full-scale corpora, run language filtering + dedup to keep only high-quality English segments:

```bash
uv run python scripts/data/filter_corpus.py \
  --dataset HuggingFaceFW/fineweb \
  --subset sample-10BT \
  --split train \
  --text-column text \
  --output-path data/filtered/fineweb_en.txt \
  --min-chars 200 \
  --max-chars 8000 \
  --lang-threshold 0.85
```

Adjust dataset/subset arguments per manifest entry. The script enforces language probabilities via `langdetect`, performs length screening, and deduplicates using a rolling hash window. Point `scripts/data/process_mixture.py` to these filtered files (or custom dataset definitions) for large-scale processing.

## 4.1 FineWeb-Edu manifests (paper-aligned)

This repo includes two manifest recipes for FineWeb-Edu:
- `configs/data/fineweb_edu_mixture_sample.yaml` (subset `sample-10BT`, bounded `max_records`)
- `configs/data/fineweb_edu_mixture_full.yaml` (subset `sample-100BT`, `seq_len=4096`)

Tokenizer training:
```bash
uv run python scripts/data/train_tokenizer.py \
  --manifest configs/data/fineweb_edu_mixture_sample.yaml \
  --vocab-size 32000 \
  --output-dir artifacts/tokenizer/fineweb_edu \
  --log-file data/mixtures/fineweb_edu_tokenizer_samples.json
```

Sharding:
```bash
uv run python scripts/data/process_mixture.py \
  configs/data/fineweb_edu_mixture_sample.yaml \
  --tokenizer-path artifacts/tokenizer/fineweb_edu/spm_32000_unigram.model \
  --log-file data/mixtures/fineweb_edu_sample_shards.json
```

If you want to more closely mimic “long document” regimes, filter first (higher `min_chars` / `max_chars`)
and then switch the manifest entry to `dataset: text` + `data_files: <filtered_file>`. The tokenizer and
sharding scripts accept `data_files` and will enforce the requested split.

### 4.1.1 FineWeb-Edu long-doc filtered sample (turnkey)

For a concrete, paper-aligned “long document” recipe, use:
- `configs/data/fineweb_edu_longdoc_filtered_sample.yaml`

Step 1 — create a filtered long-doc file (example settings; tune `min_chars`/`max_chars` to match your needs):

```bash
uv run python scripts/data/filter_corpus.py \
  --dataset HuggingFaceFW/fineweb-edu \
  --subset sample-10BT \
  --split train \
  --text-column text \
  --target-lang en \
  --lang-threshold 0.85 \
  --min-chars 2000 \
  --max-chars 20000 \
  --limit 5000 \
  --output-path data/filtered/fineweb_edu_longdoc_en_sample.txt \
  --force-exit
```

Step 2 — train a tokenizer on that filtered file:

```bash
uv run python scripts/data/train_tokenizer.py \
  --manifest configs/data/fineweb_edu_longdoc_filtered_sample.yaml \
  --vocab-size 32000 \
  --output-dir artifacts/tokenizer/fineweb_edu_longdoc \
  --log-file data/mixtures/fineweb_edu_longdoc_tokenizer_samples.json
```

Step 3 — shard into tokenized `.npy` shards:

```bash
uv run python scripts/data/process_mixture.py \
  configs/data/fineweb_edu_longdoc_filtered_sample.yaml \
  --tokenizer-path artifacts/tokenizer/fineweb_edu_longdoc/spm_32000_unigram.model \
  --log-file data/mixtures/fineweb_edu_longdoc_sample_shards.json
```

All outputs (`data/filtered/`, `data/shards/`, `artifacts/tokenizer/`) are gitignored.

## 5. Artifacts & stats
- Tokenizer samples: `data/mixtures/refinedweb_mix_tokenizer.json`
- Shard stats (pilot stream): `data/mixtures/refinedweb_mix_shards.json`
- Shard stats (filtered sample run): `data/mixtures/refinedweb_mix_filtered_shards.json`
- Shard stats (full filtered run, seq_len=2048): `data/mixtures/refinedweb_mix_shards_full.json`
- Latest corpus verification log: `logs/data_inventory_2025-11-10.md` (matches `data/mixtures/refinedweb_mix_shards_full.json` with `verified_at_utc` timestamp).
- Tokenizer model: `artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model`
- Continual-learning sample segments: `configs/data/continual_segments_sample.yaml`

## 6. Next steps
- Integrate the full shards into the training configs (see `configs/hope/mid.yaml`, `configs/hope/target.yaml`).
- Automate periodic re-generation (e.g., weekly) if new data arrives.
- Version mixture manifests and stats under `configs/data/` as recipes evolve.


================================================
FILE: docs/env_matrix.md
================================================
# Environment Matrix – Stage 2

This document captures the exact runtime state used for the Stage 2 sprint so collaborators can reproduce the setup without guesswork.

## 1. Runtime Summary

| Component | Version | Notes / Verification |
|-----------|---------|----------------------|
| OS | Ubuntu 22.04 LTS (kernel 6.x) | `cat /etc/os-release` (see host) |
| Python | 3.12.2 (conda-forge build) | `uv run python -V` |
| uv | 0.9.8 | `uv --version` |
| PyTorch | 2.9.0+cu128 | `uv run python -c "import torch; print(torch.__version__)"` |
| torchvision | 0.24.0+cu128 | `uv run python -c "import torchvision; print(torchvision.__version__)"` |
| torchaudio | 2.9.0+cu128 | `uv run python -c "import torchaudio; print(torchaudio.__version__)"` |
| CUDA runtime | 12.8 (PyTorch wheels) | `uv run python -c "import torch; print(torch.version.cuda)"` |
| NVIDIA driver | 550.90.07 | `nvidia-smi --query-gpu=name,driver_version --format=csv` |
| GPUs | 2 × NVIDIA RTX 6000 Ada (49 GB) | Prefer `cuda:1` for single-GPU jobs |

## 2. uv / Dependency Management
- `pyproject.toml` + `uv.lock` pin all Python dependencies.
- Sync command: `uv sync --all-extras`.
- When installing torch 2.9 manually: `uv pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/cu128`.
- Cache guidance: set `UV_CACHE_DIR=/tmp/uv-cache` if default path lacks space.

## 3. GPU Usage Notes
- Default to `cuda:1` for long single-GPU training/eval to avoid interfering with tmux sessions pinned to GPU0.
- Distributed jobs use `torchrun --nproc_per_node=2` with both GPUs.
- Driver 550.90.07 (driver-level CUDA 12.4) is confirmed compatible with the PyTorch 2.9.0/cu128 wheel, which bundles its own CUDA 12.8 runtime; no additional toolkit install is needed.
- Enable `NCCL_IB_DISABLE=1` if networking errors appear (not observed yet).

## 4. Verification Checklist
Run the following snippet after provisioning a new machine to confirm parity:

```bash
uv --version
uv run python -V
uv run python - <<'PY'
import torch, torchvision, torchaudio
print('torch', torch.__version__, 'cuda', torch.version.cuda)
print('torchvision', torchvision.__version__)
print('torchaudio', torchaudio.__version__)
print('device0', torch.cuda.get_device_name(0))
PY
nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader
```

Record the outputs in `logs/env_checks/<date>.txt` before running large jobs.

## 5. Known Good Combinations
| Stack | Status | Notes |
|-------|--------|-------|
| torch 2.9.0 + torchvision 0.24.0 + CUDA 12.8 | ✅ | Current default; supports FlashAttention and Muon optimizers. |
| torch 2.9.0 + torchvision 0.23.x | ❌ | Version mismatch; torchvision 0.23 expects torch 2.8. |
| torch 2.5.0 + torchvision 0.20.0 | ✅ legacy | Use only if targeting older runs (no muon support). |

## 6. Process
1. Clone repo → `git clone https://github.com/kmccleary3301/nested_learning.git`.
2. `cd nested_learning && uv sync --all-extras`.
3. Verify versions via checklist above.
4. Export `WANDB_API_KEY` from `git.env` (sourced manually) before training.
5. Launch jobs via `uv run ...` to guarantee the locked environment.

Keeping this matrix current prevents silent drifts when PyTorch or CUDA releases change. Update it whenever the `uv.lock` or driver stack changes.


================================================
FILE: docs/experiments_report.md
================================================
# Experiments Report – Nested Learning Reproduction

_Draft covering work completed through 9 Nov 2025. This document is meant to accompany the initial public release so contributors understand what has been reproduced and what remains._

---

## 1. Overview
- **Goal:** Reproduce key aspects of Google's Nested Learning (HOPE) architecture using public tooling (`uv`, PyTorch 2.9.0) and release a community-ready codebase.
- **Hardware:** Dual RTX 6000 Ada (49 GB each). All long-running experiments in this report use a single GPU (`cuda:1`) to accommodate other projects on the host.
- **Data:** Filtered RefinedWeb mixture (FineWeb, Wikipedia, C4, SlimPajama, CodeParrot). Sample pipeline (`scripts/data/run_sample.sh`) for smoke tests; full pipeline (`scripts/data/run_full.sh`) for larger runs. Tokenizer: SentencePiece unigram 32k.

---

## 2. Experimental Setup
| Component | Details |
|-----------|---------|
| Framework | PyTorch 2.9.0, CUDA 12.8 (cu128 wheels; see `docs/env_matrix.md`) |
| Dependency Mgmt | `uv` with `pyproject.toml` + `uv.lock` |
| Logging | JSON logs under `logs/` (W&B optional but disabled for release) |
| Training Driver | `train.py` (single GPU), `train_dist.py` (torchrun) |
| Evaluation | `scripts/eval/zeroshot.py`, `scripts/eval/niah.py`, `scripts/eval/continual.py` |
| Teach Signal | Outer teach signal derived from logits residual; scale/clip adjustable per config with runtime scheduling |

### Key Configurations
1. **HOPE Mid (single GPU)**
   - Config: `configs/mid_stage2.yaml`
   - Dim = 768, 18 layers, 12 heads, TITAN-level + CMS levels (fast/mid/slow/ultra)
   - Teach schedule: warmup 60 steps, decay start 140, duration 80 (for 220-step run)
   - Gradient clipping applied inside TITAN and CMS blocks

2. **TITAN Baseline**
   - Config: `configs/mid_titan_baseline.yaml` (`model.type=titan`)
   - Same backbone (attention + TITAN memory) but no CMS/self-mod update path
   - Teach schedule mirrors HOPE run to enable apples-to-apples comparison

---

## 3. Experiments

### 3.1 Data Pipeline Validation
| Command | Purpose |
|---------|---------|
| `uv run bash scripts/data/run_sample.sh` | Smoke-friendly filtering + sharding (RefinedWeb/Wiki/C4/SlimPajama/Code) |
| `RW_LIMIT=20000 ... uv run bash scripts/data/run_full.sh` | Full pipeline (run in tmux `data_full`) to produce `_full` shards |
| `uv run python scripts/data/process_mixture.py configs/data/refinedweb_mixture_full.yaml ...` | Re-sharding with SentencePiece tokenizer |

Artifacts: `data/filtered/*_full.txt`, `data/shards/*_full`, stats in `data/mixtures/refinedweb_mix_full_shards.json`.

- Manifest validation: `data/manifest/refinedweb_full_manifest.json` lists every corpus (shard dir, license, download URL). Running `uv run python scripts/data/validate_mixture.py --manifest ...` produces overlap and size stats (`data/mixtures/refinedweb_mix_manifest_report.json`) so we can spot missing/duplicate shards before training.
- Tokenizer coverage: `scripts/data/check_tokenizer_coverage.py` now emits coverage JSON (`data/mixtures/refinedweb_mix_tokenizer_coverage.json`). On the filtered RefinedWeb sample the 32k unigram tokenizer averages 1.34 tokens/word with ~77% single-token words, confirming adequate coverage before scaling runs.

### 3.2 HOPE vs TITAN (single GPU, 220 steps)
All runs below use batch size 4, optimizer LR 1e‑5, teach_scale 0.10, teach_clip 4.0, runtime schedule (warmup 60, decay 140→220). Commands launched via tmux to keep the CLI free.

| Model | Checkpoint | PIQA (128) | Winogrande (128) | Notes |
|-------|------------|------------|------------------|-------|
| HOPE | `artifacts/checkpoints/mid_stage2_ts10_single220_schedD/step_000220.pt` | 0.469 | 0.594 | Loss drops from 10.55 → 8.55; NIAH still ~0 |
| TITAN | `artifacts/checkpoints/mid_titan_baseline/step_000200.pt` | 0.469 | 0.594 | Loss similar; continuous memory absent |

NIAH results (`eval/niah_mid_stage2_ts10_single220_schedD.json`, `eval/niah_mid_titan_baseline.json`) remain near random at 2k/4k tokens for both models. Continual-learning logs are finite but noisy (short runs). A longer training window is needed to expose the advantages cited in the paper (e.g., HOPE surpassing TITAN on long-context recall).

### 3.3 Teach-Scale Sweep (short runs)
| teach_scale | Configuration | Checkpoint | Final loss (step 40) |
|-------------|---------------|------------|----------------------|
| 0.05 | `logs/mid_stage2_single_ts05.json` | `artifacts/checkpoints/mid_stage2_single_ts05/step_000040.pt` | 9.81 |
| 0.10 | `logs/mid_stage2_single_ts10.json` | `artifacts/checkpoints/mid_stage2_single_ts10/step_000040.pt` | 9.77 |
| 0.20 | `logs/mid_stage2_single_ts20.json` | `artifacts/checkpoints/mid_stage2_single_ts20/step_000040.pt` | 9.76 |

Even at 0.20, residual clipping kept the run stable, indicating headroom for larger teach scales once the data window grows.

### 3.4 Dual-GPU Smoke (HOPE)
| Command | Output |
|---------|--------|
| `uv run torchrun --nproc_per_node=2 train_dist.py --config-name mid_stage2_smoke` | `artifacts/checkpoints/mid_stage2_smoke/step_000060.pt`, `logs/mid_stage2_smoke.json` |
| `uv run python scripts/eval/zeroshot.py ...` | `eval/zeroshot_mid_stage2_smoke.json` |
| `uv run python scripts/eval/niah.py ...` | `eval/niah_mid_stage2_smoke.json` |
| `uv run python scripts/eval/continual.py ...` | `eval/continual_mid_stage2_smoke.json` |

These runs validate the distributed training/eval path and are the recommended “smoke” workflows for contributors.

### 3.5 Test-Time Memorization Harness
HOPE/TITAN models now support TITAN-style test-time learning via shared CLI flags:

```
uv run python scripts/eval/zeroshot.py \
  --config configs/mid_stage2_smoke.yaml \
  --checkpoint artifacts/checkpoints/mid_stage2_smoke/step_000060.pt \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --tasks piqa \
  --max-samples 32 \
  --output eval/zeroshot_mid_stage2_smoke_piqa_mem.json \
  --device cuda:1 \
  --memorize \
  --memorize-steps 2 \
  --memorize-use-correct-answer
```

NIAH and continual harnesses expose analogous options (`--memorize`, `--memorize-steps`, `--memorize-no-reset`, `--memorize-use-correct-answer`). The memorization loop replays the prompt (optionally augmented with the correct answer) through the teach-signal pathway before each eval query, letting us probe TITAN-style “learning at test time”.

Pilot PIQA example (32-sample subset, single GPU):

| Mode | Command / Output | Accuracy |
|------|------------------|----------|
| Baseline | `eval/zeroshot_mid_stage2_smoke_piqa_baseline.json` | 0.5625 |
| Memorize (prompt + answer, 2 steps) | `eval/zeroshot_mid_stage2_smoke_piqa_mem.json` | 0.5625 |

At this scale, memorization neither helps nor hurts, but the infrastructure is in place to replicate the substantial gains reported in HOPE/TITAN once longer contexts and richer checkpoints are available.

### 3.6 Long-context diagnostics (pilot step 230k)
- **Passkey retrieval (`eval/passkey_pilot_step230000.json`):** 64 prompts with 256 filler sentences each. Accuracy baseline vs memorize is flat at 0.484 while Titan updates average ~2.13 (CMS-fast disabled). This confirms the harness works but also shows we need longer training to see the passkey delta reported in the paper.
- **PG-19 perplexity (`eval/pg19_pilot_step230000.json`):** Streaming PG-19 excerpts truncated to 2048 tokens yield PPL ≈ 2.5k for both baseline and memorize settings (4 samples). The script is part of the pilot suite so future checkpoints can report comparable long-form perplexities out-of-the-box.

### 3.7 Continual forgetting plots
`scripts/eval/continual.py` now records both baseline and memorize CE per segment. Running it on checkpoints `[5k, 10k, 230k]` and passing the JSON into `scripts/eval/plot_forgetting.py` produces `reports/plots/continual_pilot_refinedweb.png`, which shows continual CE dropping from ~48 at step 5k to ~8 at step 230k on the RefinedWeb segment (memorization on). These plots will accompany every checkpoint report going forward.

### 3.8 Pilot (3 B tokens) – 230 k-step snapshot
- **Config:** `configs/pilot.yaml` (dim 512, 12 layers, TITAN + CMS fast/mid/slow/ultra, teach_schedule warmup 2k → decay 120k→140k). Train batch = 6, seq_len = 2048, Muon optimizer, bf16 autocast + SDPA + `torch.compile`.
- **Run status:** The HOPE pilot reached step 246 667 (≈3.0 B tokens). We package the step 230 000 checkpoint as the release artifact because it predates the LR cooldown and logged stable eval metrics.
- **Metrics (memorization enabled, 256-sample cap per task):**

  | Eval | HOPE (step 230k) | TITAN (step 9k, reference) |
  |------|------------------|----------------------------|
  | PIQA | **0.496** | 0.492 |
  | HellaSwag | 0.297 | – |
  | Winogrande | 0.473 | – |
  | ARC-E / ARC-C | 0.285 / 0.234 | – |
  | BoolQ | 0.367 | – |
  | SIQA | 0.316 | – |
  | CommonSenseQA | 0.180 | – |
  | OpenBookQA | 0.113 | – |
  | NIAH (2 k → 65 k) | 0.625 / 0.50 / 0.375 / 0.50 / 0.75 / 0.50 | 0.50 @ 2–8 k |
  | Continual CE (RefinedWeb/Wiki/C4/RP) | 8.06 / 7.79 / 7.68 / 7.95 | 12–14 |

- **Packaging:** `artifacts/pilot_release/` mirrors the 230 k checkpoint (`checkpoint.pt`), config snapshot, pilot logs, metadata with the 3 B-token goal, and eval JSONs (legacy step 22 k + new step 230 k). TITAN short-run metrics remain bundled.
- **Next:** With both HOPE (step 230 k) and TITAN (step 25 k) packaged, the immediate tasks are (1) run the queued ablations (teach-scale, CMS chunking, optimizer swaps) on the HOPE checkpoint tree, and (2) extend evaluation coverage to larger configs before resuming the HOPE long run past 246 k steps.

- **TITAN baseline (25 k steps):** The long run on `configs/mid_titan_baseline.yaml` wrapped at step 25 000 (`artifacts/checkpoints/mid_titan_baseline/step_025000.pt`, W&B `titan-long-20251113192738`). Fresh evals (memorization on, 256 max samples) show:

  | Eval | TITAN (step 25k) |
  |------|------------------|
  | PIQA / HellaSwag / Winogrande | 0.484 / 0.293 / 0.480 |
  | ARC-E / ARC-C / BoolQ / SIQA | 0.281 / 0.250 / 0.398 / 0.293 |
  | CSQA / OpenBookQA | 0.188 / 0.145 |
  | NIAH (2 k → 65 k) | 0.50 / 0.625 / 0.125 / 0.75 / 0.50 / 0.125 |
  | Continual CE (RefinedWeb/Wiki/C4/RP) | 8.36 / 8.12 / 7.85 / 8.11 |

  Outputs live in `eval/zeroshot_titan_step25000.json`, `eval/niah_titan_step25000.json`, `eval/continual_titan_step25000.json` (also copied into `artifacts/pilot_release/` alongside `titan_step_025000.pt`). These numbers now provide the matched baseline for HOPE step 230 k comparisons and upcoming ablations.


---

## 4. Observations & Lessons Learned
1. **NaNs past 80 steps:** Early runs blew up after 80 steps once teach_scale exceeded 0.05. Introducing runtime scaling + residual clipping inside TITAN/CMS eliminated the NaNs and allowed 220-step runs on a single GPU.
2. **Batch-size constraints:** With only one GPU, we reduced per-GPU batch to 4 to stay within 49 GB VRAM. DDP runs will need gradient checkpointing or FSDP to scale further.
3. **NIAH is data hungry:** Every HOPE/TITAN run so far shows near-random recall at 2k/4k tokens; longer contexts and more tokens are required to differentiate architectures.
4. **Teach signal scheduling:** A linear warmup (60 steps) followed by linear decay (start 140) kept the 220-step run stable. Future runs should explore cosine or per-level schedules.

---

## 5. Limitations
- Current comparisons cover only the 160 M-scale HOPE/TITAN pair; larger configs (760 M / 1.3 B) remain untrained.
- Scaling beyond the pilot is still blocked on additional compute + stability sweeps for teach_scale, CMS depth, and optimizer variants.
- DDP/TITAN runs still rely on JSON logging; integration with structured logging (e.g., W&B) is deferred to future contributors.
- Pipeline uses filtered RefinedWeb proxies; exact data parity with Google’s internal corpora is not guaranteed.

---

## 6. Next Steps
1. **Longer Runs:** Extend both HOPE and TITAN baselines to billions of tokens using FSDP/DeepSpeed (target ≥760 M parameter config).
2. **Eval Coverage:** Integrate full RAFT/ARC suite plus additional long-context datasets (Needle-in-a-Haystack 32k, PassKey tasks).
3. **HPO:** Once stable runs exist, sweep teach_scale/clip, CMS depth, and self-mod learning rates to quantify HOPE vs TITAN gains.
4. **Automation:** Add CI for data sampling + dual-GPU smoke to catch regressions, and consider nightly tmux scripts for longer training jobs.

### 3.9 HOPE Pilot Relaunch (toward step 250 k, surprise-gated)

- **Config:** `configs/pilot.yaml` with Muon outer optimizer, `nl_l2_precond` inner variant, `teach_scale=0.10`, `surprise_threshold=0.02`.
- **Checkpoint:** `artifacts/checkpoints/pilot_relaunch/step_477000.pt` (verified via `scripts/checkpoint/verify.py`; sidecars stored alongside the checkpoint).
- **Eval suite:** `eval/zeroshot_pilot.json`, `eval/niah_pilot.json`, `eval/continual_pilot.json`, `eval/passkey_pilot.json`, `eval/pg19_pilot.json`.
- **Report:** `reports/checkpoints/pilot_relaunch_step477000.md`.
- **Note:** with `surprise_threshold=0.02` the memorize harness recorded 0 update events on these short prompts, so memorization deltas are ≈0 (expected for this gated configuration).

### 3.10 TITAN Long Baseline Relaunch (toward step 25 k)

- **Config:** `configs/mid_titan_baseline.yaml`, `teach_scale=0.10`, `surprise_threshold=0.02`.
- **Checkpoint:** `artifacts/checkpoints/mid_titan_long/step_032000.pt` (verified via `scripts/checkpoint/verify.py`; sidecars stored alongside the checkpoint).
- **Eval suite:** `eval/zeroshot_titan.json`, `eval/niah_titan.json`, `eval/continual_titan.json`, `eval/passkey_titan.json`, `eval/pg19_titan.json`.
- **Report:** `reports/checkpoints/titan_long_step32000.md`.
- **Note:** with `surprise_threshold=0.02` the memorize harness recorded 0 update events on these short prompts, so memorization deltas are ≈0 (expected for this gated configuration).

---

## 7. References
- `docs/stage2_progress.md` – running log of all Stage 2 work.
- `docs/stability_journal.md` – chronological notes on NaN fixes, teach-scale tuning, tmux jobs.
- `reports/stage2_smoke.md` – command cheat sheet for reproducing the smoke runs referenced here.

This report will be updated as we push beyond short runs and start reproducing the full metrics from Google's Nested Learning paper.


================================================
FILE: docs/future_directions.md
================================================
# Future Directions – Nested Learning Reproduction

This roadmap outlines high-impact areas for contributors once the initial public release is out. Items are organized by theme and roughly prioritized.

---

## 1. Scaling the Architecture
1. **Longer Runs (≥3B tokens):** Use FSDP or DeepSpeed ZeRO to train the 760 M config on the filtered `_full` shards. Target at least 3B tokens so HOPE’s long-context advantages can emerge.
2. **Target Config (1.3 B / 100 B tokens):** Prepare configs and launcher scripts for multi-node environments (Slurm, Kubernetes). Emphasize reproducible manifests and resume logic.
3. **Context Expansion:** Integrate FlashAttention2 or block-sparse attention to push context lengths beyond 32k tokens. Update `scripts/eval/niah.py` accordingly.

## 2. Evaluation & Analysis
1. **Full Benchmark Suite:** Extend `scripts/eval/zeroshot.py` to include ARC-E/C, BoolQ, SIQA by default with standard prompts. Automate results aggregation into Markdown tables.
2. **Long-Context Benchmarks:** Add Passkey, PG19, and retrieval tasks besides Needle-in-a-Haystack.
3. **Continual Learning:** Create larger segment manifests (e.g., Wikipedia by year) and compute forgetting metrics across dozens of checkpoints.

## 3. Optimization & HPO
1. **Teach-Scale Scheduling:** Explore cosine or per-level schedules; integrate gradient clipping hyperparameters through Hydra sweeps.
2. **Optimizer Variants:** Try Muon/DeepMomentum for TITAN/CMS updates. Compare against simple SGD/Adam baselines.
3. **Automated Sweeps:** Wire up lightweight HPO (Ray Tune, Ax) for pilot configs to test teach_scale, clip, and CMS depth combinations.

## 4. Data & Tooling
1. **Dataset Expansion:** Add book/video/code corpora, ensure licensing compliance, and document provenance.
2. **Tokenizer Experiments:** Evaluate alternative vocab sizes or SentencePiece BPE to see if certain domains benefit.
3. **CI Enhancements:** Add GPU-aware smoke tests (e.g., GitHub self-hosted runner) to catch regressions in dual-GPU workflows.

## 5. Documentation & Community
1. **Release Notes:** Publish structured release notes with each tagged version (capabilities, limitations, roadmap).
2. **Contributor Guides:** Document coding standards, logging conventions, and how to submit new configs/evals.
3. **Experiment Tracking:** Encourage use of the `docs/experiments_report.md` template for all major runs to keep the public record up to date.

---

Contributors are welcome to pick any of these items (or propose new ones) via GitHub issues or pull requests. Please cross-reference this file so efforts stay coordinated.


================================================
FILE: docs/phase2_comparison.md
================================================
# Phase 2 – HOPE-Attention vs Transformer Baseline

Phase 2 is “implementation-complete” when we can compare the **paper-defined HOPE-Attention** variant
(`Attention → CMS`) against a **standard Transformer** baseline (`Attention → MLP`) using the same
tokenizer, context lengths, and evaluation harness.

This does **not** require paper-scale training; it’s intended for correctness/ergonomics and
CPU-friendly smoke checks.

## 0) Prerequisites

- A SentencePiece tokenizer at `artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model`.
  - If missing, run `uv run bash scripts/data/run_sample.sh` (see `docs/guide.md`).

## 1) Smoke checkpoints (CPU)

Train two tiny smoke checkpoints from the same base config:

```bash
# HOPE-Attention smoke (paper-defined variant)
uv run python train.py --config-name pilot_smoke \
  model.block_variant=hope_attention \
  model.qk_l2_norm=true model.local_conv_window=4 \
  train.checkpoint.dir=artifacts/checkpoints/pilot_smoke_attention \
  logging.path=logs/pilot_smoke_attention.json

# Transformer baseline smoke
uv run python train.py --config-name pilot_smoke \
  model.block_variant=transformer \
  model.qk_l2_norm=true model.local_conv_window=4 \
  train.checkpoint.dir=artifacts/checkpoints/pilot_smoke_transformer \
  logging.path=logs/pilot_smoke_transformer.json
```

## 2) Long-context comparison (CPU)

Use the comparison runner (writes a single JSON with both results):

```bash
uv run python scripts/eval/compare_variants.py \
  --a-config configs/pilot_smoke.yaml \
  --a-checkpoint artifacts/checkpoints/pilot_smoke_attention/step_000010.pt \
  --b-config configs/pilot_smoke.yaml \
  --b-checkpoint artifacts/checkpoints/pilot_smoke_transformer/step_000010.pt \
  --tokenizer-path artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model \
  --device cpu \
  --smoke \
  --output eval/phase2_compare_smoke.json
```

For larger GPU-backed pilots, use the dedicated Hydra configs:
- `configs/hope/pilot_attention.yaml`
- `configs/hope/pilot_transformer.yaml`

and rerun the comparison script on the resulting checkpoints.

## 3) Adaptation sanity check (no training)

This repo also includes a deterministic unit-level smoke that demonstrates **in-context adaptation**
exists for `hope_attention` (via CMS fast-state up
Download .txt
gitextract_wq324oq_/

├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── config.yml
│   │   ├── eval_request.md
│   │   ├── faithfulness_gap.md
│   │   └── perf_regression.md
│   └── workflows/
│       ├── ci.yml
│       ├── packages.yml
│       ├── release.yml
│       └── security.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── TODO.md
├── configs/
│   ├── ablations/
│   │   ├── cms_sparse.yaml
│   │   ├── selfmod_chunked_8_64.yaml
│   │   ├── selfmod_momentum_off.yaml
│   │   ├── selfmod_momentum_on.yaml
│   │   ├── selfmod_no_alpha.yaml
│   │   ├── selfmod_no_cms.yaml
│   │   └── selfmod_rank1_precond_off.yaml
│   ├── data/
│   │   ├── continual_segments_sample.yaml
│   │   ├── fineweb_edu_longdoc_filtered_sample.yaml
│   │   ├── fineweb_edu_mixture_full.yaml
│   │   ├── fineweb_edu_mixture_sample.yaml
│   │   ├── refinedweb_mixture.yaml
│   │   ├── refinedweb_mixture_filtered.yaml
│   │   ├── refinedweb_mixture_full.yaml
│   │   └── refinedweb_mixture_sample.yaml
│   ├── deepspeed/
│   │   └── zero3.json
│   ├── hope/
│   │   ├── mid.yaml
│   │   ├── mid_fsdp.yaml
│   │   ├── pilot.yaml
│   │   ├── pilot_attention.yaml
│   │   ├── pilot_selfmod.yaml
│   │   ├── pilot_transformer.yaml
│   │   ├── target.yaml
│   │   └── target_fsdp.yaml
│   ├── mid_smoke.yaml
│   ├── mid_stage2.yaml
│   ├── mid_stage2_smoke.yaml
│   ├── mid_titan_baseline.yaml
│   ├── pilot.yaml
│   ├── pilot_paper_faithful.yaml
│   ├── pilot_selfmod_paper_faithful.yaml
│   ├── pilot_smoke.yaml
│   └── resolved/
│       ├── cms_sparse_eval.yaml
│       ├── phase2_pilot_attention_eval.yaml
│       └── phase2_pilot_transformer_eval.yaml
├── docker/
│   └── Dockerfile.dist
├── docs/
│   ├── BUG_REPORT_CHECKLIST.md
│   ├── COMPATIBILITY_MATRIX.md
│   ├── FSDP_SCALING_GUIDE.md
│   ├── IMPLEMENTATION_STATUS.md
│   ├── P4_REMEDIATION_PLAN.md
│   ├── PACKAGE_RELEASE_CHECKLIST.md
│   ├── PAPER_COMPLIANCE.md
│   ├── PHASE2_LONG_CONTEXT_COMPARISON.md
│   ├── PHASE_2_PLAN.md
│   ├── PYPI_TRUSTED_PUBLISHING.md
│   ├── STREAMING_CONTRACT.md
│   ├── VERSIONING_POLICY.md
│   ├── compute_plan.md
│   ├── continual_classification_eval.md
│   ├── continual_eval.md
│   ├── data_pipeline.md
│   ├── env_matrix.md
│   ├── experiments_report.md
│   ├── future_directions.md
│   ├── phase2_comparison.md
│   ├── release_checklist.md
│   ├── scaling_guidance.md
│   ├── spec_interfaces.md
│   ├── sprint_next_plan.md
│   ├── stage2_plan.md
│   ├── stage2_progress.md
│   ├── templates/
│   │   └── checkpoint_report.md
│   └── zeroshot_eval.md
├── eval/
│   ├── continual_dummy.json
│   ├── continual_mid_stage2.json
│   ├── continual_mid_stage2_smoke.json
│   ├── continual_mid_stage2_ts10.json
│   ├── continual_mid_stage2_ts10_single120_clip.json
│   ├── continual_mid_stage2_ts10_single140_schedC.json
│   ├── continual_mid_stage2_ts10_single220_schedD.json
│   ├── continual_mid_stage2_ts10_single80.json
│   ├── continual_mid_stage2_ts10_single80lr2e5.json
│   ├── continual_mid_stage2_ts20.json
│   ├── continual_mid_titan_baseline.json
│   ├── continual_pilot.json
│   ├── continual_pilot_cms_nochunk_step5000.json
│   ├── continual_pilot_cms_sparse_step5000.json
│   ├── continual_pilot_multi.json
│   ├── continual_pilot_opt_adamw_step5000.json
│   ├── continual_pilot_opt_muon_step5000.json
│   ├── continual_pilot_selfmod_off_step5000.json
│   ├── continual_pilot_step22000.json
│   ├── continual_pilot_step230000.json
│   ├── continual_pilot_teach05_long_step25000.json
│   ├── continual_pilot_teach05_step2000.json
│   ├── continual_pilot_teach15_long_step25000.json
│   ├── continual_pilot_teach15_step2000.json
│   ├── continual_smoke.json
│   ├── continual_titan.json
│   ├── continual_titan_relaunch_step001000.json
│   ├── continual_titan_step25000.json
│   ├── niah_dummy.json
│   ├── niah_mid_stage2.json
│   ├── niah_mid_stage2_smoke.json
│   ├── niah_mid_stage2_ts10.json
│   ├── niah_mid_stage2_ts10_single120_clip.json
│   ├── niah_mid_stage2_ts10_single140_schedC.json
│   ├── niah_mid_stage2_ts10_single220_schedD.json
│   ├── niah_mid_stage2_ts10_single80.json
│   ├── niah_mid_stage2_ts10_single80lr2e5.json
│   ├── niah_mid_stage2_ts20.json
│   ├── niah_mid_titan_baseline.json
│   ├── niah_pilot.json
│   ├── niah_pilot_cms_nochunk_step5000.json
│   ├── niah_pilot_cms_sparse_step5000.json
│   ├── niah_pilot_opt_adamw_step5000.json
│   ├── niah_pilot_opt_muon_step5000.json
│   ├── niah_pilot_selfmod_off_step5000.json
│   ├── niah_pilot_step22000.json
│   ├── niah_pilot_step230000.json
│   ├── niah_pilot_teach05_long_step25000.json
│   ├── niah_pilot_teach05_step2000.json
│   ├── niah_pilot_teach15_long_step25000.json
│   ├── niah_pilot_teach15_step2000.json
│   ├── niah_smoke.json
│   ├── niah_titan.json
│   ├── niah_titan_relaunch_step001000.json
│   ├── niah_titan_step25000.json
│   ├── passkey_pilot.json
│   ├── passkey_pilot_step230000.json
│   ├── passkey_titan.json
│   ├── passkey_titan_relaunch_step001000.json
│   ├── passkey_titan_step25000.json
│   ├── pg19_pilot.json
│   ├── pg19_pilot_step230000.json
│   ├── pg19_titan.json
│   ├── pg19_titan_relaunch_step001000.json
│   ├── pg19_titan_step25000.json
│   ├── phase2_compare_smoke_lastlayer_metrics.json
│   ├── zeroshot_full_smoke.json
│   ├── zeroshot_mid_stage2.json
│   ├── zeroshot_mid_stage2_smoke.json
│   ├── zeroshot_mid_stage2_smoke_piqa_baseline.json
│   ├── zeroshot_mid_stage2_smoke_piqa_mem.json
│   ├── zeroshot_mid_stage2_ts10.json
│   ├── zeroshot_mid_stage2_ts10_single120_clip.json
│   ├── zeroshot_mid_stage2_ts10_single140_schedC.json
│   ├── zeroshot_mid_stage2_ts10_single220_schedD.json
│   ├── zeroshot_mid_stage2_ts10_single80.json
│   ├── zeroshot_mid_stage2_ts10_single80lr2e5.json
│   ├── zeroshot_mid_stage2_ts20.json
│   ├── zeroshot_mid_titan_baseline.json
│   ├── zeroshot_pilot.json
│   ├── zeroshot_pilot_cms_nochunk_step5000.json
│   ├── zeroshot_pilot_cms_sparse_step5000.json
│   ├── zeroshot_pilot_debug.json
│   ├── zeroshot_pilot_dummy_piqa.json
│   ├── zeroshot_pilot_opt_adamw_step5000.json
│   ├── zeroshot_pilot_opt_muon_step5000.json
│   ├── zeroshot_pilot_selfmod_off_step5000.json
│   ├── zeroshot_pilot_step22000.json
│   ├── zeroshot_pilot_step230000.json
│   ├── zeroshot_pilot_teach05_long_step25000.json
│   ├── zeroshot_pilot_teach05_step2000.json
│   ├── zeroshot_pilot_teach15_long_step25000.json
│   ├── zeroshot_pilot_teach15_step2000.json
│   ├── zeroshot_smoke.json
│   ├── zeroshot_titan.json
│   ├── zeroshot_titan_relaunch_step001000.json
│   └── zeroshot_titan_step25000.json
├── google_papers/
│   ├── Nested_Learning/
│   │   ├── Nested_Learning.json
│   │   └── Nested_Learning.md
│   └── TITANs/
│       ├── TITANs.json
│       └── TITANs.md
├── pyproject.toml
├── reports/
│   ├── ablations.md
│   ├── cadence_mechanism_audit_smoke.json
│   ├── compliance_mechanism_audit_smoke.json
│   ├── compliance_summary_pilot.json
│   ├── compliance_summary_pilot_paper_faithful.json
│   ├── next_backlog_scoped.md
│   ├── security_release_gate.md
│   ├── sprint_completion_report.md
│   └── stage2_smoke.md
├── scripts/
│   ├── __init__.py
│   ├── checkpoint/
│   │   └── verify.py
│   ├── checks/
│   │   ├── check_data_script_help.sh
│   │   ├── check_git_tracked_sizes.sh
│   │   ├── check_readme_commands.sh
│   │   ├── compliance_report.py
│   │   ├── run_fidelity_ci_subset.sh
│   │   ├── tokenizer_coverage_guard.py
│   │   ├── verify_docs_refs.py
│   │   └── verify_update_cadence.py
│   ├── compute/
│   │   └── create_reservations.sh
│   ├── data/
│   │   ├── __init__.py
│   │   ├── check_tokenizer.py
│   │   ├── check_tokenizer_coverage.py
│   │   ├── filter_corpus.py
│   │   ├── process_mixture.py
│   │   ├── run_full.sh
│   │   ├── run_sample.sh
│   │   ├── shard_corpus.py
│   │   ├── train_tokenizer.py
│   │   └── validate_mixture.py
│   ├── eval/
│   │   ├── __init__.py
│   │   ├── compare_variants.py
│   │   ├── continual.py
│   │   ├── continual_classification.py
│   │   ├── niah.py
│   │   ├── niah_suite.py
│   │   ├── passkey.py
│   │   ├── pg19_perplexity.py
│   │   ├── phase2_memorization_delta_smoke.py
│   │   ├── plot_continual_classification.py
│   │   ├── plot_forgetting.py
│   │   ├── plot_niah_suite.py
│   │   ├── run_pilot_suite.sh
│   │   ├── summarize_eval.py
│   │   └── zeroshot.py
│   ├── package_pilot_release.sh
│   ├── run_cpu_ddp_smoke.sh
│   ├── run_e2e_smoke.sh
│   ├── run_mechanism_audit_smoke.sh
│   ├── run_smoke.sh
│   └── tests/
│       └── run_passkey_smoke.sh
├── src/
│   └── nested_learning/
│       ├── __init__.py
│       ├── __main__.py
│       ├── assoc_memory.py
│       ├── backbones.py
│       ├── capabilities.py
│       ├── cli.py
│       ├── cms.py
│       ├── config_utils.py
│       ├── continual_classification.py
│       ├── continual_streaming.py
│       ├── data.py
│       ├── device.py
│       ├── eval_state.py
│       ├── fast_state.py
│       ├── functional.py
│       ├── hope/
│       │   ├── __init__.py
│       │   ├── block.py
│       │   └── self_mod.py
│       ├── instrumentation.py
│       ├── levels.py
│       ├── logging_utils.py
│       ├── memorize.py
│       ├── model.py
│       ├── optim/
│       │   ├── __init__.py
│       │   ├── deep.py
│       │   ├── factory.py
│       │   ├── m3.py
│       │   └── manager.py
│       ├── titan/
│       │   ├── __init__.py
│       │   ├── memory.py
│       │   ├── model.py
│       │   └── self_modifying.py
│       ├── tokenizer.py
│       ├── tokenizer_coverage.py
│       ├── training.py
│       └── transformer.py
├── tests/
│   ├── conftest.py
│   ├── data/
│   │   ├── passkey_corpus.txt
│   │   ├── tiny_tokenizer.model
│   │   └── tiny_tokenizer.vocab
│   ├── test_algorithm_mode_grad.py
│   ├── test_attention_cache.py
│   ├── test_attention_features.py
│   ├── test_boundary_state_mode.py
│   ├── test_boundary_state_training_loop.py
│   ├── test_build_model_from_cfg_selfmod.py
│   ├── test_checkpoint_metadata_and_eval_loaders.py
│   ├── test_cli_tooling.py
│   ├── test_cms.py
│   ├── test_cms_cross_call.py
│   ├── test_cms_delta_rule.py
│   ├── test_cms_flush_partial.py
│   ├── test_compare_variants_cli.py
│   ├── test_compile_toggle.py
│   ├── test_compliance_report.py
│   ├── test_continual_classification.py
│   ├── test_continual_eval_state_mode.py
│   ├── test_data_scripts_help.py
│   ├── test_data_split_fallbacks.py
│   ├── test_determinism_seed.py
│   ├── test_device_resolution.py
│   ├── test_distributed_fail_fast.py
│   ├── test_eval_builders.py
│   ├── test_eval_state.py
│   ├── test_eval_state_cli.py
│   ├── test_faithfulness_harness.py
│   ├── test_fast_state_batch_semantics.py
│   ├── test_fast_state_forward_equivalence.py
│   ├── test_fast_state_meta_grads.py
│   ├── test_fast_state_selfmod_meta_grads.py
│   ├── test_git_tracked_sizes_check.py
│   ├── test_hope_block.py
│   ├── test_hope_selfmod_fast_state_meta_unchanged.py
│   ├── test_hope_selfmod_integration.py
│   ├── test_hope_selfmod_update_pass.py
│   ├── test_levels.py
│   ├── test_m3.py
│   ├── test_m3_slow_timing.py
│   ├── test_memorization.py
│   ├── test_model.py
│   ├── test_model_streaming_cadence.py
│   ├── test_online_chunking.py
│   ├── test_optim.py
│   ├── test_optimizer_param_policy.py
│   ├── test_package_release_script.py
│   ├── test_paper_faithful_configs.py
│   ├── test_phase2_memorization_delta.py
│   ├── test_residual_mlp_memory.py
│   ├── test_run_features.py
│   ├── test_self_modifying_titans.py
│   ├── test_selfmod_adaptive_q.py
│   ├── test_selfmod_dgd_linear.py
│   ├── test_selfmod_grad_flow.py
│   ├── test_selfmod_local_conv.py
│   ├── test_selfmod_online.py
│   ├── test_strict_streaming_contract.py
│   ├── test_surprise_metric.py
│   ├── test_surprise_override.py
│   ├── test_teach_signal.py
│   ├── test_tied_weight_guard.py
│   ├── test_variants.py
│   ├── test_verify_docs_refs.py
│   └── test_verify_update_cadence.py
├── train.py
├── train_deepspeed.py
├── train_dist.py
└── train_fsdp.py
Download .txt
SYMBOL INDEX (723 symbols across 124 files)

FILE: scripts/checkpoint/verify.py
  function main (line 15) | def main(

FILE: scripts/checks/compliance_report.py
  function _load_resolved_config (line 22) | def _load_resolved_config(config_path: Path):
  class CheckResult (line 42) | class CheckResult:
  function _append (line 48) | def _append(results: list[CheckResult], name: str, ok: bool, detail: str...
  function main (line 53) | def main(

FILE: scripts/checks/tokenizer_coverage_guard.py
  function main (line 19) | def main(

FILE: scripts/checks/verify_docs_refs.py
  function _iter_code_spans (line 31) | def _iter_code_spans(text: str) -> Iterable[str]:
  function _iter_link_targets (line 36) | def _iter_link_targets(text: str) -> Iterable[str]:
  function _normalize_path_candidate (line 41) | def _normalize_path_candidate(token: str) -> str | None:
  function parse_referenced_paths (line 68) | def parse_referenced_paths(doc_text: str) -> set[str]:
  function _slugify_heading (line 82) | def _slugify_heading(heading: str) -> str:
  function _extract_markdown_anchors (line 91) | def _extract_markdown_anchors(path: Path) -> set[str]:
  function parse_anchor_references (line 108) | def parse_anchor_references(doc_text: str) -> list[tuple[str, str]]:
  function verify_docs_refs (line 126) | def verify_docs_refs(
  function main (line 156) | def main() -> int:

FILE: scripts/checks/verify_update_cadence.py
  function _expected_counts (line 10) | def _expected_counts(
  function _load_records (line 33) | def _load_records(path: Path) -> list[dict[str, Any]]:
  function _find_last_with_prefix (line 40) | def _find_last_with_prefix(records: list[dict[str, Any]], prefix: str) -...
  function verify_cadence (line 49) | def verify_cadence(
  function _build_parser (line 89) | def _build_parser() -> argparse.ArgumentParser:
  function main (line 119) | def main() -> int:

FILE: scripts/data/check_tokenizer.py
  function compute_sha256 (line 11) | def compute_sha256(path: Path) -> str:
  function dump_metadata (line 19) | def dump_metadata(path: Path, sha256: str, output: Optional[Path]) -> None:
  function parse_args (line 29) | def parse_args() -> argparse.Namespace:
  function main (line 60) | def main() -> None:

FILE: scripts/data/check_tokenizer_coverage.py
  function main (line 16) | def main(

FILE: scripts/data/filter_corpus.py
  function _select_fallback_split (line 21) | def _select_fallback_split(available: list[str]) -> str:
  function normalize_text (line 28) | def normalize_text(text: str) -> str:
  function is_target_language (line 32) | def is_target_language(text: str, target_lang: str, threshold: float) ->...
  function main (line 41) | def main(

FILE: scripts/data/process_mixture.py
  function main (line 18) | def main(

FILE: scripts/data/shard_corpus.py
  class ShardConfig (line 19) | class ShardConfig:
  function _select_fallback_split (line 34) | def _select_fallback_split(available: list[str]) -> str:
  function shard_dataset (line 41) | def shard_dataset(config: ShardConfig) -> dict:
  function main (line 116) | def main(
  function _write_shard (line 154) | def _write_shard(sequences: List[List[int]], output_dir: Path, shard_idx...

FILE: scripts/data/train_tokenizer.py
  class DatasetSpec (line 19) | class DatasetSpec:
  function _select_fallback_split (line 29) | def _select_fallback_split(available: list[str]) -> str:
  function _load_specs_from_manifest (line 36) | def _load_specs_from_manifest(manifest: Path) -> List[DatasetSpec]:
  function _write_samples (line 55) | def _write_samples(spec: DatasetSpec, handle) -> int:
  function main (line 90) | def main(

FILE: scripts/data/validate_mixture.py
  function _dir_stats (line 14) | def _dir_stats(path: Path, sample_limit: int = 2000) -> tuple[dict[str, ...
  function main (line 27) | def main(

FILE: scripts/eval/compare_variants.py
  class ModelSpec (line 32) | class ModelSpec:
  function _load_model (line 38) | def _load_model(spec: ModelSpec, device: torch.device) -> torch.nn.Module:
  function _logprob_answer (line 53) | def _logprob_answer(
  function _memorize_prompt_answer_only (line 79) | def _memorize_prompt_answer_only(
  function _make_passkey_prompt (line 112) | def _make_passkey_prompt(*, filler_sentences: int, key: str) -> str:
  function _run_passkey (line 122) | def _run_passkey(
  function _make_niah_prompt (line 246) | def _make_niah_prompt(*, needle: str, filler_tokens: int) -> str:
  function _run_niah (line 257) | def _run_niah(
  function main (line 384) | def main(

FILE: scripts/eval/continual.py
  function load_segments (line 34) | def load_segments(yaml_path: Path) -> List[Dict[str, str]]:
  function evaluate_segment (line 39) | def evaluate_segment(
  function main (line 143) | def main(

FILE: scripts/eval/continual_classification.py
  function _load_local_jsonl (line 34) | def _load_local_jsonl(path: Path) -> List[ClassificationExample]:
  function _load_examples (line 44) | def _load_examples(
  function main (line 58) | def main(

FILE: scripts/eval/niah.py
  function load_model (line 29) | def load_model(config_path: Path, checkpoint: Path, device: torch.device...
  function make_prompt (line 44) | def make_prompt(needle: str, filler_tokens: int) -> str:
  function logprob_answer (line 55) | def logprob_answer(
  function main (line 82) | def main(

FILE: scripts/eval/niah_suite.py
  function load_model (line 28) | def load_model(config_path: Path, checkpoint: Path, device: torch.device):
  function _logprob_answer (line 43) | def _logprob_answer(
  function _filler_sentences (line 68) | def _filler_sentences(count: int) -> List[str]:
  function _ensure_prompt_length (line 72) | def _ensure_prompt_length(
  class VariantCase (line 96) | class VariantCase:
  function _case_single_needle (line 102) | def _case_single_needle(rng: random.Random) -> VariantCase:
  function _case_multi_needle (line 114) | def _case_multi_needle(rng: random.Random, *, needles: int) -> VariantCase:
  function _case_kv_single (line 132) | def _case_kv_single(rng: random.Random) -> VariantCase:
  function _case_kv_multi (line 145) | def _case_kv_multi(rng: random.Random, *, pairs: int) -> VariantCase:
  function _case_positioned_needle (line 165) | def _case_positioned_needle(rng: random.Random, *, position: str) -> Var...
  function _variant_cases (line 176) | def _variant_cases(rng: random.Random, *, variant: str) -> VariantCase:
  function _evaluate_variant (line 191) | def _evaluate_variant(
  function main (line 302) | def main(

FILE: scripts/eval/passkey.py
  function load_model (line 30) | def load_model(config: Path, checkpoint: Path, device: torch.device):
  function make_prompt (line 42) | def make_prompt(context_tokens: int, key: str) -> str:
  function logprob (line 49) | def logprob(
  function main (line 65) | def main(

FILE: scripts/eval/pg19_perplexity.py
  function load_model (line 26) | def load_model(config: Path, checkpoint: Path, device: torch.device):
  function _nll_for_text (line 38) | def _nll_for_text(
  function main (line 62) | def main(

FILE: scripts/eval/phase2_memorization_delta_smoke.py
  function _build_model (line 23) | def _build_model(*, variant: str, vocab_size: int, dim: int, layers: int...
  function _run_once (line 40) | def _run_once(
  function main (line 71) | def main(

FILE: scripts/eval/plot_continual_classification.py
  function main (line 18) | def main(

FILE: scripts/eval/plot_forgetting.py
  function main (line 14) | def main(

FILE: scripts/eval/plot_niah_suite.py
  function main (line 16) | def main(

FILE: scripts/eval/summarize_eval.py
  function _flatten_numeric (line 13) | def _flatten_numeric(obj: Any, *, prefix: str = "") -> Dict[str, float]:
  function _expand_keys (line 30) | def _expand_keys(flat: Dict[str, float], keys: Iterable[str]) -> List[str]:
  function _render_table (line 53) | def _render_table(rows: List[Tuple[str, Dict[str, float]]], keys: List[s...
  function main (line 69) | def main(

FILE: scripts/eval/zeroshot.py
  function load_model (line 29) | def load_model(config_path: Path, checkpoint: Path, device: torch.device):
  function score_text (line 44) | def score_text(
  function evaluate_multiple_choice (line 61) | def evaluate_multiple_choice(
  function build_piqa_texts (line 140) | def build_piqa_texts(sample: dict) -> Tuple[str, List[str], int]:
  function eval_piqa (line 148) | def eval_piqa(model, tokenizer, device, max_samples, memorize_cfg):
  function build_hellaswag_texts (line 155) | def build_hellaswag_texts(sample: dict) -> Tuple[str, List[str], int]:
  function eval_hellaswag (line 164) | def eval_hellaswag(model, tokenizer, device, max_samples, memorize_cfg):
  function build_winogrande_texts (line 178) | def build_winogrande_texts(sample: dict) -> Tuple[str, List[str], int]:
  function eval_winogrande (line 186) | def eval_winogrande(model, tokenizer, device, max_samples, memorize_cfg):
  function build_arc_texts (line 200) | def build_arc_texts(sample: dict) -> Tuple[str, List[str], int]:
  function eval_arc (line 209) | def eval_arc(
  function build_boolq_texts (line 225) | def build_boolq_texts(sample: dict) -> Tuple[str, List[str], int]:
  function eval_boolq (line 232) | def eval_boolq(model, tokenizer, device, max_samples, memorize_cfg):
  function build_siqa_texts (line 239) | def build_siqa_texts(sample: dict) -> Tuple[str, List[str], int]:
  function eval_siqa (line 247) | def eval_siqa(model, tokenizer, device, max_samples, memorize_cfg):
  function build_commonsenseqa_texts (line 254) | def build_commonsenseqa_texts(sample: dict) -> Tuple[str, List[str], int]:
  function eval_commonsenseqa (line 263) | def eval_commonsenseqa(model, tokenizer, device, max_samples, memorize_c...
  function build_openbookqa_texts (line 277) | def build_openbookqa_texts(sample: dict) -> Tuple[str, List[str], int]:
  function eval_openbookqa (line 286) | def eval_openbookqa(model, tokenizer, device, max_samples, memorize_cfg):
  function main (line 316) | def main(

FILE: src/nested_learning/__main__.py
  function main (line 6) | def main() -> None:

FILE: src/nested_learning/assoc_memory.py
  class AssocMemory (line 9) | class AssocMemory(nn.Module):
    method forward (line 12) | def forward(self, query: torch.Tensor) -> torch.Tensor:  # type: ignor...
    method update (line 16) | def update(
  class SupportsReset (line 22) | class SupportsReset(Protocol):
    method reset_state (line 23) | def reset_state(self) -> None: ...

FILE: src/nested_learning/backbones.py
  class AttentionConfig (line 13) | class AttentionConfig:
  class SelfAttention (line 24) | class SelfAttention(nn.Module):
    method __init__ (line 25) | def __init__(self, config: AttentionConfig):
    method forward (line 51) | def forward(  # type: ignore[override]
    method _compute_qkv (line 91) | def _compute_qkv(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.T...
    method _scaled_dot_product_attn (line 103) | def _scaled_dot_product_attn(

FILE: src/nested_learning/capabilities.py
  class RuntimeCapabilities (line 12) | class RuntimeCapabilities:
    method to_dict (line 32) | def to_dict(self) -> dict[str, Any]:
  function collect_runtime_capabilities (line 36) | def collect_runtime_capabilities() -> RuntimeCapabilities:

FILE: src/nested_learning/cli.py
  function _resolve_cli_device (line 23) | def _resolve_cli_device(device: str) -> torch.device:
  function doctor (line 32) | def doctor(
  function smoke (line 89) | def smoke(
  function train (line 161) | def train(
  function audit (line 207) | def audit(

FILE: src/nested_learning/cms.py
  class CMSBlock (line 11) | class CMSBlock(nn.Module):
    method __init__ (line 12) | def __init__(
    method forward (line 38) | def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[ov...
  class CMS (line 48) | class CMS(nn.Module):
    method __init__ (line 51) | def __init__(
    method forward (line 76) | def forward(

FILE: src/nested_learning/config_utils.py
  function find_repo_root (line 15) | def find_repo_root(start: Path | None = None) -> Path | None:
  function resolved_config_dir (line 24) | def resolved_config_dir(config_dir: Path | None = None) -> Iterator[Path]:
  function compose_config (line 40) | def compose_config(

FILE: src/nested_learning/continual_classification.py
  class ClassificationExample (line 8) | class ClassificationExample:
  class LoadedClassificationDataset (line 14) | class LoadedClassificationDataset:
  function load_hf_classification_dataset (line 21) | def load_hf_classification_dataset(
  function load_clinc_oos (line 75) | def load_clinc_oos(
  function load_banking77 (line 90) | def load_banking77(
  function load_dbpedia14 (line 105) | def load_dbpedia14(
  function unique_labels (line 120) | def unique_labels(examples: Iterable[ClassificationExample]) -> List[str]:
  function filter_examples_by_labels (line 131) | def filter_examples_by_labels(

FILE: src/nested_learning/continual_streaming.py
  class StreamingTask (line 15) | class StreamingTask:
  class ContinualEvalConfig (line 23) | class ContinualEvalConfig:
  function _logprob_completion (line 33) | def _logprob_completion(
  function predict_label (line 54) | def predict_label(
  function _balanced_split (line 81) | def _balanced_split(
  function build_streaming_tasks (line 109) | def build_streaming_tasks(
  class ContinualEvalResult (line 142) | class ContinualEvalResult:
  function evaluate_continual_classification (line 149) | def evaluate_continual_classification(

FILE: src/nested_learning/data.py
  class SyntheticTextConfig (line 13) | class SyntheticTextConfig:
  class SyntheticTextDataset (line 19) | class SyntheticTextDataset(Dataset[torch.Tensor]):
    method __init__ (line 20) | def __init__(self, config: SyntheticTextConfig):
    method __len__ (line 23) | def __len__(self) -> int:
    method __getitem__ (line 26) | def __getitem__(self, idx: int) -> torch.Tensor:
  class TokenShardDataset (line 31) | class TokenShardDataset(Dataset[torch.Tensor]):
    method __init__ (line 34) | def __init__(self, shard_dir: str | Path):
    method __len__ (line 53) | def __len__(self) -> int:
    method _load_array (line 56) | def _load_array(self, shard_idx: int) -> np.memmap:
    method __getitem__ (line 61) | def __getitem__(self, idx: int) -> torch.Tensor:
    method _find_shard (line 71) | def _find_shard(self, idx: int) -> int:
  class ShardSourceConfig (line 86) | class ShardSourceConfig:
  class ShardSource (line 92) | class ShardSource:
    method __init__ (line 93) | def __init__(self, config: ShardSourceConfig):
    method sample (line 105) | def sample(self, rng: np.random.Generator) -> np.ndarray:
  class MixtureShardDataset (line 114) | class MixtureShardDataset(IterableDataset[torch.Tensor]):
    method __init__ (line 115) | def __init__(
    method __len__ (line 131) | def __len__(self) -> int:
    method __iter__ (line 134) | def __iter__(self) -> Iterator[torch.Tensor]:
  function collate_batch (line 152) | def collate_batch(batch: list[torch.Tensor]) -> torch.Tensor:

FILE: src/nested_learning/device.py
  function resolve_device (line 6) | def resolve_device(device_str: str) -> torch.device:

FILE: src/nested_learning/eval_state.py
  class EvalStreamingState (line 9) | class EvalStreamingState:
  function parse_eval_state_mode (line 14) | def parse_eval_state_mode(mode: str) -> bool:
  function init_eval_streaming_state (line 29) | def init_eval_streaming_state(
  function forward_with_eval_state (line 53) | def forward_with_eval_state(

FILE: src/nested_learning/fast_state.py
  class CMSChunkBuffer (line 16) | class CMSChunkBuffer:
  function init_module_deltas (line 30) | def init_module_deltas(module: nn.Module) -> ParamDict:
  class BlockFastState (line 43) | class BlockFastState:
  function build_block_fast_state (line 51) | def build_block_fast_state(
  class ModelFastState (line 82) | class ModelFastState:
  class AttentionKVCache (line 87) | class AttentionKVCache:
  class ModelAttentionCache (line 101) | class ModelAttentionCache:

FILE: src/nested_learning/functional.py
  function params_with_deltas (line 12) | def params_with_deltas(module: nn.Module, deltas: ParamDict) -> ParamDict:
  function module_buffers (line 28) | def module_buffers(module: nn.Module) -> ParamDict:
  function call_with_params (line 32) | def call_with_params(
  function call_with_deltas (line 42) | def call_with_deltas(
  function require_grad_params (line 51) | def require_grad_params(
  function grads_to_dict (line 63) | def grads_to_dict(params: ParamDict, grads: Tuple[torch.Tensor | None, ....

FILE: src/nested_learning/hope/block.py
  function _chunk_loss (line 27) | def _chunk_loss(
  function _min_update_period (line 48) | def _min_update_period(levels: Sequence[LevelSpec]) -> int:
  class _CmsBuffer (line 54) | class _CmsBuffer:
  function _clear_buffer (line 61) | def _clear_buffer(buffer: _CmsBuffer | CMSChunkBuffer) -> None:
  function _fast_state_buffers (line 68) | def _fast_state_buffers(
  function _pop_buffer_chunk (line 78) | def _pop_buffer_chunk(
  class HOPEBlockConfig (line 115) | class HOPEBlockConfig:
  class HOPEAttentionBlockConfig (line 135) | class HOPEAttentionBlockConfig:
  class HOPEAttentionBlock (line 151) | class HOPEAttentionBlock(nn.Module):
    method __init__ (line 158) | def __init__(self, config: HOPEAttentionBlockConfig):
    method forward (line 187) | def forward(
    method set_surprise_threshold (line 248) | def set_surprise_threshold(self, threshold: float | None) -> None:
    method set_surprise_metric (line 251) | def set_surprise_metric(self, metric: str) -> None:
    method set_allowed_levels (line 254) | def set_allowed_levels(self, allowed: Set[str] | None) -> None:
    method pop_update_stats (line 257) | def pop_update_stats(self) -> Dict[str, Dict[str, float]]:
    method _cms_forward_fast (line 262) | def _cms_forward_fast(
    method _cms_forward_online (line 276) | def _cms_forward_online(
    method _cms_forward_online_fast (line 381) | def _cms_forward_online_fast(
    method _update_cms_fast (line 498) | def _update_cms_fast(
    method _is_level_allowed (line 561) | def _is_level_allowed(self, level_name: str) -> bool:
    method _passes_surprise (line 566) | def _passes_surprise(self, surprise_value: float | None) -> bool:
    method _record_gate (line 573) | def _record_gate(self, level_name: str, *, hit: bool) -> None:
    method _update_cms (line 578) | def _update_cms(
    method _update_cms_chunk (line 633) | def _update_cms_chunk(
    method _update_cms_chunk_fast (line 667) | def _update_cms_chunk_fast(
  class HOPESelfModBlockConfig (line 719) | class HOPESelfModBlockConfig:
  class HOPESelfModBlock (line 745) | class HOPESelfModBlock(nn.Module):
    method __init__ (line 752) | def __init__(self, config: HOPESelfModBlockConfig):
    method forward (line 790) | def forward(
    method set_surprise_threshold (line 847) | def set_surprise_threshold(self, threshold: float | None) -> None:
    method set_surprise_metric (line 850) | def set_surprise_metric(self, metric: str) -> None:
    method set_allowed_levels (line 853) | def set_allowed_levels(self, allowed: Set[str] | None) -> None:
    method pop_update_stats (line 856) | def pop_update_stats(self) -> Dict[str, Dict[str, float]]:
    method _cms_forward_fast (line 861) | def _cms_forward_fast(
    method _cms_forward_online (line 875) | def _cms_forward_online(
    method _cms_forward_online_fast (line 980) | def _cms_forward_online_fast(
    method _is_level_allowed (line 1091) | def _is_level_allowed(self, level_name: str) -> bool:
    method _passes_surprise (line 1096) | def _passes_surprise(self, surprise_value: float | None) -> bool:
    method _record_gate (line 1103) | def _record_gate(self, level_name: str, *, hit: bool) -> None:
    method _update_cms (line 1108) | def _update_cms(
    method _update_cms_fast (line 1163) | def _update_cms_fast(
    method _update_cms_chunk (line 1219) | def _update_cms_chunk(
    method _update_cms_chunk_fast (line 1252) | def _update_cms_chunk_fast(
  class HOPEBlock (line 1298) | class HOPEBlock(nn.Module):
    method __init__ (line 1299) | def __init__(self, config: HOPEBlockConfig):
    method forward (line 1337) | def forward(
    method set_surprise_threshold (line 1403) | def set_surprise_threshold(self, threshold: float | None) -> None:
    method set_surprise_metric (line 1406) | def set_surprise_metric(self, metric: str) -> None:
    method set_allowed_levels (line 1409) | def set_allowed_levels(self, allowed: Set[str] | None) -> None:
    method _cms_forward_fast (line 1412) | def _cms_forward_fast(
    method _cms_forward_online (line 1427) | def _cms_forward_online(
    method _cms_forward_online_fast (line 1532) | def _cms_forward_online_fast(
    method _update_titan (line 1642) | def _update_titan(
    method _update_titan_fast (line 1702) | def _update_titan_fast(
    method _update_cms (line 1762) | def _update_cms(
    method _update_cms_fast (line 1817) | def _update_cms_fast(
    method _update_cms_chunk (line 1873) | def _update_cms_chunk(
    method _update_cms_chunk_fast (line 1906) | def _update_cms_chunk_fast(
    method pop_update_stats (line 1951) | def pop_update_stats(self) -> Dict[str, Dict[str, float]]:
    method _passes_surprise (line 1956) | def _passes_surprise(self, surprise_value: float | None) -> bool:
    method _is_level_allowed (line 1963) | def _is_level_allowed(self, level_name: str) -> bool:
    method _record_gate (line 1970) | def _record_gate(self, level_name: str, *, hit: bool) -> None:

FILE: src/nested_learning/hope/self_mod.py
  class SelfModifier (line 7) | class SelfModifier(nn.Module):
    method __init__ (line 21) | def __init__(self, dim: int, hidden_multiplier: int = 4):
    method forward (line 32) | def forward(

FILE: src/nested_learning/instrumentation.py
  class UpdateEvent (line 8) | class UpdateEvent:
  class UpdateLog (line 15) | class UpdateLog:
    method record (line 20) | def record(self, *, step: int, level: str, magnitude: float | None = N...
    method summary (line 23) | def summary(self) -> Dict[str, Dict[str, float]]:

FILE: src/nested_learning/levels.py
  class LevelSpec (line 8) | class LevelSpec:
    method __post_init__ (line 17) | def __post_init__(self) -> None:
  class LevelState (line 30) | class LevelState:
  class LevelClock (line 35) | class LevelClock:
    method __init__ (line 38) | def __init__(self, specs: Sequence[LevelSpec]):
    method step (line 47) | def step(self) -> int:
    method tick (line 50) | def tick(self) -> None:
    method should_update (line 53) | def should_update(self, name: str) -> bool:
    method record_update (line 64) | def record_update(self, name: str) -> None:
    method levels_in_frequency_order (line 70) | def levels_in_frequency_order(self) -> List[LevelSpec]:
    method stats (line 73) | def stats(self) -> Dict[str, LevelState]:
    method timeline (line 78) | def timeline(self) -> List[dict]:
  function ensure_level_specs (line 82) | def ensure_level_specs(entries: Iterable[LevelSpec]) -> List[LevelSpec]:

FILE: src/nested_learning/logging_utils.py
  class BaseLogger (line 10) | class BaseLogger:
    method log (line 11) | def log(self, metrics: Dict[str, Any], step: int) -> None:
    method finish (line 14) | def finish(self) -> None:
  class NullLogger (line 18) | class NullLogger(BaseLogger):
    method log (line 19) | def log(self, metrics: Dict[str, Any], step: int) -> None:
  class JSONLogger (line 23) | class JSONLogger(BaseLogger):
    method __init__ (line 24) | def __init__(self, path: Path):
    method log (line 28) | def log(self, metrics: Dict[str, Any], step: int) -> None:
    method finish (line 32) | def finish(self) -> None:
  class WandbLogger (line 37) | class WandbLogger(BaseLogger):
    method __init__ (line 38) | def __init__(self, cfg: DictConfig, full_cfg: DictConfig):
    method log (line 46) | def log(self, metrics: Dict[str, Any], step: int) -> None:
    method finish (line 50) | def finish(self) -> None:
  function init_logger (line 55) | def init_logger(logging_cfg: DictConfig | None, full_cfg: DictConfig) ->...

FILE: src/nested_learning/memorize.py
  class MemorizeConfig (line 14) | class MemorizeConfig:
  function snapshot_state_dict (line 26) | def snapshot_state_dict(model: torch.nn.Module) -> Dict[str, torch.Tensor]:
  function restore_state_dict (line 30) | def restore_state_dict(model: torch.nn.Module, state: Dict[str, torch.Te...
  function _setup_memorization_context (line 34) | def _setup_memorization_context(model, cfg: MemorizeConfig):
  function _teardown_memorization_context (line 58) | def _teardown_memorization_context(model, prev_allowed, prev_threshold, ...
  function _collect_metrics (line 72) | def _collect_metrics(model, stats: dict[str, float]):
  function _layernorm_backward (line 102) | def _layernorm_backward(
  function _get_model_surprise_metric (line 127) | def _get_model_surprise_metric(model) -> str:
  function _compute_surprise_value (line 134) | def _compute_surprise_value(
  function memorize_tokens (line 169) | def memorize_tokens(
  function memorize_sequence (line 366) | def memorize_sequence(

FILE: src/nested_learning/model.py
  class ModelConfig (line 29) | class ModelConfig:
  class HOPEModel (line 65) | class HOPEModel(nn.Module):
    method __init__ (line 66) | def __init__(self, config: ModelConfig):
    method set_teach_runtime (line 163) | def set_teach_runtime(self, *, scale: float | None = None, clip: float...
    method set_surprise_threshold (line 169) | def set_surprise_threshold(self, threshold: float | None) -> None:
    method get_surprise_threshold (line 174) | def get_surprise_threshold(self) -> float | None:
    method set_surprise_metric (line 177) | def set_surprise_metric(self, metric: str) -> None:
    method get_surprise_metric (line 188) | def get_surprise_metric(self) -> str:
    method set_allowed_update_levels (line 191) | def set_allowed_update_levels(self, levels: set[str] | None) -> None:
    method get_allowed_update_levels (line 196) | def get_allowed_update_levels(self) -> set[str] | None:
    method set_allowed_update_layers (line 199) | def set_allowed_update_layers(self, layers: set[int] | None) -> None:
    method get_allowed_update_layers (line 214) | def get_allowed_update_layers(self) -> set[int] | None:
    method forward (line 217) | def forward(
    method forward_with_pre_norm (line 262) | def forward_with_pre_norm(
    method forward_with_block_outputs (line 317) | def forward_with_block_outputs(
    method _run_blocks (line 379) | def _run_blocks(
    method _gather_block_stats (line 514) | def _gather_block_stats(self) -> Dict[str, float]:
    method pop_update_metrics (line 526) | def pop_update_metrics(self) -> Dict[str, float]:
    method init_fast_state (line 531) | def init_fast_state(self) -> ModelFastState:
    method init_attention_cache (line 578) | def init_attention_cache(self) -> ModelAttentionCache:
    method freeze_backbone (line 581) | def freeze_backbone(self) -> None:
  class _UpdateControlledBlock (line 599) | class _UpdateControlledBlock(Protocol):
    method set_surprise_threshold (line 600) | def set_surprise_threshold(self, threshold: float | None) -> None: ...
    method set_surprise_metric (line 602) | def set_surprise_metric(self, metric: str) -> None: ...
    method set_allowed_levels (line 604) | def set_allowed_levels(self, allowed: set[str] | None) -> None: ...

FILE: src/nested_learning/optim/deep.py
  class DeepMomentumState (line 11) | class DeepMomentumState:
  class DeepMomentum (line 16) | class DeepMomentum(nn.Module):
    method __init__ (line 19) | def __init__(
    method reset_state (line 36) | def reset_state(self) -> None:
    method _precondition (line 39) | def _precondition(self, grad: torch.Tensor, state: DeepMomentumState) ...
    method _nl_precondition (line 46) | def _nl_precondition(
    method forward (line 76) | def forward(  # type: ignore[override]

FILE: src/nested_learning/optim/factory.py
  function build_optimizer (line 8) | def build_optimizer(config: Dict[str, Any]) -> DeepMomentum:

FILE: src/nested_learning/optim/m3.py
  function _newton_schulz (line 8) | def _newton_schulz(matrix: torch.Tensor, steps: int, eps: float = 1e-6) ...
  function _orthogonalize (line 23) | def _orthogonalize(tensor: torch.Tensor, steps: int, eps: float) -> torc...
  class M3 (line 31) | class M3(torch.optim.Optimizer):
    method __init__ (line 42) | def __init__(
    method step (line 70) | def step(self, closure=None):  # type: ignore[override]

FILE: src/nested_learning/optim/manager.py
  class LevelConfig (line 14) | class LevelConfig:
  class LevelOptimizerManager (line 20) | class LevelOptimizerManager:
    method __init__ (line 21) | def __init__(self, config: LevelConfig):
    method should_update (line 37) | def should_update(self, level: str) -> bool:
    method optimize (line 40) | def optimize(
    method apply_module_grads (line 71) | def apply_module_grads(
    method tick (line 103) | def tick(self) -> None:
    method pop_last_metrics (line 106) | def pop_last_metrics(self, level: str) -> Dict[str, float]:
    method apply_grads (line 109) | def apply_grads(

FILE: src/nested_learning/titan/memory.py
  class TitanMemoryConfig (line 13) | class TitanMemoryConfig:
  function _activation (line 20) | def _activation(name: str) -> nn.Module:
  class TitanMemory (line 31) | class TitanMemory(AssocMemory):
    method __init__ (line 34) | def __init__(self, config: TitanMemoryConfig):
    method forward (line 47) | def forward(self, query: torch.Tensor) -> torch.Tensor:  # type: ignor...
    method surprise (line 56) | def surprise(self, residual: torch.Tensor) -> torch.Tensor:
    method update (line 60) | def update(
    method apply_deltas (line 83) | def apply_deltas(self, deltas: Dict[str, torch.Tensor], scale: float =...

FILE: src/nested_learning/titan/model.py
  class TitanOnlyModelConfig (line 31) | class TitanOnlyModelConfig:
  class TitanOnlyBlock (line 52) | class TitanOnlyBlock(nn.Module):
    method __init__ (line 53) | def __init__(self, config: TitanOnlyModelConfig):
    method forward (line 83) | def forward(
    method set_surprise_threshold (line 128) | def set_surprise_threshold(self, threshold: float | None) -> None:
    method set_surprise_metric (line 131) | def set_surprise_metric(self, metric: str) -> None:
    method set_enabled (line 134) | def set_enabled(self, enabled: bool) -> None:
    method _passes_surprise (line 137) | def _passes_surprise(self, surprise_value: float | None) -> bool:
    method _update_titan (line 144) | def _update_titan(
    method _update_titan_fast (line 197) | def _update_titan_fast(
  class TitanOnlyModel (line 252) | class TitanOnlyModel(nn.Module):
    method __init__ (line 253) | def __init__(self, config: TitanOnlyModelConfig):
    method set_teach_runtime (line 271) | def set_teach_runtime(self, *, scale: float | None = None, clip: float...
    method set_surprise_threshold (line 277) | def set_surprise_threshold(self, threshold: float | None) -> None:
    method get_surprise_threshold (line 282) | def get_surprise_threshold(self) -> float | None:
    method set_surprise_metric (line 285) | def set_surprise_metric(self, metric: str) -> None:
    method get_surprise_metric (line 296) | def get_surprise_metric(self) -> str:
    method set_allowed_update_levels (line 299) | def set_allowed_update_levels(self, levels: set[str] | None) -> None:
    method get_allowed_update_levels (line 307) | def get_allowed_update_levels(self) -> set[str] | None:
    method forward (line 312) | def forward(
    method freeze_backbone (line 381) | def freeze_backbone(self) -> None:
    method init_fast_state (line 396) | def init_fast_state(self) -> ModelFastState:
    method init_attention_cache (line 411) | def init_attention_cache(self) -> ModelAttentionCache:

FILE: src/nested_learning/titan/self_modifying.py
  class SelfModifyingTitansConfig (line 13) | class SelfModifyingTitansConfig:
    method __post_init__ (line 29) | def __post_init__(self) -> None:
  class ResidualMLPMemoryState (line 49) | class ResidualMLPMemoryState:
    method clone (line 57) | def clone(self) -> "ResidualMLPMemoryState":
  class SelfModifyingTitansState (line 69) | class SelfModifyingTitansState:
    method clone (line 84) | def clone(self) -> "SelfModifyingTitansState":
  class ResidualMLPMemory (line 95) | class ResidualMLPMemory(nn.Module):
    method __init__ (line 96) | def __init__(
    method forward (line 119) | def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[ov...
  class SelfModifyingTitans (line 129) | class SelfModifyingTitans(nn.Module):
    method __init__ (line 140) | def __init__(self, config: SelfModifyingTitansConfig):
    method init_fast_state (line 177) | def init_fast_state(self) -> SelfModifyingTitansState:
    method apply_updates_inplace (line 187) | def apply_updates_inplace(
    method forward (line 210) | def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[ov...
    method forward_with_state (line 217) | def forward_with_state(
    method forward_with_updates (line 238) | def forward_with_updates(
    method _apply_local_conv (line 376) | def _apply_local_conv(self, x: torch.Tensor) -> torch.Tensor:
    method _load_state_mean_ (line 388) | def _load_state_mean_(self, state: SelfModifyingTitansState) -> None:
    method _apply_chunk_update (line 411) | def _apply_chunk_update(
    method _apply_chunk_update_seq (line 433) | def _apply_chunk_update_seq(
    method _memory_grads (line 477) | def _memory_grads(
    method _memory_grads_chunk (line 514) | def _memory_grads_chunk(
    method _apply_param_update (line 589) | def _apply_param_update(
    method _apply_momentum (line 616) | def _apply_momentum(
    method _init_memory_state (line 632) | def _init_memory_state(self, module: ResidualMLPMemory) -> ResidualMLP...
    method _ensure_batched_state (line 640) | def _ensure_batched_state(
    method _expand_memory_state (line 660) | def _expand_memory_state(
    method _memory_forward (line 678) | def _memory_forward(
    method _straight_through_meta (line 718) | def _straight_through_meta(fast: torch.Tensor, meta: torch.Tensor) -> ...

FILE: src/nested_learning/tokenizer.py
  class SentencePieceTokenizer (line 10) | class SentencePieceTokenizer:
    method __init__ (line 11) | def __init__(self, model_path: str | Path):
    method vocab_size (line 15) | def vocab_size(self) -> int:
    method encode (line 18) | def encode(self, text: str, add_bos: bool = False, add_eos: bool = Tru...
    method batch_encode (line 27) | def batch_encode(self, texts: Sequence[str]) -> list[torch.Tensor]:

FILE: src/nested_learning/tokenizer_coverage.py
  function compute_tokenizer_coverage_stats (line 10) | def compute_tokenizer_coverage_stats(

FILE: src/nested_learning/training.py
  class DistributedContext (line 35) | class DistributedContext:
  function unwrap_config (line 41) | def unwrap_config(cfg: DictConfig) -> DictConfig:
  function build_model_from_cfg (line 52) | def build_model_from_cfg(model_cfg: DictConfig) -> torch.nn.Module:
  function build_dataloader (line 148) | def build_dataloader(
  function _build_dataset (line 192) | def _build_dataset(data_cfg: DictConfig):
  function compute_teach_signal (line 225) | def compute_teach_signal(
  function _compute_layer_teach_signals (line 295) | def _compute_layer_teach_signals(
  function _compute_surprise_override (line 314) | def _compute_surprise_override(
  function _infer_online_chunk_size (line 336) | def _infer_online_chunk_size(model: HOPEModel) -> int | None:
  function _iter_online_token_chunks (line 352) | def _iter_online_token_chunks(
  function _iter_online_boundary_chunks (line 369) | def _iter_online_boundary_chunks(
  class _HasLMHead (line 391) | class _HasLMHead(Protocol):
  function _checksum_path (line 395) | def _checksum_path(path: str | None) -> str | None:
  function maybe_save_checkpoint (line 408) | def maybe_save_checkpoint(
  function _validate_distributed_config (line 451) | def _validate_distributed_config(cfg: DictConfig, distributed: bool) -> ...
  function _emit_streaming_warning (line 489) | def _emit_streaming_warning(
  function _validate_paper_auditing_variant (line 501) | def _validate_paper_auditing_variant(cfg: DictConfig) -> None:
  function _validate_tied_lm_head_for_paper_auditing (line 522) | def _validate_tied_lm_head_for_paper_auditing(
  function _validate_fast_state_batch_semantics (line 546) | def _validate_fast_state_batch_semantics(cfg: DictConfig) -> None:
  function _validate_online_update_fast_state_semantics (line 574) | def _validate_online_update_fast_state_semantics(cfg: DictConfig) -> None:
  function _resolve_algorithm_mode (line 598) | def _resolve_algorithm_mode(cfg: DictConfig) -> str:
  function _validate_algorithm_mode_constraints (line 606) | def _validate_algorithm_mode_constraints(
  function _validate_online_chunking_constraints (line 650) | def _validate_online_chunking_constraints(cfg: DictConfig) -> None:
  function _check_online_supervised_pairs (line 663) | def _check_online_supervised_pairs(
  function run_training_loop (line 685) | def run_training_loop(
  function _apply_teach_schedule (line 1072) | def _apply_teach_schedule(model: HOPEModel, cfg: DictConfig, step: int) ...
  function _maybe_compile_model (line 1093) | def _maybe_compile_model(model: torch.nn.Module, compile_cfg: dict | Non...
  function _make_autocast_factory (line 1110) | def _make_autocast_factory(device: torch.device, mp_cfg: dict | None):
  function _resolve_autocast_dtype (line 1128) | def _resolve_autocast_dtype(name: str) -> torch.dtype:
  function _build_optimizer (line 1138) | def _build_optimizer(
  function _build_muon_optimizer (line 1194) | def _build_muon_optimizer(
  function _build_m3_optimizer (line 1257) | def _build_m3_optimizer(
  function _select_outer_named_parameters (line 1327) | def _select_outer_named_parameters(
  function _is_memory_param_name (line 1346) | def _is_memory_param_name(name: str) -> bool:
  function _is_muon_candidate (line 1351) | def _is_muon_candidate(name: str, param: torch.nn.Parameter) -> bool:
  class _HybridOptimizer (line 1360) | class _HybridOptimizer:
    method __init__ (line 1361) | def __init__(
    method zero_grad (line 1378) | def zero_grad(self) -> None:
    method step (line 1384) | def step(self) -> None:
    method state_dict (line 1390) | def state_dict(self) -> dict:
    method load_state_dict (line 1396) | def load_state_dict(self, state: dict) -> None:
    method param_groups (line 1403) | def param_groups(self):
    method get_param_split (line 1411) | def get_param_split(self) -> dict[str, int]:
  function _log_run_features (line 1418) | def _log_run_features(
  function _detect_flash_attention (line 1482) | def _detect_flash_attention(model: torch.nn.Module) -> bool:
  function write_checkpoint_metadata (line 1492) | def write_checkpoint_metadata(cfg: DictConfig, ckpt_path: Path, step: in...
  function verify_checkpoint_integrity (line 1520) | def verify_checkpoint_integrity(ckpt_path: Path) -> Dict[str, object]:
  function _capture_rng_states (line 1554) | def _capture_rng_states() -> Dict[str, object]:
  function _encode_pickle (line 1567) | def _encode_pickle(obj: object) -> str:
  function _tensor_state_to_hex (line 1571) | def _tensor_state_to_hex(state: torch.Tensor) -> str:
  function _seed_everything (line 1575) | def _seed_everything(seed: int, *, deterministic: bool = False) -> None:
  function _make_worker_init_fn (line 1593) | def _make_worker_init_fn(base_seed: int):

FILE: src/nested_learning/transformer.py
  class TransformerBlockConfig (line 13) | class TransformerBlockConfig:
  class FeedForward (line 22) | class FeedForward(nn.Module):
    method __init__ (line 23) | def __init__(
    method forward (line 45) | def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[ov...
  class TransformerBlock (line 51) | class TransformerBlock(nn.Module):
    method __init__ (line 58) | def __init__(self, config: TransformerBlockConfig) -> None:
    method forward (line 75) | def forward(
    method set_surprise_threshold (line 97) | def set_surprise_threshold(self, threshold: float | None) -> None:
    method set_surprise_metric (line 100) | def set_surprise_metric(self, metric: str) -> None:
    method set_allowed_levels (line 103) | def set_allowed_levels(self, allowed) -> None:

FILE: tests/test_algorithm_mode_grad.py
  function _manager (line 7) | def _manager() -> LevelOptimizerManager:
  function test_apply_grads_differentiable_preserves_gradient_path (line 17) | def test_apply_grads_differentiable_preserves_gradient_path() -> None:
  function test_apply_grads_nondifferentiable_breaks_gradient_path (line 35) | def test_apply_grads_nondifferentiable_breaks_gradient_path() -> None:

FILE: tests/test_attention_cache.py
  function _build_transformer_model (line 9) | def _build_transformer_model() -> HOPEModel:
  function test_attention_cache_chunked_logits_match_full_logits (line 24) | def test_attention_cache_chunked_logits_match_full_logits() -> None:
  function test_attention_cache_reset_changes_continuation_state (line 44) | def test_attention_cache_reset_changes_continuation_state() -> None:
  function test_transformer_block_rejects_kv_cache_with_local_conv (line 59) | def test_transformer_block_rejects_kv_cache_with_local_conv() -> None:

FILE: tests/test_attention_features.py
  function test_self_attention_qk_l2_norm_unit_vectors (line 6) | def test_self_attention_qk_l2_norm_unit_vectors() -> None:
  function test_self_attention_local_conv_window_preserves_shape (line 16) | def test_self_attention_local_conv_window_preserves_shape() -> None:
  function test_self_attention_local_conv_is_causal (line 25) | def test_self_attention_local_conv_is_causal() -> None:

FILE: tests/test_boundary_state_mode.py
  function _build_attention_model (line 9) | def _build_attention_model() -> HOPEModel:
  function _two_chunk_grad_norm (line 23) | def _two_chunk_grad_norm(*, differentiable_updates: bool) -> float:
  function test_boundary_state_grad_mode_propagates_across_write_path (line 66) | def test_boundary_state_grad_mode_propagates_across_write_path() -> None:
  function test_stopgrad_mode_blocks_boundary_state_grad_path (line 70) | def test_stopgrad_mode_blocks_boundary_state_grad_path() -> None:

FILE: tests/test_boundary_state_training_loop.py
  function _tiny_boundary_state_cfg (line 9) | def _tiny_boundary_state_cfg():
  function test_boundary_state_mode_runs_in_training_loop (line 61) | def test_boundary_state_mode_runs_in_training_loop() -> None:

FILE: tests/test_build_model_from_cfg_selfmod.py
  function test_build_model_from_cfg_plumbs_selfmod_fields (line 7) | def test_build_model_from_cfg_plumbs_selfmod_fields() -> None:

FILE: tests/test_checkpoint_metadata_and_eval_loaders.py
  function _tiny_cfg (line 17) | def _tiny_cfg():
  function _load_script_module (line 44) | def _load_script_module(script_path: Path):
  function test_checkpoint_metadata_includes_algorithm_and_online_flags (line 52) | def test_checkpoint_metadata_includes_algorithm_and_online_flags(tmp_pat...
  function test_eval_loaders_accept_boundary_state_checkpoint (line 69) | def test_eval_loaders_accept_boundary_state_checkpoint(tmp_path: Path) -...

FILE: tests/test_cli_tooling.py
  function test_doctor_json_output (line 15) | def test_doctor_json_output() -> None:
  function test_smoke_cpu_passes (line 25) | def test_smoke_cpu_passes() -> None:
  function test_smoke_auto_passes (line 49) | def test_smoke_auto_passes() -> None:
  function test_audit_reports_tied_weights (line 70) | def test_audit_reports_tied_weights() -> None:
  function test_train_dry_run_prints_config (line 81) | def test_train_dry_run_prints_config() -> None:
  function test_compose_config_with_explicit_config_dir (line 99) | def test_compose_config_with_explicit_config_dir(tmp_path: Path) -> None:
  function test_python_module_entrypoint_help (line 121) | def test_python_module_entrypoint_help() -> None:

FILE: tests/test_cms.py
  function test_cms_forward_preserves_shape (line 8) | def test_cms_forward_preserves_shape() -> None:
  function test_cms_can_disable_layernorm (line 20) | def test_cms_can_disable_layernorm() -> None:
  function test_cms_updates_respect_update_period_tokens (line 33) | def test_cms_updates_respect_update_period_tokens() -> None:
  function test_cms_updates_skip_when_no_signal (line 54) | def test_cms_updates_skip_when_no_signal() -> None:
  function test_cms_online_updates_affect_later_tokens (line 69) | def test_cms_online_updates_affect_later_tokens() -> None:

FILE: tests/test_cms_cross_call.py
  function _build_variant (line 16) | def _build_variant(variant: str, *, flush_partial: bool):
  function test_cms_fast_state_buffers_persist_across_calls (line 79) | def test_cms_fast_state_buffers_persist_across_calls(variant: str) -> None:
  function test_cms_fast_state_flushes_only_on_finalize (line 97) | def test_cms_fast_state_flushes_only_on_finalize(variant: str) -> None:

FILE: tests/test_cms_delta_rule.py
  function test_cms_target_shift_loss_grad_is_proportional_to_delta (line 6) | def test_cms_target_shift_loss_grad_is_proportional_to_delta() -> None:
  function test_cms_chunk_loss_sum_scales_relative_to_mean (line 25) | def test_cms_chunk_loss_sum_scales_relative_to_mean() -> None:

FILE: tests/test_cms_flush_partial.py
  function _run_block (line 8) | def _run_block(*, flush_partial: bool, use_fast_state: bool) -> dict[str...
  function test_cms_flush_partial_disabled_leaves_remainder_unupdated (line 35) | def test_cms_flush_partial_disabled_leaves_remainder_unupdated() -> None:
  function test_cms_flush_partial_enabled_updates_final_remainder (line 42) | def test_cms_flush_partial_enabled_updates_final_remainder() -> None:

FILE: tests/test_compare_variants_cli.py
  function _train_tiny_sentencepiece (line 13) | def _train_tiny_sentencepiece(tmp_path: Path, *, vocab_size: int) -> Path:
  function _write_minimal_model_config (line 40) | def _write_minimal_model_config(path: Path, *, vocab_size: int, block_va...
  function _write_checkpoint (line 55) | def _write_checkpoint(path: Path, config_path: Path) -> None:
  function test_compare_variants_cli_smoke (line 61) | def test_compare_variants_cli_smoke(tmp_path: Path) -> None:

FILE: tests/test_compile_toggle.py
  function _tiny_compile_cfg (line 9) | def _tiny_compile_cfg():
  function test_compile_toggle_smoke_does_not_crash (line 60) | def test_compile_toggle_smoke_does_not_crash() -> None:

FILE: tests/test_compliance_report.py
  function _run_report (line 11) | def _run_report(config_path: Path, output_path: Path, repo_root: Path) -...
  function test_compliance_report_includes_algorithm_mode_checks (line 30) | def test_compliance_report_includes_algorithm_mode_checks(tmp_path: Path...
  function test_compliance_report_validates_boundary_mode_constraints (line 41) | def test_compliance_report_validates_boundary_mode_constraints(tmp_path:...

FILE: tests/test_continual_classification.py
  function _train_tiny_sentencepiece (line 18) | def _train_tiny_sentencepiece(tmp_path: Path, *, vocab_size: int) -> Path:
  function _tiny_transformer_model (line 46) | def _tiny_transformer_model(vocab_size: int) -> HOPEModel:
  function _toy_examples (line 59) | def _toy_examples() -> list[ClassificationExample]:
  function test_build_streaming_tasks_balanced_split (line 67) | def test_build_streaming_tasks_balanced_split() -> None:
  function test_evaluate_continual_classification_runs (line 77) | def test_evaluate_continual_classification_runs(tmp_path: Path) -> None:
  function test_evaluate_continual_classification_with_memorize_fast_state (line 101) | def test_evaluate_continual_classification_with_memorize_fast_state(tmp_...

FILE: tests/test_continual_eval_state_mode.py
  function _load_evaluate_segment (line 12) | def _load_evaluate_segment():
  class _TokenDataset (line 26) | class _TokenDataset(Dataset):
    method __init__ (line 27) | def __init__(self) -> None:
    method __len__ (line 30) | def __len__(self) -> int:
    method __getitem__ (line 33) | def __getitem__(self, idx: int) -> torch.Tensor:
  function _build_model (line 37) | def _build_model() -> HOPEModel:
  function test_continual_eval_state_modes_run_without_errors (line 50) | def test_continual_eval_state_modes_run_without_errors() -> None:

FILE: tests/test_data_scripts_help.py
  function test_data_scripts_help_smoke (line 7) | def test_data_scripts_help_smoke() -> None:

FILE: tests/test_data_split_fallbacks.py
  function test_train_tokenizer_manifest_supports_text_data_files (line 13) | def test_train_tokenizer_manifest_supports_text_data_files(tmp_path: Pat...
  function test_shard_corpus_accepts_text_data_files_with_train_split (line 42) | def test_shard_corpus_accepts_text_data_files_with_train_split(tmp_path:...
  function test_train_tokenizer_allows_small_corpus_with_no_hard_vocab_limit (line 67) | def test_train_tokenizer_allows_small_corpus_with_no_hard_vocab_limit(tm...
  function test_split_fallback_prefers_validation_then_test (line 115) | def test_split_fallback_prefers_validation_then_test() -> None:
  function test_split_fallback_uses_first_when_no_standard_split (line 122) | def test_split_fallback_uses_first_when_no_standard_split() -> None:

FILE: tests/test_determinism_seed.py
  function test_seed_everything_reproducible_python_numpy_torch (line 9) | def test_seed_everything_reproducible_python_numpy_torch() -> None:
  function test_seed_everything_toggles_deterministic_algorithms (line 25) | def test_seed_everything_toggles_deterministic_algorithms() -> None:

FILE: tests/test_device_resolution.py
  function test_resolve_device_mps_falls_back_when_unavailable (line 6) | def test_resolve_device_mps_falls_back_when_unavailable() -> None:

FILE: tests/test_distributed_fail_fast.py
  function test_fail_if_paper_faithful_disabled_blocks_ddp_per_layer_teach (line 7) | def test_fail_if_paper_faithful_disabled_blocks_ddp_per_layer_teach() ->...
  function test_fail_if_paper_faithful_disabled_blocks_ddp_online_updates (line 21) | def test_fail_if_paper_faithful_disabled_blocks_ddp_online_updates() -> ...
  function test_fail_if_paper_faithful_disabled_allows_single_process (line 35) | def test_fail_if_paper_faithful_disabled_allows_single_process() -> None:
  function test_strict_streaming_contract_blocks_ddp_online_features (line 48) | def test_strict_streaming_contract_blocks_ddp_online_features() -> None:
  function test_fail_if_paper_faithful_disabled_blocks_ddp_boundary_targets (line 63) | def test_fail_if_paper_faithful_disabled_blocks_ddp_boundary_targets() -...
  function test_fail_if_paper_faithful_disabled_blocks_ddp_attention_cache_carry (line 78) | def test_fail_if_paper_faithful_disabled_blocks_ddp_attention_cache_carr...

FILE: tests/test_eval_builders.py
  function test_commonsenseqa_builder (line 9) | def test_commonsenseqa_builder() -> None:
  function test_openbookqa_builder (line 24) | def test_openbookqa_builder() -> None:

FILE: tests/test_eval_state.py
  function _transformer_model (line 12) | def _transformer_model() -> HOPEModel:
  function test_parse_eval_state_mode_variants (line 25) | def test_parse_eval_state_mode_variants() -> None:
  function test_forward_with_eval_state_attention_cache_continuity (line 32) | def test_forward_with_eval_state_attention_cache_continuity() -> None:
  function test_forward_with_eval_state_none_state_passthrough (line 51) | def test_forward_with_eval_state_none_state_passthrough() -> None:

FILE: tests/test_eval_state_cli.py
  function _load_eval_script (line 7) | def _load_eval_script(name: str):
  function test_zeroshot_rejects_carry_eval_state_mode (line 22) | def test_zeroshot_rejects_carry_eval_state_mode() -> None:
  function test_zeroshot_allows_reset_eval_state_mode_for_task_listing (line 42) | def test_zeroshot_allows_reset_eval_state_mode_for_task_listing() -> None:
  function test_niah_rejects_carry_eval_state_mode_before_loading_inputs (line 62) | def test_niah_rejects_carry_eval_state_mode_before_loading_inputs(tmp_pa...

FILE: tests/test_faithfulness_harness.py
  function _cms_delta_l1 (line 8) | def _cms_delta_l1(state, level_name: str) -> float:
  function test_e2e_update_paths_and_surprise_gate (line 13) | def test_e2e_update_paths_and_surprise_gate() -> None:

FILE: tests/test_fast_state_batch_semantics.py
  function test_fast_state_batch_semantics_raises_when_strict (line 7) | def test_fast_state_batch_semantics_raises_when_strict() -> None:
  function test_fast_state_batch_semantics_allows_batch1 (line 18) | def test_fast_state_batch_semantics_allows_batch1() -> None:
  function test_fast_state_batch_semantics_warns_with_structured_payload_when_not_strict (line 28) | def test_fast_state_batch_semantics_warns_with_structured_payload_when_n...

FILE: tests/test_fast_state_forward_equivalence.py
  function test_fast_state_zero_deltas_matches_meta_forward (line 7) | def test_fast_state_zero_deltas_matches_meta_forward() -> None:

FILE: tests/test_fast_state_meta_grads.py
  function test_fast_state_preserves_outer_grads_for_memory_meta_params (line 8) | def test_fast_state_preserves_outer_grads_for_memory_meta_params() -> None:

FILE: tests/test_fast_state_selfmod_meta_grads.py
  function test_hope_selfmod_fast_state_preserves_meta_forward_at_init (line 8) | def test_hope_selfmod_fast_state_preserves_meta_forward_at_init() -> None:
  function test_hope_selfmod_fast_state_preserves_outer_grads_for_meta_memory_init (line 28) | def test_hope_selfmod_fast_state_preserves_outer_grads_for_meta_memory_i...

FILE: tests/test_git_tracked_sizes_check.py
  function test_git_tracked_sizes_check_passes_repo_defaults (line 7) | def test_git_tracked_sizes_check_passes_repo_defaults() -> None:

FILE: tests/test_hope_block.py
  function make_block (line 7) | def make_block() -> HOPEBlock:
  function test_hope_block_forward (line 17) | def test_hope_block_forward() -> None:
  function test_hope_block_self_mod (line 24) | def test_hope_block_self_mod() -> None:

FILE: tests/test_hope_selfmod_fast_state_meta_unchanged.py
  function test_hope_selfmod_fast_state_updates_do_not_mutate_meta_params (line 9) | def test_hope_selfmod_fast_state_updates_do_not_mutate_meta_params() -> ...

FILE: tests/test_hope_selfmod_integration.py
  function test_hope_selfmod_variant_updates_selfmod_state_in_fast_mode (line 8) | def test_hope_selfmod_variant_updates_selfmod_state_in_fast_mode() -> None:

FILE: tests/test_hope_selfmod_update_pass.py
  function test_hope_selfmod_updates_module_params_only_in_update_pass (line 8) | def test_hope_selfmod_updates_module_params_only_in_update_pass() -> None:

FILE: tests/test_levels.py
  function test_level_clock_updates_on_schedule (line 4) | def test_level_clock_updates_on_schedule() -> None:

FILE: tests/test_m3.py
  function test_m3_updates_and_slow_momentum (line 6) | def test_m3_updates_and_slow_momentum() -> None:
  function test_m3_step_matches_reference_denominator_for_first_update (line 30) | def test_m3_step_matches_reference_denominator_for_first_update() -> None:
  function test_m3_two_steps_match_closed_form_without_slow_momentum (line 57) | def test_m3_two_steps_match_closed_form_without_slow_momentum() -> None:
  function test_m3_weight_decay_is_included_in_reference_step (line 93) | def test_m3_weight_decay_is_included_in_reference_step() -> None:
  function test_m3_slow_buffer_resets_and_o2_updates_on_chunk_boundary (line 121) | def test_m3_slow_buffer_resets_and_o2_updates_on_chunk_boundary() -> None:

FILE: tests/test_m3_slow_timing.py
  function test_m3_slow_momentum_applies_next_chunk_not_boundary_step (line 6) | def test_m3_slow_momentum_applies_next_chunk_not_boundary_step() -> None:

FILE: tests/test_memorization.py
  function _tiny_model (line 8) | def _tiny_model() -> HOPEModel:
  function _tiny_model_update_every_call (line 27) | def _tiny_model_update_every_call() -> HOPEModel:
  function _tiny_model_with_self_mod_lr (line 46) | def _tiny_model_with_self_mod_lr(lr: float) -> HOPEModel:
  function _fast_titan_delta_norm (line 62) | def _fast_titan_delta_norm(fast_state, before: dict[str, torch.Tensor]) ...
  function test_memorize_fast_state_does_not_mutate_meta_params (line 72) | def test_memorize_fast_state_does_not_mutate_meta_params() -> None:
  function test_memorize_fast_state_changes_outputs_and_resets (line 84) | def test_memorize_fast_state_changes_outputs_and_resets() -> None:
  function test_memorize_respects_surprise_threshold (line 105) | def test_memorize_respects_surprise_threshold() -> None:
  function test_memorize_paths_filter_blocks_updates (line 116) | def test_memorize_paths_filter_blocks_updates() -> None:
  function test_memorize_online_chunking_updates_once_per_target (line 127) | def test_memorize_online_chunking_updates_once_per_target() -> None:
  function test_teach_mask_restricts_memorization_updates (line 136) | def test_teach_mask_restricts_memorization_updates() -> None:
  function test_self_mod_lr_scales_fast_state_update_magnitude (line 161) | def test_self_mod_lr_scales_fast_state_update_magnitude() -> None:

FILE: tests/test_model.py
  function test_model_forward (line 7) | def test_model_forward() -> None:

FILE: tests/test_model_streaming_cadence.py
  function _metric (line 8) | def _metric(metrics: dict[str, float], key: str) -> float:
  function _build_attention_model (line 12) | def _build_attention_model(*, flush_partial: bool) -> HOPEModel:
  function _build_attention_model_with_period (line 26) | def _build_attention_model_with_period(*, update_period: int) -> HOPEModel:
  function test_model_streaming_cadence_matches_single_call_counts (line 40) | def test_model_streaming_cadence_matches_single_call_counts() -> None:
  function test_model_streaming_cadence_matches_for_multiple_periods (line 80) | def test_model_streaming_cadence_matches_for_multiple_periods(update_per...
  function test_model_finalize_flushes_partial_once (line 115) | def test_model_finalize_flushes_partial_once() -> None:
  function test_slow_cms_level_does_not_starve_under_segmented_calls (line 157) | def test_slow_cms_level_does_not_starve_under_segmented_calls() -> None:

FILE: tests/test_online_chunking.py
  function test_online_chunking_carries_boundary_overlap_and_token_pairs (line 13) | def test_online_chunking_carries_boundary_overlap_and_token_pairs() -> N...
  function test_online_chunking_supports_chunk_size_one (line 26) | def test_online_chunking_supports_chunk_size_one() -> None:
  function test_online_chunking_chunk_size_one_preserves_train_loop_token_accounting (line 36) | def test_online_chunking_chunk_size_one_preserves_train_loop_token_accou...
  function test_online_boundary_chunks_emit_next_tokens_and_exact_target_count (line 46) | def test_online_boundary_chunks_emit_next_tokens_and_exact_target_count(...
  function _supervised_targets_overlap (line 59) | def _supervised_targets_overlap(tokens: torch.Tensor, chunk_size: int) -...
  function _supervised_targets_boundary (line 68) | def _supervised_targets_boundary(tokens: torch.Tensor, chunk_size: int) ...
  function _build_transformer_model (line 78) | def _build_transformer_model() -> HOPEModel:
  function test_online_boundary_chunked_loss_matches_monolithic_with_attention_cache (line 91) | def test_online_boundary_chunked_loss_matches_monolithic_with_attention_...
  function test_online_target_coverage_property_randomized (line 128) | def test_online_target_coverage_property_randomized() -> None:
  function test_chunk_schedule_permutations_preserve_supervision_set (line 140) | def test_chunk_schedule_permutations_preserve_supervision_set() -> None:
  function test_per_layer_teach_with_boundary_chunks_runs_update_path (line 155) | def test_per_layer_teach_with_boundary_chunks_runs_update_path() -> None:

FILE: tests/test_optim.py
  function test_deep_momentum_nl_preconditioner_projects_grad (line 6) | def test_deep_momentum_nl_preconditioner_projects_grad() -> None:
  function test_deep_momentum_nl_preconditioner_reduces_simple_objective (line 18) | def test_deep_momentum_nl_preconditioner_reduces_simple_objective() -> N...
  function test_deep_momentum_keeps_state_per_param_key (line 32) | def test_deep_momentum_keeps_state_per_param_key() -> None:
  function test_deep_momentum_nl_preconditioner_skips_mismatched_shapes (line 44) | def test_deep_momentum_nl_preconditioner_skips_mismatched_shapes() -> None:
  function test_deep_momentum_nl_preconditioner_outputs_orthogonal_update (line 53) | def test_deep_momentum_nl_preconditioner_outputs_orthogonal_update() -> ...

FILE: tests/test_optimizer_param_policy.py
  function _make_small_hope_model (line 9) | def _make_small_hope_model() -> HOPEModel:
  function _optimizer_param_set (line 23) | def _optimizer_param_set(optimizer: torch.optim.Optimizer) -> set[torch....
  function test_param_policy_all_includes_all_trainable_params (line 31) | def test_param_policy_all_includes_all_trainable_params() -> None:
  function test_param_policy_exclude_memory_drops_memory_params (line 44) | def test_param_policy_exclude_memory_drops_memory_params() -> None:
  function test_param_policy_only_memory_keeps_only_memory_params (line 63) | def test_param_policy_only_memory_keeps_only_memory_params() -> None:

FILE: tests/test_package_release_script.py
  function test_package_script_includes_train_flags_and_excludes_raw_data (line 8) | def test_package_script_includes_train_flags_and_excludes_raw_data(tmp_p...

FILE: tests/test_paper_faithful_configs.py
  function _compose_config (line 9) | def _compose_config(name: str, overrides: list[str] | None = None):
  function test_pilot_paper_faithful_config_composes (line 16) | def test_pilot_paper_faithful_config_composes() -> None:
  function test_pilot_selfmod_paper_faithful_config_composes (line 33) | def test_pilot_selfmod_paper_faithful_config_composes() -> None:
  function test_paper_faithful_variants_are_explicitly_paper_defined (line 50) | def test_paper_faithful_variants_are_explicitly_paper_defined() -> None:
  function test_pilot_paper_faithful_override_to_boundary_state_mode_applies (line 58) | def test_pilot_paper_faithful_override_to_boundary_state_mode_applies() ...
  function test_pilot_paper_faithful_never_implicitly_falls_back_to_stopgrad (line 66) | def test_pilot_paper_faithful_never_implicitly_falls_back_to_stopgrad() ...

FILE: tests/test_phase2_memorization_delta.py
  function _tiny_variant (line 8) | def _tiny_variant(variant: str) -> HOPEModel:
  function test_hope_attention_adapts_transformer_does_not (line 25) | def test_hope_attention_adapts_transformer_does_not() -> None:

FILE: tests/test_residual_mlp_memory.py
  function test_residual_mlp_memory_matches_eq91_when_dims_match (line 7) | def test_residual_mlp_memory_matches_eq91_when_dims_match() -> None:
  function test_residual_mlp_memory_uses_projection_skip_when_dims_differ (line 18) | def test_residual_mlp_memory_uses_projection_skip_when_dims_differ() -> ...
  function test_residual_mlp_memory_disables_projection_skip_in_faithful_mode (line 23) | def test_residual_mlp_memory_disables_projection_skip_in_faithful_mode()...

FILE: tests/test_run_features.py
  class _CaptureLogger (line 11) | class _CaptureLogger:
    method __init__ (line 12) | def __init__(self) -> None:
    method log (line 15) | def log(self, data: dict[str, object], step: int) -> None:
  function _tiny_model (line 19) | def _tiny_model() -> HOPEModel:
  function _tiny_cfg (line 32) | def _tiny_cfg(algorithm_mode: str) -> object:
  function test_run_features_reports_stopgrad_mode_flag (line 50) | def test_run_features_reports_stopgrad_mode_flag() -> None:
  function test_run_features_reports_boundary_state_mode_flag (line 62) | def test_run_features_reports_boundary_state_mode_flag() -> None:

FILE: tests/test_self_modifying_titans.py
  function test_self_modifying_titans_forward_shape (line 6) | def test_self_modifying_titans_forward_shape() -> None:
  function test_self_modifying_titans_updates_fast_state (line 13) | def test_self_modifying_titans_updates_fast_state() -> None:
  function test_self_modifying_titans_supports_batch_fast_state_updates (line 24) | def test_self_modifying_titans_supports_batch_fast_state_updates() -> None:
  function test_self_modifying_titans_chunked_outputs_match_no_update_with_single_chunk (line 35) | def test_self_modifying_titans_chunked_outputs_match_no_update_with_sing...
  function test_self_modifying_titans_flushes_partial_chunks_for_memory_updates (line 57) | def test_self_modifying_titans_flushes_partial_chunks_for_memory_updates...

FILE: tests/test_selfmod_adaptive_q.py
  function test_selfmod_fixed_q_does_not_update_q_memory (line 6) | def test_selfmod_fixed_q_does_not_update_q_memory() -> None:
  function test_selfmod_adaptive_q_updates_q_memory (line 16) | def test_selfmod_adaptive_q_updates_q_memory() -> None:

FILE: tests/test_selfmod_dgd_linear.py
  function test_selfmod_linear_memory_l2_grad_matches_analytic (line 10) | def test_selfmod_linear_memory_l2_grad_matches_analytic() -> None:

FILE: tests/test_selfmod_grad_flow.py
  function test_hope_selfmod_forward_allows_outer_gradients (line 8) | def test_hope_selfmod_forward_allows_outer_gradients() -> None:

FILE: tests/test_selfmod_local_conv.py
  function test_selfmod_local_conv_is_causal (line 6) | def test_selfmod_local_conv_is_causal() -> None:

FILE: tests/test_selfmod_online.py
  function test_selfmod_updates_on_update_pass_even_with_zero_teach_signal (line 8) | def test_selfmod_updates_on_update_pass_even_with_zero_teach_signal() ->...

FILE: tests/test_strict_streaming_contract.py
  function test_strict_streaming_contract_rejects_non_paper_variant (line 14) | def test_strict_streaming_contract_rejects_non_paper_variant() -> None:
  function test_non_strict_streaming_contract_warns_for_non_paper_variant (line 25) | def test_non_strict_streaming_contract_warns_for_non_paper_variant(
  function test_strict_streaming_contract_allows_paper_defined_variants (line 40) | def test_strict_streaming_contract_allows_paper_defined_variants() -> None:
  function test_online_updates_without_fast_state_warns_when_not_strict (line 51) | def test_online_updates_without_fast_state_warns_when_not_strict(
  function test_online_updates_without_fast_state_fails_in_strict_mode (line 70) | def test_online_updates_without_fast_state_fails_in_strict_mode() -> None:
  function test_online_supervised_pairs_mismatch_warns_when_not_strict (line 85) | def test_online_supervised_pairs_mismatch_warns_when_not_strict(
  function test_online_supervised_pairs_mismatch_fails_in_strict_mode (line 93) | def test_online_supervised_pairs_mismatch_fails_in_strict_mode() -> None:
  function test_algorithm_mode_defaults_to_two_pass_stopgrad_updates (line 98) | def test_algorithm_mode_defaults_to_two_pass_stopgrad_updates() -> None:
  function test_algorithm_mode_rejects_unknown_values (line 103) | def test_algorithm_mode_rejects_unknown_values() -> None:
  function test_algorithm_mode_accepts_boundary_state_mode_name (line 109) | def test_algorithm_mode_accepts_boundary_state_mode_name() -> None:
  function test_boundary_state_mode_requires_online_per_layer_and_fast_state (line 114) | def test_boundary_state_mode_requires_online_per_layer_and_fast_state() ...
  function test_boundary_state_mode_rejects_distributed (line 133) | def test_boundary_state_mode_rejects_distributed() -> None:
  function test_boundary_state_mode_emits_experimental_warning (line 152) | def test_boundary_state_mode_emits_experimental_warning(
  function test_online_cache_requires_boundary_targets (line 176) | def test_online_cache_requires_boundary_targets() -> None:
  function test_online_cache_requires_online_updates (line 190) | def test_online_cache_requires_online_updates() -> None:

FILE: tests/test_surprise_metric.py
  function _cms_delta_l1 (line 8) | def _cms_delta_l1(state, level_name: str) -> float:
  function _logit_entropy (line 13) | def _logit_entropy(logits: torch.Tensor) -> float:
  function _next_token_loss (line 20) | def _next_token_loss(logits: torch.Tensor, tokens: torch.Tensor) -> float:
  function test_surprise_metric_loss_gates_updates_when_threshold_set (line 28) | def test_surprise_metric_loss_gates_updates_when_threshold_set() -> None:
  function test_surprise_metric_entropy_gates_updates_when_threshold_set (line 69) | def test_surprise_metric_entropy_gates_updates_when_threshold_set() -> N...
  function test_surprise_metric_requires_external_value_when_threshold_set (line 110) | def test_surprise_metric_requires_external_value_when_threshold_set() ->...
  function test_surprise_metric_l2_uses_chunk_gate_then_token_mask (line 136) | def test_surprise_metric_l2_uses_chunk_gate_then_token_mask() -> None:

FILE: tests/test_surprise_override.py
  function _entropy (line 6) | def _entropy(logits: torch.Tensor) -> float:
  function test_logit_entropy_surprise_uses_boundary_target_step_when_present (line 12) | def test_logit_entropy_surprise_uses_boundary_target_step_when_present()...
  function test_logit_entropy_surprise_default_excludes_last_unsupervised_step (line 27) | def test_logit_entropy_surprise_default_excludes_last_unsupervised_step(...
  function test_logit_entropy_surprise_returns_none_when_no_supervised_steps (line 41) | def test_logit_entropy_surprise_returns_none_when_no_supervised_steps() ...

FILE: tests/test_teach_signal.py
  function _tiny_config (line 10) | def _tiny_config() -> ModelConfig:
  function _tiny_titan_config (line 28) | def _tiny_titan_config() -> TitanOnlyModelConfig:
  function test_teach_signal_matches_gradient (line 41) | def test_teach_signal_matches_gradient() -> None:
  function test_teach_signal_matches_gradient_titan (line 69) | def test_teach_signal_matches_gradient_titan() -> None:
  function test_per_layer_teach_signal_shapes (line 97) | def test_per_layer_teach_signal_shapes() -> None:
  function test_per_layer_teach_signal_matches_autograd_grads (line 113) | def test_per_layer_teach_signal_matches_autograd_grads() -> None:
  function test_teach_signal_matches_gradient_with_ignore_index (line 129) | def test_teach_signal_matches_gradient_with_ignore_index() -> None:
  function test_teach_signal_matches_gradient_with_boundary_target (line 164) | def test_teach_signal_matches_gradient_with_boundary_target() -> None:
  function test_teach_signal_matches_gradient_with_boundary_target_and_ignore_index (line 194) | def test_teach_signal_matches_gradient_with_boundary_target_and_ignore_i...

FILE: tests/test_tied_weight_guard.py
  function _tiny_hope_model (line 10) | def _tiny_hope_model() -> HOPEModel:
  function test_paper_auditing_guard_accepts_tied_weights (line 23) | def test_paper_auditing_guard_accepts_tied_weights() -> None:
  function test_paper_auditing_guard_rejects_untied_weights (line 29) | def test_paper_auditing_guard_rejects_untied_weights() -> None:

FILE: tests/test_variants.py
  function _base_cfg (line 10) | def _base_cfg(*, block_variant: str) -> ModelConfig:
  function test_hope_hybrid_variant_contains_titan_memory (line 25) | def test_hope_hybrid_variant_contains_titan_memory() -> None:
  function test_hope_attention_variant_excludes_titan_memory (line 32) | def test_hope_attention_variant_excludes_titan_memory() -> None:
  function test_hope_selfmod_variant_excludes_titan_memory (line 43) | def test_hope_selfmod_variant_excludes_titan_memory() -> None:
  function test_transformer_variant_runs_with_and_without_fast_state (line 56) | def test_transformer_variant_runs_with_and_without_fast_state() -> None:

FILE: tests/test_verify_docs_refs.py
  function _load_verify_docs_refs (line 5) | def _load_verify_docs_refs():
  function test_parse_referenced_paths_extracts_repo_paths (line 15) | def test_parse_referenced_paths_extracts_repo_paths() -> None:
  function test_verify_docs_refs_reports_missing_paths (line 31) | def test_verify_docs_refs_reports_missing_paths(tmp_path: Path) -> None:
  function test_verify_docs_refs_validates_markdown_anchors (line 47) | def test_verify_docs_refs_validates_markdown_anchors(tmp_path: Path) -> ...

FILE: tests/test_verify_update_cadence.py
  function _load_verify_cadence (line 6) | def _load_verify_cadence():
  function _write_log (line 18) | def _write_log(path: Path, payload: dict[str, float]) -> None:
  function test_verify_update_cadence_no_flush (line 22) | def test_verify_update_cadence_no_flush(tmp_path: Path) -> None:
  function test_verify_update_cadence_with_flush (line 44) | def test_verify_update_cadence_with_flush(tmp_path: Path) -> None:
  function test_verify_update_cadence_detects_mismatch (line 66) | def test_verify_update_cadence_detects_mismatch(tmp_path: Path) -> None:
  function test_verify_update_cadence_report_schema_is_non_empty (line 88) | def test_verify_update_cadence_report_schema_is_non_empty(tmp_path: Path...

FILE: train.py
  function main (line 11) | def main(cfg: DictConfig) -> None:

FILE: train_deepspeed.py
  function setup_distributed (line 30) | def setup_distributed() -> DistributedContext:
  function load_ds_config (line 40) | def load_ds_config(path: str | Path) -> dict:
  function main (line 46) | def main(cfg: DictConfig) -> None:

FILE: train_dist.py
  function setup_distributed (line 13) | def setup_distributed(backend: str | None = None) -> DistributedContext:
  function main (line 28) | def main(cfg: DictConfig) -> None:

FILE: train_fsdp.py
  function setup_distributed (line 38) | def setup_distributed() -> DistributedContext:
  function build_fsdp_model (line 48) | def build_fsdp_model(cfg: DictConfig, device: torch.device) -> tuple[FSD...
  function unwrap_model (line 72) | def unwrap_model(module: torch.nn.Module) -> torch.nn.Module:
  function save_checkpoint (line 80) | def save_checkpoint(
  function maybe_resume (line 114) | def maybe_resume(cfg: DictConfig, model: FSDP, optimizer: torch.optim.Op...
  function main (line 136) | def main(cfg: DictConfig) -> None:
Condensed preview — 337 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,477K chars).
[
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "chars": 46,
    "preview": "blank_issues_enabled: false\ncontact_links: []\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/eval_request.md",
    "chars": 604,
    "preview": "---\nname: Evaluation request\nabout: Propose a new benchmark or diagnostic to add\ntitle: \"[Eval] \"\nlabels: [\"evaluation\","
  },
  {
    "path": ".github/ISSUE_TEMPLATE/faithfulness_gap.md",
    "chars": 737,
    "preview": "---\nname: Faithfulness gap\nabout: Report deviations vs. the Nested Learning / HOPE specs\ntitle: \"[Faithfulness] \"\nlabels"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/perf_regression.md",
    "chars": 770,
    "preview": "---\nname: Performance regression\nabout: Report a training / eval performance drop vs. baseline\ntitle: \"[Perf] \"\nlabels: "
  },
  {
    "path": ".github/workflows/ci.yml",
    "chars": 4942,
    "preview": "name: CI\n\non:\n  push:\n    branches: [\"main\"]\n  pull_request:\n    branches: [\"main\"]\n\njobs:\n  lint-and-test:\n    runs-on:"
  },
  {
    "path": ".github/workflows/packages.yml",
    "chars": 1880,
    "preview": "name: Packages\n\non:\n  push:\n    tags:\n      - \"v*\"\n  workflow_dispatch:\n\npermissions:\n  contents: read\n  packages: write"
  },
  {
    "path": ".github/workflows/release.yml",
    "chars": 3311,
    "preview": "name: Release\n\non:\n  push:\n    tags:\n      - \"v*\"\n\npermissions:\n  contents: write\n  id-token: write\n\njobs:\n  build:\n    "
  },
  {
    "path": ".github/workflows/security.yml",
    "chars": 982,
    "preview": "name: Security\n\non:\n  push:\n    branches: [\"main\"]\n  pull_request:\n    branches: [\"main\"]\n  schedule:\n    - cron: \"0 6 *"
  },
  {
    "path": ".gitignore",
    "chars": 659,
    "preview": "# Environment / tooling\n.venv/\n__pycache__/\n*.pyc\n.pytest_cache/\n.ruff_cache/\n.mypy_cache/\n\n# Local artifacts\nlogs/\narti"
  },
  {
    "path": "CHANGELOG.md",
    "chars": 6097,
    "preview": "# Changelog\n\nAll notable changes to this project will be documented here. The format loosely follows [Keep a Changelog]("
  },
  {
    "path": "LICENSE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 18671,
    "preview": "# Nested Learning Reproduction\n\n![CI](https://github.com/kmccleary3301/nested_learning/actions/workflows/ci.yml/badge.sv"
  },
  {
    "path": "TODO.md",
    "chars": 7254,
    "preview": "# Project TODOs\n\n## Planner Finalization – P0 Foundation\n- [x] Add first-class package CLI (`nl`) with `doctor`, `smoke`"
  },
  {
    "path": "configs/ablations/cms_sparse.yaml",
    "chars": 858,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  dim: 384\n  num_layers: 8\n  heads: 6\n  titan_level:\n    name: titan\n    update_"
  },
  {
    "path": "configs/ablations/selfmod_chunked_8_64.yaml",
    "chars": 476,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: hope_selfmod\n  self_mod_chunk_size: 8\n  self_mod_chunk_size_mem"
  },
  {
    "path": "configs/ablations/selfmod_momentum_off.yaml",
    "chars": 443,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: hope_selfmod\n  self_mod_momentum: 0.0\n\ntrain:\n  online_updates:"
  },
  {
    "path": "configs/ablations/selfmod_momentum_on.yaml",
    "chars": 440,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: hope_selfmod\n  self_mod_momentum: 0.9\n\ntrain:\n  online_updates:"
  },
  {
    "path": "configs/ablations/selfmod_no_alpha.yaml",
    "chars": 434,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: hope_selfmod\n  self_mod_use_alpha: false\n\ntrain:\n  online_updat"
  },
  {
    "path": "configs/ablations/selfmod_no_cms.yaml",
    "chars": 417,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: hope_selfmod\n  cms_levels: []\n\ntrain:\n  online_updates: true\n  "
  },
  {
    "path": "configs/ablations/selfmod_rank1_precond_off.yaml",
    "chars": 445,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: hope_selfmod\n  self_mod_use_rank1_precond: false\n\ntrain:\n  onli"
  },
  {
    "path": "configs/data/continual_segments_sample.yaml",
    "chars": 284,
    "preview": "segments:\n  - name: refinedweb_2018\n    shards_dir: data/shards/refinedweb_sample\n  - name: wikipedia_sample\n    shards_"
  },
  {
    "path": "configs/data/fineweb_edu_longdoc_filtered_sample.yaml",
    "chars": 414,
    "preview": "name: fineweb_edu_longdoc_filtered_sample\ntokenizer_output_dir: artifacts/tokenizer/fineweb_edu_longdoc\ndatasets:\n  - na"
  },
  {
    "path": "configs/data/fineweb_edu_mixture_full.yaml",
    "chars": 353,
    "preview": "name: fineweb_edu_full\ntokenizer_output_dir: artifacts/tokenizer/fineweb_edu\ndatasets:\n  - name: fineweb_edu\n    dataset"
  },
  {
    "path": "configs/data/fineweb_edu_mixture_sample.yaml",
    "chars": 355,
    "preview": "name: fineweb_edu_sample\ntokenizer_output_dir: artifacts/tokenizer/fineweb_edu\ndatasets:\n  - name: fineweb_edu\n    datas"
  },
  {
    "path": "configs/data/refinedweb_mixture.yaml",
    "chars": 1239,
    "preview": "name: refinedweb_mix_v1\ntokenizer_output_dir: artifacts/tokenizer/refinedweb_mix\ndatasets:\n  - name: refinedweb\n    data"
  },
  {
    "path": "configs/data/refinedweb_mixture_filtered.yaml",
    "chars": 1294,
    "preview": "name: refinedweb_mix_filtered\ntokenizer_output_dir: artifacts/tokenizer/refinedweb_mix\ndatasets:\n  - name: refinedweb\n  "
  },
  {
    "path": "configs/data/refinedweb_mixture_full.yaml",
    "chars": 1270,
    "preview": "name: refinedweb_mix_full\ntokenizer_output_dir: artifacts/tokenizer/refinedweb_mix\ndatasets:\n  - name: refinedweb\n    da"
  },
  {
    "path": "configs/data/refinedweb_mixture_sample.yaml",
    "chars": 1288,
    "preview": "name: refinedweb_mix_sample\ntokenizer_output_dir: artifacts/tokenizer/refinedweb_mix\ndatasets:\n  - name: refinedweb\n    "
  },
  {
    "path": "configs/deepspeed/zero3.json",
    "chars": 462,
    "preview": "{\n  \"bf16\": {\n    \"enabled\": true\n  },\n  \"train_batch_size\": 64,\n  \"gradient_accumulation_steps\": 1,\n  \"zero_optimizatio"
  },
  {
    "path": "configs/hope/mid.yaml",
    "chars": 2284,
    "preview": "defaults:\n  - _self_\n\nhydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\n\nmodel:\n  vocab_size: 32000"
  },
  {
    "path": "configs/hope/mid_fsdp.yaml",
    "chars": 886,
    "preview": "defaults:\n  - mid\n  - _self_\n\nmodel:\n  gradient_checkpointing: true\n\ndata:\n  batch_size: 8  # per-rank micro-batch for 2"
  },
  {
    "path": "configs/hope/pilot.yaml",
    "chars": 21,
    "preview": "defaults:\n  - /pilot\n"
  },
  {
    "path": "configs/hope/pilot_attention.yaml",
    "chars": 115,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: hope_attention\n  qk_l2_norm: true\n  local_conv_window: 4\n\n"
  },
  {
    "path": "configs/hope/pilot_selfmod.yaml",
    "chars": 435,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: hope_selfmod\n  # Chunk update cadence (paper §8.2): other memor"
  },
  {
    "path": "configs/hope/pilot_transformer.yaml",
    "chars": 112,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  block_variant: transformer\n  qk_l2_norm: true\n  local_conv_window: 4\n\n"
  },
  {
    "path": "configs/hope/target.yaml",
    "chars": 2912,
    "preview": "defaults:\n  - _self_\n\nhydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\n\nmodel:\n  vocab_size: 32000"
  },
  {
    "path": "configs/hope/target_fsdp.yaml",
    "chars": 878,
    "preview": "defaults:\n  - target\n  - _self_\n\nmodel:\n  gradient_checkpointing: true\n\ndata:\n  batch_size: 4  # per-rank micro-batch\n  "
  },
  {
    "path": "configs/mid_smoke.yaml",
    "chars": 1926,
    "preview": "hydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\n\nmodel:\n  vocab_size: 32000\n  dim: 256\n  num_laye"
  },
  {
    "path": "configs/mid_stage2.yaml",
    "chars": 2146,
    "preview": "hydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\n\nmodel:\n  vocab_size: 32000\n  dim: 768\n  num_laye"
  },
  {
    "path": "configs/mid_stage2_smoke.yaml",
    "chars": 1990,
    "preview": "hydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\n\nmodel:\n  vocab_size: 32000\n  dim: 512\n  num_laye"
  },
  {
    "path": "configs/mid_titan_baseline.yaml",
    "chars": 1773,
    "preview": "hydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\n\nmodel:\n  type: titan\n  vocab_size: 32000\n  dim: "
  },
  {
    "path": "configs/pilot.yaml",
    "chars": 2632,
    "preview": "defaults:\n  - _self_\n\nhydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\n\nmodel:\n  vocab_size: 32000"
  },
  {
    "path": "configs/pilot_paper_faithful.yaml",
    "chars": 1577,
    "preview": "defaults:\n  - /pilot\n  - _self_\n\nmodel:\n  # Explicit paper-defined variant (avoid inheriting repo default `hope_hybrid`)"
  },
  {
    "path": "configs/pilot_selfmod_paper_faithful.yaml",
    "chars": 445,
    "preview": "defaults:\n  - /pilot_paper_faithful\n  - _self_\n\nmodel:\n  block_variant: hope_selfmod\n  # Chunk update cadence (paper §8."
  },
  {
    "path": "configs/pilot_smoke.yaml",
    "chars": 1393,
    "preview": "hydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\n\nmodel:\n  vocab_size: 32000\n  dim: 128\n  num_laye"
  },
  {
    "path": "configs/resolved/cms_sparse_eval.yaml",
    "chars": 2110,
    "preview": "hydra:\n  run:\n    dir: .\n  output_subdir: null\n  job:\n    chdir: false\nmodel:\n  vocab_size: 32000\n  dim: 384\n  num_layer"
  },
  {
    "path": "configs/resolved/phase2_pilot_attention_eval.yaml",
    "chars": 1008,
    "preview": "model:\n  vocab_size: 32000\n  dim: 512\n  num_layers: 12\n  heads: 8\n  teach_scale: 0.10\n  teach_clip: 5.0\n  surprise_thres"
  },
  {
    "path": "configs/resolved/phase2_pilot_transformer_eval.yaml",
    "chars": 1005,
    "preview": "model:\n  vocab_size: 32000\n  dim: 512\n  num_layers: 12\n  heads: 8\n  teach_scale: 0.10\n  teach_clip: 5.0\n  surprise_thres"
  },
  {
    "path": "docker/Dockerfile.dist",
    "chars": 282,
    "preview": "FROM scratch\n\nLABEL org.opencontainers.image.title=\"nested-learning-dist\"\nLABEL org.opencontainers.image.description=\"OC"
  },
  {
    "path": "docs/BUG_REPORT_CHECKLIST.md",
    "chars": 1121,
    "preview": "# Bug Report Checklist\n\nUse this checklist when filing reproducibility or correctness issues.\n\n## Required Context\n\n- Co"
  },
  {
    "path": "docs/COMPATIBILITY_MATRIX.md",
    "chars": 2366,
    "preview": "# Compatibility Matrix\n\nThis document defines the support contract for runtime/backends.\n\n## Support Tiers\n\n- **Tier 1 ("
  },
  {
    "path": "docs/FSDP_SCALING_GUIDE.md",
    "chars": 3541,
    "preview": "# FSDP/ZeRO Scaling Guide (RTX 6000 Ada Dual-GPU Rig)\n\nThis note captures the configuration we will use for the Stage 2 "
  },
  {
    "path": "docs/IMPLEMENTATION_STATUS.md",
    "chars": 4264,
    "preview": "# Implementation Status (Source of Truth)\n\nThis table is the canonical mechanism-status map for this repo.\n\n| Mechanism "
  },
  {
    "path": "docs/P4_REMEDIATION_PLAN.md",
    "chars": 2813,
    "preview": "# P4 Remediation Plan — Status & Tracking (Paper-Faithful HOPE/Nested Learning)\n\nThis file started as an execution check"
  },
  {
    "path": "docs/PACKAGE_RELEASE_CHECKLIST.md",
    "chars": 1750,
    "preview": "# Package Release Checklist (PyPI/GitHub)\n\nUse this checklist for package distribution releases (separate from checkpoin"
  },
  {
    "path": "docs/PAPER_COMPLIANCE.md",
    "chars": 23576,
    "preview": "# Paper Compliance / Fidelity Guide (Nested Learning / HOPE)\n\nThis doc explains the **fidelity‑critical behaviors** (wha"
  },
  {
    "path": "docs/PHASE2_LONG_CONTEXT_COMPARISON.md",
    "chars": 2111,
    "preview": "# Phase 2 – HOPE-Attention vs Transformer (Long-Context Sanity)\n\nThis repo includes a lightweight Phase‑2 sanity check t"
  },
  {
    "path": "docs/PHASE_2_PLAN.md",
    "chars": 7062,
    "preview": "# Phase 2 Plan – Execution & Results Packaging\n\n## Immediate Remediation Tasks (from EX_PHASE_1_CRITIQUE)\n\nBefore resumi"
  },
  {
    "path": "docs/PYPI_TRUSTED_PUBLISHING.md",
    "chars": 1963,
    "preview": "# PyPI Trusted Publishing Setup\n\nThis repository ships `.github/workflows/release.yml` for OIDC-based publishing.\nUse th"
  },
  {
    "path": "docs/STREAMING_CONTRACT.md",
    "chars": 4214,
    "preview": "# Streaming Contract (Mechanism-Auditing Mode)\n\nThis document defines the exact streaming semantics used by the single-G"
  },
  {
    "path": "docs/VERSIONING_POLICY.md",
    "chars": 1329,
    "preview": "# Versioning and Stability Policy\n\nThis repository follows SemVer-style versioning with explicit 0.x constraints.\n\n## Cu"
  },
  {
    "path": "docs/compute_plan.md",
    "chars": 1376,
    "preview": "# Compute Reservation Plan (Stage 2)\n\n## Hardware\n- Cluster: 2× nodes with dual NVIDIA RTX 6000 Ada (48 GB VRAM) + 64-co"
  },
  {
    "path": "docs/continual_classification_eval.md",
    "chars": 2957,
    "preview": "# Continual Classification Evaluation (CLINC / Banking77 / DBpedia14)\n\nThe Nested Learning paper highlights **class-incr"
  },
  {
    "path": "docs/continual_eval.md",
    "chars": 2159,
    "preview": "# Continual-Learning Evaluation Guide\n\nUse `scripts/eval/continual.py` to quantify forgetting across streaming segments."
  },
  {
    "path": "docs/data_pipeline.md",
    "chars": 11687,
    "preview": "# Data Pipeline (Stage 2)\n\nThis document explains how to generate tokenizer artifacts and token shards for Stage 2 train"
  },
  {
    "path": "docs/env_matrix.md",
    "chars": 3257,
    "preview": "# Environment Matrix – Stage 2\n\nThis document captures the exact runtime state used for the Stage 2 sprint so collaborat"
  },
  {
    "path": "docs/experiments_report.md",
    "chars": 14353,
    "preview": "# Experiments Report – Nested Learning Reproduction\n\n_Draft covering work completed through 9 Nov 2025. This document is"
  },
  {
    "path": "docs/future_directions.md",
    "chars": 2622,
    "preview": "# Future Directions – Nested Learning Reproduction\n\nThis roadmap outlines high-impact areas for contributors once the in"
  },
  {
    "path": "docs/phase2_comparison.md",
    "chars": 2548,
    "preview": "# Phase 2 – HOPE-Attention vs Transformer Baseline\n\nPhase 2 is “implementation-complete” when we can compare the **paper"
  },
  {
    "path": "docs/release_checklist.md",
    "chars": 2821,
    "preview": "# Release Checklist (Stage 2)\n\nUse this list before tagging/publishing any checkpoint bundle.\n\n## Faithfulness & Tests\n-"
  },
  {
    "path": "docs/scaling_guidance.md",
    "chars": 5632,
    "preview": "# Scaling Guidance – Nested Learning Reproduction\n\nThis document describes how to extend the current smoke-tested Nested"
  },
  {
    "path": "docs/spec_interfaces.md",
    "chars": 1505,
    "preview": "# Interface Notes for Nested Learning Modules\n\n## LevelClock / LevelSpec (`nested_learning.levels`)\n- `LevelSpec`: name,"
  },
  {
    "path": "docs/sprint_next_plan.md",
    "chars": 10462,
    "preview": "# Sprint Plan – Stage 2 Pilot & Results Sprint\n\n**Window:** Nov 10 – Nov 17, 2025 (7 days)  \n**Goal:** Produce reproduci"
  },
  {
    "path": "docs/stage2_plan.md",
    "chars": 8757,
    "preview": "# Stage 2 Plan – Nested Learning (HOPE) Results Reproduction\n\nThis document details Stage 2 goals: reproduce the key exp"
  },
  {
    "path": "docs/stage2_progress.md",
    "chars": 1710,
    "preview": "# Stage 2 Progress\n\nLast updated: `2026-02-24`\n\n## Sprint Status\n\n- **A-series (algorithm-mode + boundary-state fidelity"
  },
  {
    "path": "docs/templates/checkpoint_report.md",
    "chars": 1614,
    "preview": "# Checkpoint Report Template\n\nCopy this template into `reports/checkpoints/<run>.md` (or similar) for every published ch"
  },
  {
    "path": "docs/zeroshot_eval.md",
    "chars": 4242,
    "preview": "# Zero-shot Evaluation Guide\n\nThe script `scripts/eval/zeroshot.py` evaluates HOPE checkpoints on \ncommon reasoning benc"
  },
  {
    "path": "eval/continual_dummy.json",
    "chars": 271,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/examples/pilot_dummy.pt\",\n    \"segment_losses\": {\n      \"refinedweb_2018\": 10.5508940"
  },
  {
    "path": "eval/continual_mid_stage2.json",
    "chars": 959,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2/step_000100.pt\"\n  ],\n  \"segments\": [\n    {\n      \"name\": \"ref"
  },
  {
    "path": "eval/continual_mid_stage2_smoke.json",
    "chars": 963,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2_smoke/step_000060.pt\"\n  ],\n  \"segments\": [\n    {\n      \"name\""
  },
  {
    "path": "eval/continual_mid_stage2_ts10.json",
    "chars": 879,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2_ts10/step_000080.pt\"\n  ],\n  \"segments\": [\n    {\n      \"name\":"
  },
  {
    "path": "eval/continual_mid_stage2_ts10_single120_clip.json",
    "chars": 983,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2_ts10_single120_clip/step_000120.pt\"\n  ],\n  \"segments\": [\n    "
  },
  {
    "path": "eval/continual_mid_stage2_ts10_single140_schedC.json",
    "chars": 984,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2_ts10_single140_schedC/step_000140.pt\"\n  ],\n  \"segments\": [\n  "
  },
  {
    "path": "eval/continual_mid_stage2_ts10_single220_schedD.json",
    "chars": 981,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2_ts10_single220_schedD/step_000220.pt\"\n  ],\n  \"segments\": [\n  "
  },
  {
    "path": "eval/continual_mid_stage2_ts10_single80.json",
    "chars": 888,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2_ts10_single80/step_000080.pt\"\n  ],\n  \"segments\": [\n    {\n    "
  },
  {
    "path": "eval/continual_mid_stage2_ts10_single80lr2e5.json",
    "chars": 893,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2_ts10_single80lr2e5/step_000080.pt\"\n  ],\n  \"segments\": [\n    {"
  },
  {
    "path": "eval/continual_mid_stage2_ts20.json",
    "chars": 879,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_stage2_ts20/step_000080.pt\"\n  ],\n  \"segments\": [\n    {\n      \"name\":"
  },
  {
    "path": "eval/continual_mid_titan_baseline.json",
    "chars": 965,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/mid_titan_baseline/step_000200.pt\"\n  ],\n  \"segments\": [\n    {\n      \"nam"
  },
  {
    "path": "eval/continual_pilot.json",
    "chars": 2252,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot_relaunch/step_477000.pt\",\n    \"segment_losses\": {\n      \"refinedweb"
  },
  {
    "path": "eval/continual_pilot_cms_nochunk_step5000.json",
    "chars": 289,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot_cms_nochunk/step_005000.pt\",\n    \"segment_losses\": {\n      \"refined"
  },
  {
    "path": "eval/continual_pilot_cms_sparse_step5000.json",
    "chars": 288,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot_cms_sparse/step_005000.pt\",\n    \"segment_losses\": {\n      \"refinedw"
  },
  {
    "path": "eval/continual_pilot_multi.json",
    "chars": 3631,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot/step_005000.pt\",\n    \"segment_losses\": {\n      \"refinedweb_2018\": 4"
  },
  {
    "path": "eval/continual_pilot_opt_adamw_step5000.json",
    "chars": 303,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot-opt-adamw-20251115173858/step_005000.pt\",\n    \"segment_losses\": {\n "
  },
  {
    "path": "eval/continual_pilot_opt_muon_step5000.json",
    "chars": 302,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot-opt-muon-20251115180139/step_005000.pt\",\n    \"segment_losses\": {\n  "
  },
  {
    "path": "eval/continual_pilot_selfmod_off_step5000.json",
    "chars": 288,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot_selfmod_off/step_005000.pt\",\n    \"segment_losses\": {\n      \"refined"
  },
  {
    "path": "eval/continual_pilot_step22000.json",
    "chars": 273,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/pilot_release/checkpoint.pt\",\n    \"segment_losses\": {\n      \"refinedweb_2018\": 47.078"
  },
  {
    "path": "eval/continual_pilot_step230000.json",
    "chars": 1189,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot/step_230000.pt\",\n    \"segment_losses\": {\n      \"refinedweb_2018\": 8"
  },
  {
    "path": "eval/continual_pilot_teach05_long_step25000.json",
    "chars": 292,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot_teach05_long/step_025000.pt\",\n    \"segment_losses\": {\n      \"refine"
  },
  {
    "path": "eval/continual_pilot_teach05_step2000.json",
    "chars": 285,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot_teach05/step_002000.pt\",\n    \"segment_losses\": {\n      \"refinedweb_"
  },
  {
    "path": "eval/continual_pilot_teach15_long_step25000.json",
    "chars": 290,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot_teach15_long/step_025000.pt\",\n    \"segment_losses\": {\n      \"refine"
  },
  {
    "path": "eval/continual_pilot_teach15_step2000.json",
    "chars": 277,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/pilot_teach15/step_002000.pt\",\n    \"segment_losses\": {\n      \"refinedweb_"
  },
  {
    "path": "eval/continual_smoke.json",
    "chars": 961,
    "preview": "{\n  \"checkpoints\": [\n    \"artifacts/checkpoints/pilot_smoke/step_000010.pt\"\n  ],\n  \"segments\": [\n    {\n      \"name\": \"re"
  },
  {
    "path": "eval/continual_titan.json",
    "chars": 2246,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/mid_titan_long/step_032000.pt\",\n    \"segment_losses\": {\n      \"refinedweb"
  },
  {
    "path": "eval/continual_titan_relaunch_step001000.json",
    "chars": 1084,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/mid_titan_long/step_001000.pt\",\n    \"segment_losses\": {\n      \"refinedweb"
  },
  {
    "path": "eval/continual_titan_step25000.json",
    "chars": 1159,
    "preview": "[\n  {\n    \"checkpoint\": \"artifacts/checkpoints/mid_titan_baseline/step_025000.pt\",\n    \"segment_losses\": {\n      \"refine"
  },
  {
    "path": "eval/niah_dummy.json",
    "chars": 42,
    "preview": "{\n  \"niah_2048\": 0.0,\n  \"niah_4096\": 0.0\n}"
  },
  {
    "path": "eval/niah_mid_stage2.json",
    "chars": 72,
    "preview": "{\n  \"niah_2048\": 0.3333333333333333,\n  \"niah_4096\": 0.3333333333333333\n}"
  },
  {
    "path": "eval/niah_mid_stage2_smoke.json",
    "chars": 62,
    "preview": "{\n  \"niah_2048\": 0.6,\n  \"niah_4096\": 0.6,\n  \"niah_8192\": 0.6\n}"
  },
  {
    "path": "eval/niah_mid_stage2_ts10.json",
    "chars": 22,
    "preview": "{\n  \"niah_2048\": 0.0\n}"
  },
  {
    "path": "eval/niah_mid_stage2_ts10_single120_clip.json",
    "chars": 22,
    "preview": "{\n  \"niah_2048\": 0.0\n}"
  },
  {
    "path": "eval/niah_mid_stage2_ts10_single140_schedC.json",
    "chars": 22,
    "preview": "{\n  \"niah_2048\": 0.4\n}"
  },
  {
    "path": "eval/niah_mid_stage2_ts10_single220_schedD.json",
    "chars": 42,
    "preview": "{\n  \"niah_2048\": 0.4,\n  \"niah_4096\": 0.8\n}"
  },
  {
    "path": "eval/niah_mid_stage2_ts10_single80.json",
    "chars": 22,
    "preview": "{\n  \"niah_2048\": 0.0\n}"
  },
  {
    "path": "eval/niah_mid_stage2_ts10_single80lr2e5.json",
    "chars": 22,
    "preview": "{\n  \"niah_2048\": 0.0\n}"
  },
  {
    "path": "eval/niah_mid_stage2_ts20.json",
    "chars": 22,
    "preview": "{\n  \"niah_2048\": 0.0\n}"
  },
  {
    "path": "eval/niah_mid_titan_baseline.json",
    "chars": 42,
    "preview": "{\n  \"niah_2048\": 0.6,\n  \"niah_4096\": 0.0\n}"
  },
  {
    "path": "eval/niah_pilot.json",
    "chars": 1259,
    "preview": "{\n  \"niah_2048\": 0.625,\n  \"niah_2048_baseline_accuracy\": 0.625,\n  \"niah_2048_memorize_accuracy\": 0.625,\n  \"niah_2048_mem"
  },
  {
    "path": "eval/niah_pilot_cms_nochunk_step5000.json",
    "chars": 130,
    "preview": "{\n  \"niah_2048\": 0.75,\n  \"niah_4096\": 0.25,\n  \"niah_8192\": 0.25,\n  \"niah_16384\": 0.25,\n  \"niah_32768\": 0.75,\n  \"niah_655"
  },
  {
    "path": "eval/niah_pilot_cms_sparse_step5000.json",
    "chars": 132,
    "preview": "{\n  \"niah_2048\": 0.75,\n  \"niah_4096\": 0.5,\n  \"niah_8192\": 0.625,\n  \"niah_16384\": 0.625,\n  \"niah_32768\": 0.5,\n  \"niah_655"
  },
  {
    "path": "eval/niah_pilot_opt_adamw_step5000.json",
    "chars": 128,
    "preview": "{\n  \"niah_2048\": 0.75,\n  \"niah_4096\": 1.0,\n  \"niah_8192\": 0.5,\n  \"niah_16384\": 0.75,\n  \"niah_32768\": 0.5,\n  \"niah_65536\""
  },
  {
    "path": "eval/niah_pilot_opt_muon_step5000.json",
    "chars": 129,
    "preview": "{\n  \"niah_2048\": 0.5,\n  \"niah_4096\": 0.5,\n  \"niah_8192\": 0.25,\n  \"niah_16384\": 0.75,\n  \"niah_32768\": 0.75,\n  \"niah_65536"
  },
  {
    "path": "eval/niah_pilot_selfmod_off_step5000.json",
    "chars": 130,
    "preview": "{\n  \"niah_2048\": 0.75,\n  \"niah_4096\": 0.75,\n  \"niah_8192\": 0.5,\n  \"niah_16384\": 0.75,\n  \"niah_32768\": 0.25,\n  \"niah_6553"
  },
  {
    "path": "eval/niah_pilot_step22000.json",
    "chars": 62,
    "preview": "{\n  \"niah_2048\": 1.0,\n  \"niah_4096\": 0.0,\n  \"niah_8192\": 0.0\n}"
  },
  {
    "path": "eval/niah_pilot_step230000.json",
    "chars": 907,
    "preview": "{\n  \"niah_2048\": 0.25,\n  \"niah_2048_baseline_accuracy\": 0.25,\n  \"niah_2048_memorize_accuracy\": 0.25,\n  \"niah_2048_memori"
  },
  {
    "path": "eval/niah_pilot_teach05_long_step25000.json",
    "chars": 131,
    "preview": "{\n  \"niah_2048\": 0.25,\n  \"niah_4096\": 0.5,\n  \"niah_8192\": 0.375,\n  \"niah_16384\": 0.75,\n  \"niah_32768\": 0.75,\n  \"niah_655"
  },
  {
    "path": "eval/niah_pilot_teach05_step2000.json",
    "chars": 128,
    "preview": "{\n  \"niah_2048\": 0.5,\n  \"niah_4096\": 0.75,\n  \"niah_8192\": 1.0,\n  \"niah_16384\": 0.75,\n  \"niah_32768\": 0.25,\n  \"niah_65536"
  },
  {
    "path": "eval/niah_pilot_teach15_long_step25000.json",
    "chars": 132,
    "preview": "{\n  \"niah_2048\": 0.75,\n  \"niah_4096\": 0.625,\n  \"niah_8192\": 0.375,\n  \"niah_16384\": 0.75,\n  \"niah_32768\": 0.5,\n  \"niah_65"
  },
  {
    "path": "eval/niah_pilot_teach15_step2000.json",
    "chars": 129,
    "preview": "{\n  \"niah_2048\": 0.75,\n  \"niah_4096\": 0.75,\n  \"niah_8192\": 0.75,\n  \"niah_16384\": 0.5,\n  \"niah_32768\": 0.25,\n  \"niah_6553"
  },
  {
    "path": "eval/niah_smoke.json",
    "chars": 92,
    "preview": "{\n  \"niah_2048\": 0.6666666666666666,\n  \"niah_4096\": 0.0,\n  \"niah_8192\": 0.6666666666666666\n}"
  },
  {
    "path": "eval/niah_titan.json",
    "chars": 1253,
    "preview": "{\n  \"niah_2048\": 0.375,\n  \"niah_2048_baseline_accuracy\": 0.375,\n  \"niah_2048_memorize_accuracy\": 0.375,\n  \"niah_2048_mem"
  },
  {
    "path": "eval/niah_titan_relaunch_step001000.json",
    "chars": 970,
    "preview": "{\n  \"niah_2048\": 0.5,\n  \"niah_2048_baseline_accuracy\": 0.5,\n  \"niah_2048_memorize_accuracy\": 0.5,\n  \"niah_2048_memorize_"
  },
  {
    "path": "eval/niah_titan_step25000.json",
    "chars": 880,
    "preview": "{\n  \"niah_2048\": 0.125,\n  \"niah_2048_baseline_accuracy\": 0.125,\n  \"niah_2048_memorize_accuracy\": 0.125,\n  \"niah_2048_mem"
  },
  {
    "path": "eval/passkey_pilot.json",
    "chars": 546,
    "preview": "{\n  \"samples\": 64,\n  \"filler_sentences\": 256,\n  \"accuracy_base\": 0.4375,\n  \"accuracy_memorize\": 0.4375,\n  \"accuracy_delt"
  },
  {
    "path": "eval/passkey_pilot_step230000.json",
    "chars": 229,
    "preview": "{\n  \"samples\": 64,\n  \"filler_sentences\": 256,\n  \"accuracy_base\": 0.484375,\n  \"accuracy_memorize\": 0.484375,\n  \"accuracy_"
  },
  {
    "path": "eval/passkey_titan.json",
    "chars": 539,
    "preview": "{\n  \"samples\": 64,\n  \"filler_sentences\": 256,\n  \"accuracy_base\": 0.46875,\n  \"accuracy_memorize\": 0.46875,\n  \"accuracy_de"
  },
  {
    "path": "eval/passkey_titan_relaunch_step001000.json",
    "chars": 272,
    "preview": "{\n  \"samples\": 64,\n  \"filler_sentences\": 256,\n  \"accuracy_base\": 0.5,\n  \"accuracy_memorize\": 0.5,\n  \"accuracy_delta\": 0."
  },
  {
    "path": "eval/passkey_titan_step25000.json",
    "chars": 218,
    "preview": "{\n  \"samples\": 64,\n  \"filler_sentences\": 256,\n  \"accuracy_base\": 0.53125,\n  \"accuracy_memorize\": 0.546875,\n  \"accuracy_d"
  },
  {
    "path": "eval/pg19_pilot.json",
    "chars": 193,
    "preview": "{\n  \"samples\": 32,\n  \"tokens\": 65504,\n  \"ppl_base\": 285944896.0,\n  \"ppl_memorize\": 285944896.0,\n  \"ppl_delta\": 0.0,\n  \"m"
  },
  {
    "path": "eval/pg19_pilot_step230000.json",
    "chars": 114,
    "preview": "{\n  \"samples\": 4,\n  \"tokens\": 8188,\n  \"ppl_base\": 2497.421875,\n  \"ppl_memorize\": 2497.421875,\n  \"ppl_delta\": 0.0\n}"
  },
  {
    "path": "eval/pg19_titan.json",
    "chars": 190,
    "preview": "{\n  \"samples\": 32,\n  \"tokens\": 65504,\n  \"ppl_base\": 2449.884765625,\n  \"ppl_memorize\": 2449.884765625,\n  \"ppl_delta\": 0.0"
  },
  {
    "path": "eval/pg19_titan_relaunch_step001000.json",
    "chars": 204,
    "preview": "{\n  \"samples\": 32,\n  \"tokens\": 65504,\n  \"ppl_base\": 2.931641597034496e+17,\n  \"ppl_memorize\": 2.931641597034496e+17,\n  \"p"
  },
  {
    "path": "eval/pg19_titan_step25000.json",
    "chars": 137,
    "preview": "{\n  \"samples\": 4,\n  \"tokens\": 8188,\n  \"ppl_base\": 3122.819580078125,\n  \"ppl_memorize\": 3233.149658203125,\n  \"ppl_delta\":"
  },
  {
    "path": "eval/phase2_compare_smoke_lastlayer_metrics.json",
    "chars": 4707,
    "preview": "{\n  \"seed\": 0,\n  \"device\": \"cuda:1\",\n  \"tokenizer_path\": \"artifacts/tokenizer/refinedweb_mix/spm_32000_unigram.model\",\n "
  },
  {
    "path": "eval/zeroshot_full_smoke.json",
    "chars": 409,
    "preview": "{\n  \"piqa_accuracy\": 0.625,\n  \"piqa_samples\": 32,\n  \"hellaswag_accuracy\": 0.0,\n  \"hellaswag_samples\": 32,\n  \"winogrande_"
  },
  {
    "path": "eval/zeroshot_mid_stage2.json",
    "chars": 164,
    "preview": "{\n  \"piqa_accuracy\": 0.5,\n  \"piqa_samples\": 16,\n  \"hellaswag_accuracy\": 0.0,\n  \"hellaswag_samples\": 16,\n  \"winogrande_ac"
  },
  {
    "path": "eval/zeroshot_mid_stage2_smoke.json",
    "chars": 413,
    "preview": "{\n  \"piqa_accuracy\": 0.484375,\n  \"piqa_samples\": 64,\n  \"hellaswag_accuracy\": 0.0,\n  \"hellaswag_samples\": 64,\n  \"winogran"
  },
  {
    "path": "eval/zeroshot_mid_stage2_smoke_piqa_baseline.json",
    "chars": 51,
    "preview": "{\n  \"piqa_accuracy\": 0.5625,\n  \"piqa_samples\": 32\n}"
  },
  {
    "path": "eval/zeroshot_mid_stage2_smoke_piqa_mem.json",
    "chars": 51,
    "preview": "{\n  \"piqa_accuracy\": 0.5625,\n  \"piqa_samples\": 32\n}"
  },
  {
    "path": "eval/zeroshot_mid_stage2_ts10.json",
    "chars": 114,
    "preview": "{\n  \"piqa_accuracy\": 0.46875,\n  \"piqa_samples\": 32,\n  \"winogrande_accuracy\": 0.59375,\n  \"winogrande_samples\": 32\n}"
  },
  {
    "path": "eval/zeroshot_mid_stage2_ts10_single120_clip.json",
    "chars": 114,
    "preview": "{\n  \"piqa_accuracy\": 0.59375,\n  \"piqa_samples\": 32,\n  \"winogrande_accuracy\": 0.40625,\n  \"winogrande_samples\": 32\n}"
  },
  {
    "path": "eval/zeroshot_mid_stage2_ts10_single140_schedC.json",
    "chars": 116,
    "preview": "{\n  \"piqa_accuracy\": 0.546875,\n  \"piqa_samples\": 64,\n  \"winogrande_accuracy\": 0.484375,\n  \"winogrande_samples\": 64\n}"
  },
  {
    "path": "eval/zeroshot_mid_stage2_ts10_single220_schedD.json",
    "chars": 120,
    "preview": "{\n  \"piqa_accuracy\": 0.5078125,\n  \"piqa_samples\": 128,\n  \"winogrande_accuracy\": 0.4921875,\n  \"winogrande_samples\": 128\n}"
  },
  {
    "path": "eval/zeroshot_mid_stage2_ts10_single80.json",
    "chars": 114,
    "preview": "{\n  \"piqa_accuracy\": 0.46875,\n  \"piqa_samples\": 32,\n  \"winogrande_accuracy\": 0.59375,\n  \"winogrande_samples\": 32\n}"
  },
  {
    "path": "eval/zeroshot_mid_stage2_ts10_single80lr2e5.json",
    "chars": 114,
    "preview": "{\n  \"piqa_accuracy\": 0.46875,\n  \"piqa_samples\": 32,\n  \"winogrande_accuracy\": 0.59375,\n  \"winogrande_samples\": 32\n}"
  },
  {
    "path": "eval/zeroshot_mid_stage2_ts20.json",
    "chars": 114,
    "preview": "{\n  \"piqa_accuracy\": 0.46875,\n  \"piqa_samples\": 32,\n  \"winogrande_accuracy\": 0.59375,\n  \"winogrande_samples\": 32\n}"
  },
  {
    "path": "eval/zeroshot_mid_titan_baseline.json",
    "chars": 117,
    "preview": "{\n  \"piqa_accuracy\": 0.5078125,\n  \"piqa_samples\": 128,\n  \"winogrande_accuracy\": 0.4375,\n  \"winogrande_samples\": 128\n}"
  },
  {
    "path": "eval/zeroshot_pilot.json",
    "chars": 6417,
    "preview": "{\n  \"piqa_accuracy\": 0.5390625,\n  \"piqa_samples\": 256,\n  \"piqa_baseline_accuracy\": 0.53125,\n  \"piqa_memorize_accuracy\": "
  },
  {
    "path": "eval/zeroshot_pilot_cms_nochunk_step5000.json",
    "chars": 580,
    "preview": "{\n  \"piqa_accuracy\": 0.51953125,\n  \"piqa_samples\": 256,\n  \"hellaswag_accuracy\": 0.27734375,\n  \"hellaswag_samples\": 256,\n"
  },
  {
    "path": "eval/zeroshot_pilot_cms_sparse_step5000.json",
    "chars": 568,
    "preview": "{\n  \"piqa_accuracy\": 0.515625,\n  \"piqa_samples\": 256,\n  \"hellaswag_accuracy\": 0.2578125,\n  \"hellaswag_samples\": 256,\n  \""
  },
  {
    "path": "eval/zeroshot_pilot_debug.json",
    "chars": 282,
    "preview": "{\n  \"piqa_accuracy\": 0.59375,\n  \"piqa_samples\": 32,\n  \"piqa_baseline_accuracy\": 0.59375,\n  \"piqa_memorize_accuracy\": 0.5"
  },
  {
    "path": "eval/zeroshot_pilot_dummy_piqa.json",
    "chars": 47,
    "preview": "{\n  \"piqa_accuracy\": 0.0,\n  \"piqa_samples\": 2\n}"
  },
  {
    "path": "eval/zeroshot_pilot_opt_adamw_step5000.json",
    "chars": 233,
    "preview": "{\n  \"piqa_accuracy\": 0.55859375,\n  \"piqa_samples\": 256,\n  \"hellaswag_accuracy\": 0.2734375,\n  \"hellaswag_samples\": 256,\n "
  },
  {
    "path": "eval/zeroshot_pilot_opt_muon_step5000.json",
    "chars": 232,
    "preview": "{\n  \"piqa_accuracy\": 0.53125,\n  \"piqa_samples\": 256,\n  \"hellaswag_accuracy\": 0.3125,\n  \"hellaswag_samples\": 256,\n  \"wino"
  },
  {
    "path": "eval/zeroshot_pilot_selfmod_off_step5000.json",
    "chars": 576,
    "preview": "{\n  \"piqa_accuracy\": 0.515625,\n  \"piqa_samples\": 256,\n  \"hellaswag_accuracy\": 0.265625,\n  \"hellaswag_samples\": 256,\n  \"w"
  },
  {
    "path": "eval/zeroshot_pilot_step22000.json",
    "chars": 53,
    "preview": "{\n  \"piqa_accuracy\": 0.53125,\n  \"piqa_samples\": 128\n}"
  },
  {
    "path": "eval/zeroshot_pilot_step230000.json",
    "chars": 2523,
    "preview": "{\n  \"piqa_accuracy\": 0.515625,\n  \"piqa_samples\": 256,\n  \"piqa_baseline_accuracy\": 0.51171875,\n  \"piqa_memorize_accuracy\""
  },
  {
    "path": "eval/zeroshot_pilot_teach05_long_step25000.json",
    "chars": 578,
    "preview": "{\n  \"piqa_accuracy\": 0.5078125,\n  \"piqa_samples\": 256,\n  \"hellaswag_accuracy\": 0.28515625,\n  \"hellaswag_samples\": 256,\n "
  },
  {
    "path": "eval/zeroshot_pilot_teach05_step2000.json",
    "chars": 566,
    "preview": "{\n  \"piqa_accuracy\": 0.453125,\n  \"piqa_samples\": 128,\n  \"hellaswag_accuracy\": 0.2734375,\n  \"hellaswag_samples\": 128,\n  \""
  },
  {
    "path": "eval/zeroshot_pilot_teach15_long_step25000.json",
    "chars": 570,
    "preview": "{\n  \"piqa_accuracy\": 0.49609375,\n  \"piqa_samples\": 256,\n  \"hellaswag_accuracy\": 0.3046875,\n  \"hellaswag_samples\": 256,\n "
  },
  {
    "path": "eval/zeroshot_pilot_teach15_step2000.json",
    "chars": 569,
    "preview": "{\n  \"piqa_accuracy\": 0.484375,\n  \"piqa_samples\": 128,\n  \"hellaswag_accuracy\": 0.2578125,\n  \"hellaswag_samples\": 128,\n  \""
  },
  {
    "path": "eval/zeroshot_smoke.json",
    "chars": 48,
    "preview": "{\n  \"piqa_accuracy\": 0.5,\n  \"piqa_samples\": 16\n}"
  },
  {
    "path": "eval/zeroshot_titan.json",
    "chars": 6163,
    "preview": "{\n  \"piqa_accuracy\": 0.484375,\n  \"piqa_samples\": 256,\n  \"piqa_baseline_accuracy\": 0.51171875,\n  \"piqa_memorize_accuracy\""
  },
  {
    "path": "eval/zeroshot_titan_relaunch_step001000.json",
    "chars": 3158,
    "preview": "{\n  \"piqa_accuracy\": 0.5234375,\n  \"piqa_samples\": 256,\n  \"piqa_baseline_accuracy\": 0.5234375,\n  \"piqa_memorize_accuracy\""
  },
  {
    "path": "eval/zeroshot_titan_step25000.json",
    "chars": 2421,
    "preview": "{\n  \"piqa_accuracy\": 0.48828125,\n  \"piqa_samples\": 256,\n  \"piqa_baseline_accuracy\": 0.484375,\n  \"piqa_memorize_accuracy\""
  },
  {
    "path": "google_papers/Nested_Learning/Nested_Learning.json",
    "chars": 92994,
    "preview": "{\r\n  \"pages\": [\r\n    {\r\n      \"index\": 0,\r\n      \"markdown\": \"# Nested Learning: The Illusion of Deep Learning Architect"
  },
  {
    "path": "google_papers/Nested_Learning/Nested_Learning.md",
    "chars": 87399,
    "preview": "PAGE 1\r\n# Nested Learning: The Illusion of Deep Learning Architectures \r\n\r\nAli Behrouz<br>Google Research<br>USA<br>alib"
  },
  {
    "path": "google_papers/TITANs/TITANs.json",
    "chars": 125014,
    "preview": "{\r\n  \"pages\": [\r\n    {\r\n      \"index\": 0,\r\n      \"markdown\": \"# Titans: Learning to Memorize at Test Time \\n\\nAli Behrou"
  },
  {
    "path": "google_papers/TITANs/TITANs.md",
    "chars": 116758,
    "preview": "PAGE 1\r\n# Titans: Learning to Memorize at Test Time \r\n\r\nAli Behrouz ${ }^{\\dagger}$, Peilin Zhong ${ }^{\\dagger}$, and V"
  },
  {
    "path": "pyproject.toml",
    "chars": 1757,
    "preview": "[project]\nname = \"nested-learning\"\nversion = \"0.2.0\"\ndescription = \"Reproduction of Google's Nested Learning (HOPE) arch"
  },
  {
    "path": "reports/ablations.md",
    "chars": 17294,
    "preview": "# Planned Ablations – Pilot Run\n\nThis document tracks the ablation studies we intend to run once the 3 B-token pilot che"
  },
  {
    "path": "reports/cadence_mechanism_audit_smoke.json",
    "chars": 587,
    "preview": "{\n  \"ok\": true,\n  \"metric_prefix\": \"layer0.cms.cms_mid\",\n  \"log_path\": \"logs/mechanism_audit_smoke.json\",\n  \"flush_parti"
  },
  {
    "path": "reports/compliance_mechanism_audit_smoke.json",
    "chars": 1947,
    "preview": "{\n  \"config\": \"configs/pilot.yaml\",\n  \"overall_ok\": true,\n  \"checks\": [\n    {\n      \"name\": \"strict_streaming_contract_o"
  },
  {
    "path": "reports/compliance_summary_pilot.json",
    "chars": 1947,
    "preview": "{\n  \"config\": \"configs/pilot.yaml\",\n  \"overall_ok\": true,\n  \"checks\": [\n    {\n      \"name\": \"strict_streaming_contract_o"
  },
  {
    "path": "reports/compliance_summary_pilot_paper_faithful.json",
    "chars": 1242,
    "preview": "{\n  \"config\": \"configs/pilot_paper_faithful.yaml\",\n  \"overall_ok\": true,\n  \"checks\": [\n    {\n      \"name\": \"strict_strea"
  },
  {
    "path": "reports/next_backlog_scoped.md",
    "chars": 604,
    "preview": "# Next Backlog (Scoped, Non-Feature-Creep)\n\n1. Stabilize boundary-state mode for longer single-GPU runs (memory profilin"
  },
  {
    "path": "reports/security_release_gate.md",
    "chars": 1743,
    "preview": "# Security / Release Gate Log\n\nExecuted at: `2026-02-24T00:40:32Z` (UTC)\n\n## Commands Run\n- `rg -n --hidden --glob '!.gi"
  },
  {
    "path": "reports/sprint_completion_report.md",
    "chars": 1583,
    "preview": "# Sprint Completion Report (Mechanism Fidelity Focus)\n\nDate: `2026-02-24`\n\n## What Closed This Sprint\n\n- Boundary-state "
  },
  {
    "path": "reports/stage2_smoke.md",
    "chars": 4147,
    "preview": "# Stage 2 Smoke Artifact Summary\n\n## Hardware\n- 2× NVIDIA RTX 6000 Ada (49 GB VRAM each)\n- PyTorch 2.9.0 (LTS), CUDA 12."
  },
  {
    "path": "scripts/__init__.py",
    "chars": 52,
    "preview": "# Makes `scripts` a package for intra-eval imports.\n"
  },
  {
    "path": "scripts/checkpoint/verify.py",
    "chars": 595,
    "preview": "#!/usr/bin/env python\nfrom __future__ import annotations\n\nimport json\nfrom pathlib import Path\n\nimport typer\n\nfrom neste"
  },
  {
    "path": "scripts/checks/check_data_script_help.sh",
    "chars": 719,
    "preview": "#!/usr/bin/env bash\nset -euo pipefail\n\nROOT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")/../..\" && pwd)\"\ncd \"${ROOT_DIR}\"\n\n"
  },
  {
    "path": "scripts/checks/check_git_tracked_sizes.sh",
    "chars": 658,
    "preview": "#!/usr/bin/env bash\nset -euo pipefail\n\nMAX_BYTES=\"${MAX_TRACKED_FILE_BYTES:-5242880}\"  # 5 MiB default\nFORBIDDEN_EXT_REG"
  },
  {
    "path": "scripts/checks/check_readme_commands.sh",
    "chars": 284,
    "preview": "#!/usr/bin/env bash\nset -euo pipefail\n\n# Keep README's core CLI guidance executable in CI.\nuv run nl --help >/dev/null\nu"
  },
  {
    "path": "scripts/checks/compliance_report.py",
    "chars": 6474,
    "preview": "#!/usr/bin/env python\nfrom __future__ import annotations\n\nimport json\nfrom dataclasses import asdict, dataclass\nfrom pat"
  },
  {
    "path": "scripts/checks/run_fidelity_ci_subset.sh",
    "chars": 1211,
    "preview": "#!/usr/bin/env bash\nset -euo pipefail\n\nexport UV_LINK_MODE=\"${UV_LINK_MODE:-copy}\"\nexport UV_CACHE_DIR=\"${UV_CACHE_DIR:-"
  },
  {
    "path": "scripts/checks/tokenizer_coverage_guard.py",
    "chars": 3265,
    "preview": "#!/usr/bin/env python\nfrom __future__ import annotations\n\nimport json\nfrom pathlib import Path\nfrom typing import Option"
  },
  {
    "path": "scripts/checks/verify_docs_refs.py",
    "chars": 6223,
    "preview": "#!/usr/bin/env python3\nfrom __future__ import annotations\n\nimport argparse\nimport json\nimport re\nfrom pathlib import Pat"
  },
  {
    "path": "scripts/checks/verify_update_cadence.py",
    "chars": 4542,
    "preview": "#!/usr/bin/env python\nfrom __future__ import annotations\n\nimport argparse\nimport json\nfrom pathlib import Path\nfrom typi"
  },
  {
    "path": "scripts/compute/create_reservations.sh",
    "chars": 1101,
    "preview": "#!/usr/bin/env bash\nset -euo pipefail\n\n# Example Slurm reservations for Stage 2 (edit dates/times as needed).\n\nPARTITION"
  },
  {
    "path": "scripts/data/__init__.py",
    "chars": 64,
    "preview": "\"\"\"Data preparation scripts (tokenizer/filtering/sharding).\"\"\"\n\n"
  }
]

// ... and 137 more files (download for full content)

About this extraction

This page contains the full source code of the kmccleary3301/nested_learning GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 337 files (1.3 MB), approximately 371.3k tokens, and a symbol index with 723 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!