Repository: kyegomez/LFM
Branch: main
Commit: 0a6a50a78be8
Files: 45
Total size: 97.9 KB

Directory structure:
gitextract_o3qtge8m/

├── .github/
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE.yml
│   ├── dependabot.yml
│   ├── labeler.yml
│   └── workflows/
│       ├── code_quality_control.yml
│       ├── cos_integration.yml
│       ├── docs.yml
│       ├── docs_test.yml
│       ├── label.yml
│       ├── lints.yml
│       ├── pr_request_checks.yml
│       ├── pull-request-links.yml
│       ├── pylint.yml
│       ├── python-publish.yml
│       ├── quality.yml
│       ├── ruff.yml
│       ├── run_test.yml
│       ├── stale.yml
│       ├── test.yml
│       ├── testing.yml
│       ├── unit-test.yml
│       └── welcome.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── example.py
├── lfm_torch/
│   ├── __init__.py
│   ├── liquid_t_moe.py
│   ├── model.py
│   └── rnn.py
├── liquid_transformer_example.py
├── liquid_transformer_train.py
├── pyproject.toml
├── requirements.txt
├── research/
│   ├── bench.py
│   ├── sss_linear.py
│   ├── ssub.py
│   └── sub_linear.py
└── scripts/
    ├── code_quality.sh
    ├── merge_all_prs.sh
    ├── test_name.sh
    └── tests.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/FUNDING.yml
================================================
# These are supported funding model platforms

github: [kyegomez]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
custom: #Nothing


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a detailed report on the bug and its root cause. Conduct root cause error analysis
title: "[BUG] "
labels: bug
assignees: kyegomez

---

**Describe the bug**
A clear and concise description of what the bug is and what the main root cause error is. Test very thoroughly before submitting.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Additional context**
Add any other context about the problem here.


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: 'kyegomez'

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.yml
================================================
<!-- Thank you for contributing to Zeta!

Replace this comment with:
  - Description: a description of the change, 
  - Issue: the issue # it fixes (if applicable),
  - Dependencies: any dependencies required for this change,
  - Tag maintainer: for a quicker response, tag the relevant maintainer (see below),
  - Twitter handle: we announce bigger features on Twitter. If your PR gets announced and you'd like a mention, we'll gladly shout you out!

If you're adding a new integration, please include:
  1. a test for the integration, preferably unit tests that do not rely on network access,
  2. an example notebook showing its use.

Maintainer responsibilities:
  - nn / Misc / if you don't know who to tag: kye@apac.ai
  - tokenizers: kye@apac.ai
  - training / Prompts: kye@apac.ai
  - models: kye@apac.ai

If no one reviews your PR within a few days, feel free to email kye@apac.ai

See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/kyegomez/zeta
-->

================================================
FILE: .github/dependabot.yml
================================================
# https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/configuration-options-for-dependency-updates

version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"

  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"


================================================
FILE: .github/labeler.yml
================================================
# This is the config file for the GitHub Action labeler (actions/labeler).
# Each top-level key is a label name; the rules beneath it decide when that
# label is applied to a pull request.

# Add 'root' label to any root file changes
# Quotation marks are required for the leading asterisk
root:
- changed-files:
  - any-glob-to-any-file: '*'

# Add 'Documentation' label to any changes within the 'docs' folder or any of
# its subfolders, to files directly within the 'guides' folder, or to any .md
# file within the entire repository.
# All globs live under a single key: a YAML mapping must not repeat a key, so
# the label cannot be declared more than once in this file.
Documentation:
- changed-files:
  - any-glob-to-any-file:
    - docs/**
    - guides/*
    - '**/*.md'

# Add 'Tests' label to any file changes within 'tests' folder
Tests:
- changed-files:
  - any-glob-to-any-file: tests/*

# Add 'ghactions' label to any file changes within '.github' or '.github/workflows' folders
ghactions:
- changed-files:
  - any-glob-to-any-file:
    - .github/workflows/*
    - .github/*

# Add 'Scripts' label to any file changes within 'scripts' folder
Scripts:
- changed-files:
  - any-glob-to-any-file: scripts/*

# Add 'source' label to any change to src files within the source dir EXCEPT for the docs sub-folder
source:
- all:
  - changed-files:
    - any-glob-to-any-file: 'src/**/*'
    - all-globs-to-all-files: '!src/docs/*'

# Add 'feature' label to any PR where the head branch name starts with `feature` or has a `feature` section in the name
feature:
 - head-branch: ['^feature', 'feature']

# Add 'release' label to any PR that is opened against the `main` branch
release:
 - base-branch: 'main'


================================================
FILE: .github/workflows/code_quality_control.yml
================================================
name: Linting and Formatting

on:
  push:
    branches:
      - main

jobs:
  lint_and_format:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install --no-cache-dir -r requirements.txt

      # Reformat every Python file of the package in place with autopep8.
      # The package directory in this repository is lfm_torch.
      - name: Find Python files
        run: find lfm_torch -name "*.py" -type f -exec autopep8 --in-place --aggressive --aggressive {} +

      - name: Push changes
        uses: ad-m/github-push-action@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}

================================================
FILE: .github/workflows/cos_integration.yml
================================================
name: Continuous Integration

on:
  push:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install --no-cache-dir -r requirements.txt

      - name: Run unit tests
        run: pytest tests/unit

      - name: Run integration tests
        run: pytest tests/integration

      # Measure coverage of this repository's package (lfm_torch).
      - name: Run code coverage
        run: pytest --cov=lfm_torch tests/

      # Lint this repository's package (lfm_torch).
      - name: Run linters
        run: pylint lfm_torch

      - name: Build documentation
        run: make docs

      - name: Validate documentation
        run: sphinx-build -b linkcheck docs build/docs

      - name: Run performance tests
        run: pytest tests/performance

================================================
FILE: .github/workflows/docs.yml
================================================
name: Docs WorkFlow

on:
  push:
    branches:
      - master
      - main
      - develop
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - run: pip install mkdocs-material
      - run: pip install "mkdocstrings[python]"
      - run: mkdocs gh-deploy --force

================================================
FILE: .github/workflows/docs_test.yml
================================================
name: Documentation Tests

on:
  push:
    branches:
      - master

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install --no-cache-dir -r requirements.txt

      - name: Build documentation
        run: make docs

      - name: Validate documentation
        run: sphinx-build -b linkcheck docs build/docs

================================================
FILE: .github/workflows/label.yml
================================================
# This workflow will triage pull requests and apply a label based on the
# paths that are modified in the pull request.
#
# To use this workflow, you will need to set up a .github/labeler.yml
# file with configuration.  For more information, see:
# https://github.com/actions/labeler

name: Labeler
on: [pull_request_target]

jobs:
  label:

    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write

    steps:
    - uses: actions/labeler@v5.0.0
      with:
        repo-token: "${{ secrets.GITHUB_TOKEN }}"


================================================
FILE: .github/workflows/lints.yml
================================================
name: Linting

on:
  push:
    branches:
      - master

jobs:
  lint:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install --no-cache-dir -r requirements.txt

      # Lint this repository's package (lfm_torch).
      - name: Run linters
        run: pylint lfm_torch

================================================
FILE: .github/workflows/pr_request_checks.yml
================================================
name: Pull Request Checks

on:
  pull_request:
    branches:
      - master

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install --no-cache-dir -r requirements.txt

      # Run the test suite, then lint this repository's package (lfm_torch).
      - name: Run tests and checks
        run: |
          pytest tests/
          pylint lfm_torch

================================================
FILE: .github/workflows/pull-request-links.yml
================================================
name: readthedocs/actions
on:
  pull_request_target:
    types:
      - opened
    paths:
      - "docs/**"

permissions:
  pull-requests: write

jobs:
  pull-request-links:
    runs-on: ubuntu-latest
    steps:
      - uses: readthedocs/actions/preview@v1
        with:
          project-slug: swarms_torch

================================================
FILE: .github/workflows/pylint.yml
================================================
name: Pylint

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10"]
    steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --no-cache-dir --upgrade pip
        pip install pylint
    - name: Analysing the code with pylint
      run: |
        pylint $(git ls-files '*.py')


================================================
FILE: .github/workflows/python-publish.yml
================================================

name: Upload Python Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10'
    - name: Install dependencies
      run: |
        python -m pip install --no-cache-dir --upgrade pip
        pip install build
    - name: Build package
      run: python -m build
    - name: Publish package
      uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0
      with:
        user: __token__
        password: ${{ secrets.PYPI_API_TOKEN }}

================================================
FILE: .github/workflows/quality.yml
================================================
name: Quality

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  lint:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
    steps:
      - name: Checkout actions
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Init environment 
        uses: ./.github/actions/init-environment 
      # Lint only the Python files that differ between origin/main and HEAD
      # (deleted files are excluded by --diff-filter=d).
      - name: Run linter
        run: |
          # grep exits non-zero when nothing matches; "|| true" keeps that from
          # being treated as a failure. One file name is read per line so names
          # are not word-split.
          mapfile -t changed_files < <(
            git diff --name-only --diff-filter=d origin/main HEAD | grep -E '\.py$' || true
          )
          # Running pylint with no file arguments is an error, so skip it when
          # no Python file changed.
          if (( ${#changed_files[@]} > 0 )); then
            pylint "${changed_files[@]}"
          else
            echo "No changed Python files to lint."
          fi
        shell: bash

================================================
FILE: .github/workflows/ruff.yml
================================================
name: Ruff
on: [ push, pull_request ]
jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: chartboost/ruff-action@v1


================================================
FILE: .github/workflows/run_test.yml
================================================
name: Python application test

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        python-version: '3.10'
    - name: Install dependencies
      run: |
        python -m pip install --no-cache-dir --upgrade pip
        pip install pytest
        if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
    - name: Run tests with pytest
      run: |
        pytest tests/


================================================
FILE: .github/workflows/stale.yml
================================================
# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
#
# You can adjust the behavior by modifying this file.
# For more information, see:
# https://github.com/actions/stale
name: Mark stale issues and pull requests

on:
  schedule:
  - cron: '26 12 * * *'

jobs:
  stale:

    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write

    steps:
    - uses: actions/stale@v9
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        stale-issue-message: 'Stale issue message'
        stale-pr-message: 'Stale pull request message'
        stale-issue-label: 'no-issue-activity'
        stale-pr-label: 'no-pr-activity'

================================================
FILE: .github/workflows/test.yml
================================================
name: test

on:
  push:
    branches: [master]
  pull_request:
  workflow_dispatch:

env:
  POETRY_VERSION: "1.4.2"

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version:
          - "3.9"
          - "3.10"
          - "3.11"
        test_type:
          - "core"
          - "extended"
    name: Python ${{ matrix.python-version }} ${{ matrix.test_type }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: "./.github/actions/poetry_setup"
        with:
          python-version: ${{ matrix.python-version }}
          poetry-version: "1.4.2"
          cache-key: ${{ matrix.test_type }}
          install-command: |
              if [ "${{ matrix.test_type }}" == "core" ]; then
                echo "Running core tests, installing dependencies with poetry..."
                poetry install
              else
                echo "Running extended tests, installing dependencies with poetry..."
                poetry install -E extended_testing
              fi
      - name: Run ${{matrix.test_type}} tests
        run: |
          if [ "${{ matrix.test_type }}" == "core" ]; then
            make test
          else
            make extended_tests
          fi
        shell: bash

================================================
FILE: .github/workflows/testing.yml
================================================
name: Unit Tests

on:
  push:
    branches:
      - master

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install --no-cache-dir -r requirements.txt

      - name: Run unit tests
        run: pytest tests/

================================================
FILE: .github/workflows/unit-test.yml
================================================
name: build

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:

  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10'

    - name: Install dependencies
      run: pip install --no-cache-dir -r requirements.txt

    - name: Run Python unit tests
      run: python3 -m unittest tests/

    - name: Verify that the Docker image for the action builds
      run: docker build . --file Dockerfile

    - name: Verify integration test results
      run: python3 -m unittest tests/


================================================
FILE: .github/workflows/welcome.yml
================================================
name: Welcome WorkFlow

on:
  issues:
    types: [opened]
  pull_request_target:
    types: [opened]

jobs:
  build:
    name: 👋 Welcome
    permissions: write-all
    runs-on: ubuntu-latest
    steps:
      - uses: actions/first-interaction@v1.3.0
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          issue-message: "Hello there, thank you for opening an Issue ! 🙏🏻 The team was notified and they will get back to you asap."
          pr-message:  "Hello there, thank you for opening an PR ! 🙏🏻 The team was notified and they will get back to you asap."

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so
.vscode/
.vscode

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
.ruff_cache/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


================================================
FILE: .pre-commit-config.yaml
================================================
repos:
  - repo: https://github.com/ambv/black
    rev: 22.3.0
    hooks:
    - id: black
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    rev: 'v0.0.255'
    hooks:
      - id: ruff
        args: [--fix]
  - repo: https://github.com/nbQA-dev/nbQA
    rev: 1.6.3
    hooks:
    - id: nbqa-black
      additional_dependencies: [ipython==8.12, black]
    - id: nbqa-ruff 
      args: ["--ignore=I001"]
      additional_dependencies: [ipython==8.12, ruff]

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2023 Eternal Reclaimer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================

# Liquid Foundation Models [LFMs]

[![Join our Discord](https://img.shields.io/badge/Discord-Join%20our%20server-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/agora-999382051935506503) [![Subscribe on YouTube](https://img.shields.io/badge/YouTube-Subscribe-red?style=for-the-badge&logo=youtube&logoColor=white)](https://www.youtube.com/@kyegomez3242) [![Connect on LinkedIn](https://img.shields.io/badge/LinkedIn-Connect-blue?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/kye-g-38759a207/) [![Follow on X.com](https://img.shields.io/badge/X.com-Follow-1DA1F2?style=for-the-badge&logo=x&logoColor=white)](https://x.com/kyegomezb)

This is an attempt at an open-source implementation of LFMs. It is not the official repository, because the official implementation is closed source. The papers I am using as references are linked below.
[Discover more about the model from the original article](https://www.liquid.ai/liquid-foundation-models)

## Installation
```bash
$ pip3 install -U lfm-torch
```

## Usage

```python
import torch
from lfm_torch.model import LFModel
from loguru import logger

# Instantiate and test the model
if __name__ == "__main__":
    batch_size, seq_length, embedding_dim = 32, 128, 512
    token_dim, channel_dim, expert_dim, adapt_dim, num_experts = (
        embedding_dim,
        embedding_dim,
        embedding_dim,
        128,
        4,
    )
    model = LFModel(
        token_dim, channel_dim, expert_dim, adapt_dim, num_experts
    )

    input_tensor = torch.randn(
        batch_size, seq_length, embedding_dim
    )  # 3D text tensor
    output = model(input_tensor)
    logger.info("Model forward pass complete.")
```


## Liquid Transformer 
A novel neural architecture combining Liquid Neural Networks, Transformer attention mechanisms, and Mixture of Experts (MoE) for enhanced adaptive processing and dynamic state updates. Very experimental and early! We're working on a training script [here](./liquid_transformer_train.py). It still needs an actual tokenizer like llama's tokenizer but it's getting there. If you can help with this then let me know.


### Architecture Overview

```mermaid
flowchart TB
    subgraph "Liquid Transformer"
        Input["Input Sequence"] --> TL["Transformer Layer"]
        
        subgraph "Transformer Layer"
            direction TB
            MHA["Multi-Head Attention"] --> LC["Liquid Cell"]
            LC --> MOE["Mixture of Experts"]
            MOE --> LN["Layer Norm + Residual"]
        end
        
        subgraph "Liquid Cell Details"
            direction LR
            HS["Hidden State"] --> WH["W_h Linear"]
            Input2["Input"] --> WI["W_in Linear"]
            WH --> Add((+))
            WI --> Add
            Add --> Act["Activation"]
            Act --> LN2["LayerNorm"]
            LN2 --> DO["Dropout"]
        end
        
        subgraph "MoE Details"
            direction TB
            Input3["Input"] --> Gate["Gating Network"]
            Input3 --> E1["Expert 1"]
            Input3 --> E2["Expert 2"]
            Input3 --> E3["Expert N"]
            Gate --> Comb["Weighted Combination"]
            E1 --> Comb
            E2 --> Comb
            E3 --> Comb
        end
        
        TL --> Output["Output Sequence"]
    end
```


```python
import torch
from loguru import logger

from lfm_torch.liquid_t_moe import LiquidTransformer

# Example usage
if __name__ == "__main__":
    seq_len, batch_size, embed_size = 10, 2, 64
    num_heads, num_experts, expert_size, num_layers = 8, 4, 64, 6

    # Create the model
    model = LiquidTransformer(embed_size, num_heads, num_experts, expert_size, num_layers)

    # Example input tensor
    x = torch.randn(seq_len, batch_size, embed_size)

    # Forward pass
    output = model(x)
    logger.info(f"Model output shape: {output.shape}")
```


# Citations
- All credit for the liquid transformer architecture goes to the original authors from liquid.ai
- https://arxiv.org/abs/2209.12951

# License
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.


================================================
FILE: example.py
================================================
import torch
from lfm_torch.model import LFModel
from loguru import logger

# Build an LFModel and push one random batch through it as a smoke test.
if __name__ == "__main__":
    # Dimensions of the synthetic input batch.
    batch_size = 32
    seq_length = 128
    embedding_dim = 512

    # Token, channel and expert widths all reuse the embedding width.
    token_dim = channel_dim = expert_dim = embedding_dim
    adapt_dim = 128
    num_experts = 4

    model = LFModel(
        token_dim, channel_dim, expert_dim, adapt_dim, num_experts
    )

    # 3D text tensor: (batch_size, seq_length, embedding_dim)
    input_tensor = torch.randn(batch_size, seq_length, embedding_dim)
    output = model(input_tensor)
    logger.info("Model forward pass complete.")


================================================
FILE: lfm_torch/__init__.py
================================================
from lfm_torch.model import LFModel
from lfm_torch.liquid_t_moe import LiquidTransformer

__all__ = ["LFModel", "LiquidTransformer"]


================================================
FILE: lfm_torch/liquid_t_moe.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from loguru import logger

# from zeta import MixtureOfExperts, FeedForward

# Logging Configuration
# Registers a loguru sink that writes to the file "liquid_transformer.log",
# one "{time} {level} {message}" entry per record, with the sink's level set
# to DEBUG. This is a module-level statement, so it runs whenever this module
# is imported.
logger.add(
    "liquid_transformer.log",
    format="{time} {level} {message}",
    level="DEBUG",
)


class LiquidCell(nn.Module):
    """
    Recurrent "liquid" cell performing one non-linear hidden-state update.

    A call computes ``tanh(w_in(x) + w_h(h))``, optionally layer-normalizes
    the result, then applies dropout and returns it as the new hidden state.

    Args:
        input_size (int): Number of input features.
        hidden_size (int): Number of hidden-state features.
        dropout (float): Dropout probability applied to the new hidden state.
        layer_norm (bool): If True, a LayerNorm over ``hidden_size`` is applied
            after the activation.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        dropout: float = 0.1,
        layer_norm: bool = True,
    ):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropout)

        # Input-to-hidden and hidden-to-hidden projections.
        self.w_in = nn.Linear(input_size, hidden_size)
        self.w_h = nn.Linear(hidden_size, hidden_size)

        # None means "no normalization"; forward() checks for it.
        if layer_norm:
            self.layer_norm = nn.LayerNorm(hidden_size)
        else:
            self.layer_norm = None

        # Bounded activation keeps the recurrent state in (-1, 1) before norm.
        self.activation = nn.Tanh()

        logger.info(
            f"Initialized LiquidCell with input_size={input_size}, hidden_size={hidden_size}, dropout={dropout}"
        )

    def forward(self, x: Tensor, h: Tensor) -> Tensor:
        """
        Compute the next hidden state from the input and the previous state.

        Args:
            x (Tensor): Input of shape (batch_size, input_size).
            h (Tensor): Previous hidden state, broadcastable to
                (batch_size, hidden_size).

        Returns:
            Tensor: New hidden state of shape (batch_size, hidden_size).
        """
        logger.debug(
            f"Input shape: {x.shape}, Hidden state shape: {h.shape}"
        )

        # Combine the two projections, then squash.
        pre_activation = self.w_in(x) + self.w_h(h)
        new_h = self.activation(pre_activation)

        if self.layer_norm is not None:
            new_h = self.layer_norm(new_h)

        new_h = self.dropout(new_h)

        logger.debug(f"Updated hidden state shape: {new_h.shape}")
        return new_h

    def initialize_hidden_state(
        self, batch_size: int, device: torch.device
    ) -> Tensor:
        """
        Build an all-zero hidden state for a given batch size and device.

        Args:
            batch_size (int): Number of rows in the hidden state.
            device (torch.device): Device on which the tensor is created.

        Returns:
            Tensor: Zeros of shape (batch_size, hidden_size) on ``device``.
        """
        hidden_state = torch.zeros(
            batch_size, self.hidden_size, device=device
        )
        logger.info(
            f"Initialized hidden state of shape {hidden_state.shape} on {device}"
        )
        return hidden_state


class MixtureOfExperts(nn.Module):
    """
    Dense (soft-gated) mixture of linear experts.

    Every expert is a single ``nn.Linear(expert_size, output_size)``. A linear
    gate maps the same input to ``num_experts`` logits; the layer returns the
    softmax-weighted sum of all expert outputs (no top-k routing).

    Args:
        num_experts (int): Number of experts.
        expert_size (int): Input feature size consumed by the experts and gate.
        output_size (int): Output feature size produced by each expert.
    """

    def __init__(
        self, num_experts: int, expert_size: int, output_size: int
    ):
        super().__init__()
        expert_layers = []
        for _ in range(num_experts):
            expert_layers.append(nn.Linear(expert_size, output_size))
        self.experts = nn.ModuleList(expert_layers)
        self.gate = nn.Linear(expert_size, num_experts)

    def forward(self, x: Tensor) -> Tensor:
        """
        Mix the expert outputs for a batch of feature vectors.

        Args:
            x (Tensor): Input of shape (batch_size, expert_size).

        Returns:
            Tensor: Gate-weighted combination, shape (batch_size, output_size).
        """
        # Per-sample mixing weights over the experts (softmax along dim 1).
        gate_outputs = F.softmax(self.gate(x), dim=1)
        logger.debug(f"Gate outputs: {gate_outputs}")

        # Stack to (batch, num_experts, output_size).
        per_expert = []
        for expert in self.experts:
            per_expert.append(expert(x))
        expert_outputs = torch.stack(per_expert, dim=1)
        logger.debug(f"Expert outputs: {expert_outputs}")

        # Contract the expert axis: sum_e gate[b, e] * out[b, e, c].
        return torch.einsum("be,bec->bc", gate_outputs, expert_outputs)


class TransformerLayerWithLiquid(nn.Module):
    """
    Transformer-style block: self-attention, then a LiquidCell update on the
    pooled attention output, then a Mixture of Experts, then LayerNorm.

    Args:
        embed_size (int): Embedding width used by every sub-module.
        num_heads (int): Number of attention heads.
        num_experts (int): Number of experts in the MoE layer.
        expert_size (int): Accepted for interface compatibility; this block
            does not read it (the MoE is sized with ``embed_size``).
    """

    def __init__(
        self,
        embed_size: int,
        num_heads: int,
        num_experts: int,
        expert_size: int,
    ):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_size, num_heads)
        self.liquid_cell = LiquidCell(embed_size, embed_size)
        self.moe = MixtureOfExperts(
            num_experts=num_experts,
            expert_size=embed_size,
            output_size=embed_size,
        )
        self.layernorm = nn.LayerNorm(embed_size)

    def forward(self, x: Tensor, hidden_state: Tensor) -> Tensor:
        """
        Run the block on a sequence-first batch.

        Args:
            x (Tensor): Input of shape (seq_len, batch_size, embed_size).
            hidden_state (Tensor): Liquid-cell state, broadcastable to
                (batch_size, embed_size).

        Returns:
            Tensor: Block output with the same shape as ``x``. The updated
            hidden state is used internally only and is not returned.
        """
        logger.debug(
            f"Input shape to TransformerLayerWithLiquid: {x.shape}"
        )

        # Self-attention with x as query, key and value.
        attention_output, _ = self.attention(x, x, x)
        logger.debug(
            f"Attention output shape: {attention_output.shape}"
        )

        # Pool over dim 0 (the sequence axis) to get one vector per sample.
        pooled = attention_output.mean(dim=0)
        hidden_state = self.liquid_cell(pooled, hidden_state)
        logger.debug(
            f"Updated hidden state from LiquidCell: {hidden_state.shape}"
        )

        moe_output = self.moe(hidden_state)
        logger.debug(f"MoE output shape: {moe_output.shape}")

        # Broadcast the per-sample MoE vector over every sequence position,
        # add it to the attention output, and normalize.
        combined = attention_output + moe_output.unsqueeze(0)
        return self.layernorm(combined)


class LiquidTransformer(nn.Module):
    """
    Stack of TransformerLayerWithLiquid blocks sharing one initial hidden state.

    Args:
        embed_size (int): Size of embedding.
        num_heads (int): Number of attention heads.
        num_experts (int): Number of experts in each MoE layer.
        expert_size (int): Size of each expert (forwarded to every layer).
        num_layers (int): Number of transformer layers.

    Attributes:
        hidden_state (Tensor): Zero tensor of shape (1, embed_size) handed to
            every layer. It is a non-persistent buffer, so it follows
            ``model.to(...)`` / ``model.cuda()`` but is not written to the
            ``state_dict`` (existing checkpoints stay loadable). It may still
            be reassigned with ``model.hidden_state = tensor``.
    """

    def __init__(
        self,
        embed_size: int,
        num_heads: int,
        num_experts: int,
        expert_size: int,
        num_layers: int,
    ):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                TransformerLayerWithLiquid(
                    embed_size, num_heads, num_experts, expert_size
                )
                for _ in range(num_layers)
            ]
        )
        # A plain tensor attribute would be left behind on the CPU when the
        # module is moved to another device, causing a device mismatch in
        # forward(); a buffer moves together with the parameters.
        self.register_buffer(
            "hidden_state",
            torch.zeros(1, embed_size),
            persistent=False,
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass of the Liquid Transformer.

        Args:
            x (Tensor): Input tensor of shape (seq_len, batch_size, embed_size).

        Returns:
            Tensor: Output of the last layer (``x`` itself when there are no
            layers).
        """
        # Every layer receives the same stored hidden state; layers do not
        # return an updated state.
        for layer in self.layers:
            x = layer(x, self.hidden_state)
        return x


# # Example usage
# if __name__ == "__main__":
#     seq_len, batch_size, embed_size = 10, 2, 64
#     num_heads, num_experts, expert_size, num_layers = 8, 4, 64, 6

#     # Create the model
#     model = LiquidTransformer(embed_size, num_heads, num_experts, expert_size, num_layers)

#     # Example input tensor
#     x = torch.randn(seq_len, batch_size, embed_size)

#     # Forward pass
#     output = model(x)
#     logger.info(f"Model output shape: {output.shape}")


================================================
FILE: lfm_torch/model.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from loguru import logger
from typing import Optional, Tuple

class AdaptiveLinear(nn.Module):
    """
    Linear layer whose weight is shifted by an input-dependent delta.

    The effective weight is ``self.weight + adapt(adapt_input)`` reshaped to
    ``(out_features, in_features)``; the bias is not adapted.

    Args:
        in_features (int): Size of the last dimension of ``x``.
        out_features (int): Size of the last dimension of the output.
        adapt_dim (int): Size of the last dimension of ``adapt_input``.
    """

    def __init__(
        self, in_features: int, out_features: int, adapt_dim: int
    ):
        super(AdaptiveLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.weight = nn.Parameter(
            torch.randn(out_features, in_features)
        )
        self.bias = nn.Parameter(torch.randn(out_features))

        # Linear transformation for adapting the weight based on input
        self.adapt = nn.Linear(adapt_dim, out_features * in_features)

    def forward(
        self, x: torch.Tensor, adapt_input: torch.Tensor
    ) -> torch.Tensor:
        """
        Apply the adapted linear map.

        Args:
            x: Tensor whose last dimension is ``in_features``. With batched
                adaptation its leading dimension must be the batch, either
                kept (``(B, ..., in)``) or flattened batch-major
                (``(B * S, in)``).
            adapt_input: Either one adaptation vector (``(adapt_dim,)`` or
                ``(1, adapt_dim)``), shared by all of ``x``, or one vector per
                sample (``(B, adapt_dim)``), giving every sample its own weight.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by
            ``out_features``.

        Raises:
            ValueError: If ``x`` cannot be split evenly across the adaptation
                vectors.
        """
        weight_numel = self.out_features * self.in_features
        delta = self.adapt(adapt_input)

        if delta.numel() == weight_numel:
            # Single adaptation vector: one weight matrix for the whole input.
            weight = self.weight + delta.view(
                self.out_features, self.in_features
            )
            return F.linear(x, weight, self.bias)

        # One adaptation vector per sample. The previous unconditional
        # .view(out, in) raised here for any batch size greater than one.
        delta = delta.reshape(-1, self.out_features, self.in_features)
        num_groups = delta.shape[0]
        if (
            x.shape[-1] != self.in_features
            or x.numel() % (num_groups * self.in_features) != 0
        ):
            raise ValueError(
                f"Input of shape {tuple(x.shape)} cannot be split across "
                f"{num_groups} adaptation vectors with in_features="
                f"{self.in_features}"
            )

        weight = self.weight + delta  # (B, out, in)
        grouped = x.reshape(num_groups, -1, self.in_features)  # (B, N, in)
        out = torch.bmm(grouped, weight.transpose(1, 2)) + self.bias
        return out.reshape(*x.shape[:-1], self.out_features)


class TokenMixing(nn.Module):
    """
    Applies one adaptive linear layer to every token of the sequence.

    The input is flattened to ``(batch_size * seq_length, embedding_dim)``,
    passed through an ``AdaptiveLinear(token_dim, token_dim, adapt_dim)``, and
    restored to its 3D shape. The linear map acts on the embedding (last)
    dimension of each token.

    Args:
        token_dim (int): Embedding width of each token.
        adapt_dim (int): Width of the adaptation vector.
    """

    def __init__(self, token_dim: int, adapt_dim: int):
        super(TokenMixing, self).__init__()
        self.token_mixing = AdaptiveLinear(
            token_dim, token_dim, adapt_dim
        )

    def forward(
        self, x: torch.Tensor, adapt_input: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            x: Tensor of shape [batch_size, sequence_length, embedding_dim].
            adapt_input: Adaptation tensor forwarded unchanged to the
                adaptive layer.

        Returns:
            Tensor of shape [batch_size, sequence_length, embedding_dim].
        """
        batch_size, seq_length, embed_dim = x.shape
        # reshape (not view) so non-contiguous inputs, e.g. the result of a
        # transpose, are accepted; rows stay in batch-major order.
        flat = x.reshape(batch_size * seq_length, embed_dim)
        x_mixed = self.token_mixing(flat, adapt_input)
        return x_mixed.reshape(batch_size, seq_length, embed_dim)


class ChannelMixing(nn.Module):
    """
    Cross-channel mixing through a single adaptive linear layer.

    Wraps an ``AdaptiveLinear(channel_dim, channel_dim, adapt_dim)`` that acts
    on the embedding (last) dimension of the input.

    Args:
        channel_dim (int): Embedding width of the input and output.
        adapt_dim (int): Width of the adaptation vector.
    """

    def __init__(self, channel_dim: int, adapt_dim: int):
        super().__init__()
        self.channel_mixing = AdaptiveLinear(
            in_features=channel_dim,
            out_features=channel_dim,
            adapt_dim=adapt_dim,
        )

    def forward(
        self, x: torch.Tensor, adapt_input: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            x: Tensor of shape [batch_size, sequence_length, embedding_dim].
            adapt_input: Adaptation tensor forwarded to the adaptive layer.

        Returns:
            The adaptive layer's output for ``x``.
        """
        mixed = self.channel_mixing(x, adapt_input)
        return mixed


class MixtureOfExperts(nn.Module):
    """
    Soft mixture of adaptive linear experts.

    Every expert is an ``AdaptiveLinear(expert_dim, expert_dim, adapt_dim)``.
    Gate weights come from a softmax over ``gating(adapt_input)``, so all
    experts contribute to the output (no top-k selection).

    Args:
        expert_dim (int): Input and output width of each expert.
        num_experts (int): Number of experts.
        adapt_dim (int): Width of the adaptation vector.
    """

    def __init__(
        self, expert_dim: int, num_experts: int, adapt_dim: int
    ):
        super(MixtureOfExperts, self).__init__()
        self.experts = nn.ModuleList(
            [
                AdaptiveLinear(expert_dim, expert_dim, adapt_dim)
                for _ in range(num_experts)
            ]
        )
        self.gating = nn.Linear(adapt_dim, num_experts)

    def forward(
        self, x: torch.Tensor, adapt_input: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            x: Expert input, e.g. [batch_size, sequence_length, expert_dim]
                or [batch_size, expert_dim].
            adapt_input: Adaptation tensor; its last dimension is
                ``adapt_dim`` and its leading dimensions index samples.

        Returns:
            Gate-weighted sum of the expert outputs (same shape as one
            expert's output).
        """
        gate_scores = F.softmax(self.gating(adapt_input), dim=-1)

        # Start from 0 like the built-in sum() so that an empty expert list
        # still yields 0.
        output = 0
        for i, expert in enumerate(self.experts):
            expert_out = expert(x, adapt_input)
            gate = gate_scores[..., i]
            # Append singleton axes so the per-sample gate broadcasts over
            # every remaining axis of the expert output. A fixed
            # unsqueeze(1) only lines up with 2D expert outputs and fails
            # (or mis-broadcasts) for [batch, seq, dim] inputs.
            missing_axes = expert_out.dim() - gate.dim()
            gate = gate.reshape(tuple(gate.shape) + (1,) * missing_axes)
            output = output + gate * expert_out
        return output


class LFModel(nn.Module):
    """
    LF model: featurizer -> token mixing -> channel mixing -> MoE -> output.

    An adaptation vector is computed once per sample from the sequence mean
    and passed to every adaptive stage.
    Accepts 3D input tensor: [batch_size, sequence_length, embedding_dim].

    Args:
        token_dim (int): Embedding width of the input and of the final output.
        channel_dim (int): Width used by the channel-mixing stage.
        expert_dim (int): Width used by the experts.
        adapt_dim (int): Width of the adaptation vector.
        num_experts (int): Number of experts in the MoE stage.
    """

    def __init__(
        self,
        token_dim: int,
        channel_dim: int,
        expert_dim: int,
        adapt_dim: int,
        num_experts: int,
    ):
        super().__init__()
        self.featurizer = nn.Linear(token_dim, adapt_dim)
        self.token_mixer = TokenMixing(token_dim, adapt_dim)
        self.channel_mixer = ChannelMixing(channel_dim, adapt_dim)
        self.moe = MixtureOfExperts(expert_dim, num_experts, adapt_dim)
        self.output_layer = nn.Linear(expert_dim, token_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run all stages in order, logging the tensor shape after each one.

        Args:
            x: Tensor of shape [batch_size, sequence_length, embedding_dim].

        Returns:
            Output of the final linear layer.
        """
        logger.info("Input shape: {}", x.shape)

        # Unpacking raises ValueError unless x has exactly three dimensions.
        batch_size, seq_length, embed_dim = x.shape

        # Featurization: mean over the sequence axis, then project.
        sequence_mean = x.mean(dim=1)
        adapt_input = self.featurizer(sequence_mean)
        logger.info(
            "Featurization complete. Shape: {}", adapt_input.shape
        )

        token_mixed = self.token_mixer(x, adapt_input)
        logger.info(
            "Token mixing complete. Shape: {}", token_mixed.shape
        )

        channel_mixed = self.channel_mixer(token_mixed, adapt_input)
        logger.info(
            "Channel mixing complete. Shape: {}", channel_mixed.shape
        )

        expert_output = self.moe(channel_mixed, adapt_input)
        logger.info(
            "Mixture of Experts complete. Shape: {}",
            expert_output.shape,
        )

        output = self.output_layer(expert_output)
        logger.info("Output shape: {}", output.shape)
        return output


================================================
FILE: lfm_torch/rnn.py
================================================
import torch
import torch.nn as nn
from loguru import logger

# Module-level side effect: importing this module registers a loguru sink that
# writes INFO-and-above records to "liquid_neural_net.log", rotating the file
# once it reaches 500 MB.
logger.add("liquid_neural_net.log", rotation="500 MB", level="INFO")


class LiquidNeuron(nn.Module):
    """
    Leaky-integrator recurrent unit with a fixed time constant.

    Attributes:
        input_size (int): Size of the input.
        hidden_size (int): Size of the hidden state.
        tau (float): Blend factor between the previous state and the new
            candidate state.
    """

    def __init__(
        self, input_size: int, hidden_size: int, tau: float = 0.1
    ):
        """
        Create the neuron's parameters.

        Args:
            input_size (int): Size of the input.
            hidden_size (int): Size of the hidden state.
            tau (float): Time constant; larger values move the state faster
                towards the candidate state.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.tau = tau

        # Standard-normal weights and a zero bias.
        self.W_input = nn.Parameter(torch.randn(hidden_size, input_size))
        self.W_hidden = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))

        # Plain tensor attribute (neither parameter nor buffer); forward()
        # does not read it.
        self.state = torch.zeros(hidden_size)

    def forward(
        self, x: torch.Tensor, previous_state: torch.Tensor
    ) -> torch.Tensor:
        """
        Advance the neuron state by one step.

        Equation: s(t+1) = (1 - tau) * s(t) + tau * tanh(W_input * x(t) + W_hidden * s(t) + b)
        Reference: Hasani, Ramin, et al. "Liquid time-constant networks" (2021).

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, input_size).
            previous_state (torch.Tensor): State from the previous step,
                broadcastable to (batch_size, hidden_size).

        Returns:
            torch.Tensor: The updated state.
        """
        # Pre-activation drive from the input and the recurrent connection.
        drive = (
            torch.matmul(x, self.W_input.T)
            + torch.matmul(previous_state, self.W_hidden.T)
            + self.bias
        )
        candidate = torch.tanh(drive)

        # Blend: keep (1 - tau) of the old state, take tau of the candidate.
        retain = 1 - self.tau
        return retain * previous_state + self.tau * candidate


class LiquidRNN(nn.Module):
    """
    A recurrent neural network (RNN) built using liquid neurons.

    Attributes:
        input_size (int): Size of the input.
        hidden_size (int): Size of the hidden state.
        output_size (int): Size of the output (vocabulary size).
        hidden_state (Tensor): Zero initial state of shape (hidden_size,),
            kept as a non-persistent buffer so it follows ``model.to(...)``
            without appearing in the ``state_dict``.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        tau: float = 0.1,
    ):
        """
        Initialize the LiquidRNN with the given input size, hidden size, and output size.

        Args:
            input_size (int): Size of the input.
            hidden_size (int): Size of the hidden state.
            output_size (int): Size of the output (vocabulary size).
            tau (float): Time constant for neuron dynamics (controls neuron update speed).
        """
        super(LiquidRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size

        # Liquid neuron layer
        self.liquid_neuron = LiquidNeuron(
            input_size, hidden_size, tau
        )

        # Output layer
        self.output_layer = nn.Linear(hidden_size, output_size)

        # Initial hidden state. A plain tensor attribute would stay on the
        # CPU when the module is moved to another device.
        self.register_buffer(
            "hidden_state", torch.zeros(hidden_size), persistent=False
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the LiquidRNN.

        Processes each timestep sequentially, always starting from the stored
        zero hidden state.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, sequence_length, input_size).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, sequence_length, output_size).
        """
        batch_size, seq_len, _ = x.shape
        hidden_state = self.hidden_state

        logger.info(
            f"Starting forward pass with batch_size: {batch_size}, sequence_length: {seq_len}"
        )

        if seq_len == 0:
            # torch.stack cannot stack an empty list; return an empty result
            # with the documented shape instead.
            return x.new_zeros(batch_size, 0, self.output_size)

        outputs = []
        for t in range(seq_len):
            hidden_state = self.liquid_neuron(
                x[:, t, :], hidden_state
            )
            outputs.append(self.output_layer(hidden_state))

        return torch.stack(outputs, dim=1)

    def generate_text(
        self, start_token: torch.Tensor, max_len: int = 100
    ) -> str:
        """
        Greedily generate token ids, one step at a time.

        Each token id is fed to the network as a one-hot vector of width
        ``input_size``, the hidden state is carried from step to step, and
        the next token is the argmax over the output layer. Every id fed back
        must therefore be smaller than ``input_size`` (in practice
        ``input_size == output_size``).

        Args:
            start_token (torch.Tensor): Single-element tensor holding the
                first token id.
            max_len (int): Maximum length of the generated sequence.

        Returns:
            str: The generated token ids concatenated without a separator.

        Raises:
            ValueError: If a token id that must be fed to the network is
                outside ``[0, input_size)``.
        """
        generated_tokens = [start_token.item()]
        token_id = int(generated_tokens[0])
        # Batch of one; carried across steps so context accumulates.
        hidden_state = self.hidden_state.unsqueeze(0)

        logger.info(f"Generating text with max length {max_len}")

        # Generation needs no gradients.
        with torch.no_grad():
            for _ in range(max_len - 1):
                if not 0 <= token_id < self.input_size:
                    raise ValueError(
                        f"Token id {token_id} cannot be one-hot encoded "
                        f"with input_size={self.input_size}"
                    )
                step_input = hidden_state.new_zeros(1, self.input_size)
                step_input[0, token_id] = 1.0

                hidden_state = self.liquid_neuron(step_input, hidden_state)
                logits = self.output_layer(hidden_state)
                token_id = int(torch.argmax(logits, dim=-1).item())
                generated_tokens.append(token_id)

        return "".join(map(str, generated_tokens))


# Assuming the LiquidRNN class has been defined as shown earlier
# Here is a simple forward pass on CPU without using GPUs.


def cpu_forward_pass_example():
    """
    Build a small LiquidRNN, run one random batch through it on the CPU, and
    return the resulting output tensor.
    """
    logger.info("Starting forward pass on CPU...")

    # Demo sizes: feature width, hidden width and output (vocabulary) width.
    input_size, hidden_size, output_size = 128, 256, 128
    batch_size, sequence_length = 2, 10

    # The random input is drawn before the model is built.
    dummy_input = torch.randn(batch_size, sequence_length, input_size)

    # CPU is already the default device; the move is kept to be explicit.
    model = LiquidRNN(input_size, hidden_size, output_size)
    model = model.to(torch.device("cpu"))

    output = model(dummy_input)

    # Shape is (batch_size, sequence_length, output_size).
    logger.info(f"Output shape: {output.shape}")
    logger.info("Forward pass on CPU completed.")

    return output


# Run the CPU forward pass example
# NOTE(review): this call sits at module level, so it executes on every
# `import lfm_torch.rnn`, not only when the file is run as a script. Consider
# moving it under `if __name__ == "__main__":` once it is confirmed that
# nothing imports `output` from this module.
output = cpu_forward_pass_example()

# The output shape is reported through the logger, not printed.


================================================
FILE: liquid_transformer_example.py
================================================
import torch
from loguru import logger

from lfm_torch.liquid_t_moe import LiquidTransformer

# Example usage: build a LiquidTransformer and run one random batch through it.
if __name__ == "__main__":
    # Input layout is sequence-first: (seq_len, batch_size, embed_size).
    seq_len = 10
    batch_size = 2
    embed_size = 64

    # Model hyper-parameters.
    num_heads = 8
    num_experts = 4
    expert_size = 64
    num_layers = 6

    model = LiquidTransformer(
        embed_size,
        num_heads,
        num_experts,
        expert_size,
        num_layers,
    )

    x = torch.randn(seq_len, batch_size, embed_size)

    output = model(x)
    logger.info(f"Model output shape: {output.shape}")


================================================
FILE: liquid_transformer_train.py
================================================
import os
import torch
import torch.nn as nn
from torch.utils.data import IterableDataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from datasets import load_dataset
from transformers import AutoTokenizer
from typing import Dict, List, Optional, Tuple, Union, Iterator
from dataclasses import dataclass
from loguru import logger
import wandb
from tqdm.auto import tqdm
import numpy as np
from pathlib import Path
from lfm_torch.liquid_t_moe import LiquidTransformer

# Set tokenizer parallelism
# The variable is set at import time, before any tokenizer is created below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Configure logging
# Adds a loguru file sink: INFO-and-above records go to "training.log", the
# file rotates at 500 MB, and rotated files are kept for 10 days.
logger.add(
    "training.log",
    rotation="500 MB",
    retention="10 days",
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
)

@dataclass
class TrainingConfig:
    """
    Hyper-parameters and runtime settings for the training script.

    Every field has a default, so ``TrainingConfig()`` yields a complete
    configuration. The instance is passed to ``wandb.init`` via ``vars()``
    and stored inside each checkpoint.
    """
    
    # Model parameters
    embed_size: int = 768  # Match BERT embedding size
    num_heads: int = 8
    num_experts: int = 4
    expert_size: int = 768  # Match embed_size
    num_layers: int = 6
    
    # Training parameters
    batch_size: int = 16
    learning_rate: float = 1e-4
    max_steps: int = 100000  # also used as T_max of the cosine LR schedule
    warmup_steps: int = 1000  # NOTE(review): not read by the Trainer in this file
    max_grad_norm: float = 1.0  # gradient-norm clipping threshold
    weight_decay: float = 0.01
    
    # Data parameters
    max_length: int = 512  # tokenizer padding/truncation length
    vocab_size: int = 30522  # BERT vocab size
    
    # System parameters
    # The default is evaluated once, when the class body runs, not per instance.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_workers: int = 0  # Avoid multiprocessing issues with streaming
    seed: int = 42
    
    # Logging parameters
    wandb_project: str = "liquid-transformer"
    checkpoint_dir: str = "checkpoints"
    checkpoint_steps: int = 1000  # save a checkpoint every N steps
    log_steps: int = 10  # log averaged loss to wandb every N steps

class ArXivDataset(IterableDataset):
    """
    Endless stream of tokenized arXiv records.

    Wraps the streaming "train" split of the "neuralwork/arxiver" dataset and
    yields one padded/truncated encoding per record, restarting the stream
    whenever it is exhausted.
    """

    def __init__(
        self,
        tokenizer: AutoTokenizer,
        max_length: int = 512,
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.dataset = load_dataset(
            "neuralwork/arxiver",
            split="train",
            streaming=True,
        )
        logger.info("Initialized streaming dataset")

    def preprocess_text(self, text: str) -> str:
        """Strip surrounding whitespace and replace newlines with spaces."""
        trimmed = text.strip()
        return trimmed.replace('\n', ' ')

    def __iter__(self) -> Iterator[Dict[str, torch.Tensor]]:
        """
        Yield ``{"input_ids", "attention_mask"}`` dicts forever.

        Each value is the first row of the tokenizer's "pt" output for one
        record, padded or truncated to ``max_length``.
        """
        stream = iter(self.dataset)
        while True:
            try:
                record = next(stream)
                title = self.preprocess_text(record['title'])
                abstract = self.preprocess_text(record['abstract'])
                text = f"Title: {title} Abstract: {abstract}"

                encoded = self.tokenizer(
                    text,
                    max_length=self.max_length,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt"
                )

                # The tokenizer returns a batch of one; drop the batch axis.
                sample = {
                    "input_ids": encoded["input_ids"][0],
                    "attention_mask": encoded["attention_mask"][0]
                }
                yield sample
            except StopIteration:
                # Stream exhausted: start over from the beginning.
                stream = iter(self.dataset)
                continue

class Trainer:
    """
    Training loop for the Liquid Transformer.

    Owns the token embedding layer, the optimizer (over model and embedding
    parameters), a cosine learning-rate schedule, wandb logging and
    checkpointing. The objective is an MSE reconstruction of the embedded
    input from the model output.
    """

    def __init__(
        self,
        model: nn.Module,
        config: TrainingConfig,
        tokenizer: AutoTokenizer
    ):
        """
        Args:
            model: Network to train; moved to ``config.device``.
            config: Training configuration.
            tokenizer: Stored on the trainer; not used by the training loop.
        """
        self.model = model.to(config.device)
        self.config = config
        self.tokenizer = tokenizer

        # Initialize hidden state with one row per sample of a full batch.
        self.model.hidden_state = torch.zeros(
            config.batch_size,
            config.embed_size,
            device=config.device
        )

        # Create embedding layer for input tokens
        self.embedding = nn.Embedding(
            config.vocab_size,
            config.embed_size
        ).to(config.device)

        self.optimizer = AdamW(
            list(model.parameters()) + list(self.embedding.parameters()),
            lr=config.learning_rate,
            weight_decay=config.weight_decay
        )

        self.scheduler = CosineAnnealingLR(
            self.optimizer,
            T_max=config.max_steps
        )

        wandb.init(project=config.wandb_project, config=vars(config))
        os.makedirs(config.checkpoint_dir, exist_ok=True)
        logger.info("Trainer initialized successfully")

    def train_step(
        self,
        batch: Dict[str, torch.Tensor]
    ) -> float:
        """
        Perform a single optimization step.

        Args:
            batch: Mapping with an ``"input_ids"`` tensor of shape
                [batch_size, seq_len].

        Returns:
            The scalar loss of this step.

        Note:
            Padding positions are not masked; the model's forward pass takes
            no attention mask.
        """
        try:
            self.model.train()

            # Move batch to device
            input_ids = batch["input_ids"].to(self.config.device)

            # Convert input tokens to embeddings
            embedded_input = self.embedding(input_ids)  # [batch_size, seq_len, embed_size]

            # The model's attention layers take sequence-first 3D input
            # (seq_len, batch_size, embed_size). Adding a leading axis
            # instead would produce a 4D tensor, which
            # nn.MultiheadAttention rejects.
            embedded_input = embedded_input.transpose(0, 1)
            current_batch_size = embedded_input.size(1)

            # Update hidden state size if batch size changed
            if self.model.hidden_state.size(0) != current_batch_size:
                self.model.hidden_state = self.model.hidden_state.new_zeros(
                    current_batch_size,
                    self.config.embed_size
                )

            # Forward pass
            outputs = self.model(embedded_input)

            # Compute reconstruction loss
            loss = nn.MSELoss()(outputs, embedded_input)

            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                list(self.model.parameters()) + list(self.embedding.parameters()),
                self.config.max_grad_norm
            )

            self.optimizer.step()
            self.optimizer.zero_grad()

            return loss.item()

        except Exception as e:
            logger.error(f"Error in train_step: {str(e)}")
            raise

    def save_checkpoint(
        self,
        step: int,
        loss: Optional[float] = None,
    ):
        """
        Write model, embedding, optimizer and scheduler state to
        ``<checkpoint_dir>/checkpoint_step_<step>.pt``.

        Args:
            step: Global step used in the file name and stored in the file.
            loss: Last loss value; stored as ``inf`` when None.
        """
        checkpoint = {
            "step": step,
            "model_state_dict": self.model.state_dict(),
            "embedding_state_dict": self.embedding.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "scheduler_state_dict": self.scheduler.state_dict(),
            "loss": loss if loss is not None else float('inf'),
            "config": self.config
        }

        path = Path(self.config.checkpoint_dir)
        checkpoint_path = path / f"checkpoint_step_{step}.pt"
        torch.save(checkpoint, checkpoint_path)
        logger.info(f"Saved checkpoint at step {step} to {checkpoint_path}")

    def train(
        self,
        train_dataset: ArXivDataset,
    ):
        """
        Train until ``config.max_steps`` steps have run or the loader ends.

        A checkpoint is written every ``config.checkpoint_steps`` steps and
        once more when the loop exits for any reason (completion, Ctrl-C or
        an error). Errors other than KeyboardInterrupt are re-raised.
        """
        logger.info("Starting training")

        train_loader = DataLoader(
            train_dataset,
            batch_size=self.config.batch_size,
            num_workers=self.config.num_workers
        )

        global_step = 0
        running_loss = 0.0
        current_loss = None

        progress_bar = tqdm(total=self.config.max_steps, desc="Training")

        try:
            for batch in train_loader:
                if global_step >= self.config.max_steps:
                    break

                current_loss = self.train_step(batch)
                running_loss += current_loss
                global_step += 1

                # Update progress bar
                progress_bar.update(1)
                progress_bar.set_postfix({
                    "loss": f"{current_loss:.4f}",
                    "step": global_step
                })

                # Log metrics
                if global_step % self.config.log_steps == 0:
                    avg_loss = running_loss / self.config.log_steps
                    wandb.log({
                        "train_loss": avg_loss,
                        "learning_rate": self.scheduler.get_last_lr()[0],
                        "global_step": global_step
                    })
                    running_loss = 0.0

                # Save checkpoint if needed
                if global_step % self.config.checkpoint_steps == 0:
                    self.save_checkpoint(global_step, current_loss)

                # Update learning rate
                self.scheduler.step()

        except KeyboardInterrupt:
            # The checkpoint is written by the finally block below.
            logger.info("Training interrupted by user")
        except Exception as e:
            # The checkpoint is written by the finally block below.
            logger.error(f"Training error: {str(e)}")
            raise
        finally:
            progress_bar.close()
            # Single exit-time checkpoint covering every exit path. Saving in
            # the except branches as well would write the same file twice.
            self.save_checkpoint(global_step, current_loss)
            logger.info(f"Training completed after {global_step} steps")

def main():
    """Build the training pipeline from a default ``TrainingConfig`` and run it.

    Seeds torch and NumPy, loads the ``bert-base-uncased`` tokenizer, creates
    the dataset, model and trainer, then starts training. Any exception is
    logged and re-raised; ``wandb.finish()`` runs on every exit path.
    """
    try:
        cfg = TrainingConfig()

        # Seed both RNG sources from the configured seed
        torch.manual_seed(cfg.seed)
        np.random.seed(cfg.seed)

        # Tokenizer shared by the dataset and the trainer
        tok = AutoTokenizer.from_pretrained("bert-base-uncased")

        dataset = ArXivDataset(tokenizer=tok, max_length=cfg.max_length)

        # Model hyperparameters are taken straight from the config
        model_kwargs = dict(
            embed_size=cfg.embed_size,
            num_heads=cfg.num_heads,
            num_experts=cfg.num_experts,
            expert_size=cfg.expert_size,
            num_layers=cfg.num_layers,
        )
        network = LiquidTransformer(**model_kwargs)

        # Build the trainer and run the training loop
        Trainer(network, cfg, tok).train(dataset)

    except Exception as err:
        logger.error(f"Training failed with error: {str(err)}")
        raise
    finally:
        wandb.finish()

# Run training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "lfm-torch"
version = "0.0.3"
description = "lfm - Pytorch"
license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"]
homepage = "https://github.com/kyegomez/lfm"
documentation = "https://github.com/kyegomez/lfm"  # Points at the repository; update if dedicated docs are published.
readme = "README.md"  # README.md at the repository root.
repository = "https://github.com/kyegomez/lfm"
keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "License :: OSI Approved :: MIT License",
    # Keep in sync with `python = "^3.10"` under [tool.poetry.dependencies].
    "Programming Language :: Python :: 3.10"
]

[tool.poetry.dependencies]
python = "^3.10"
torch = "*"
loguru = "*"


[tool.poetry.group.lint.dependencies]
ruff = "^0.1.6"
types-toml = "^0.10.8.1"
types-redis = "^4.3.21.6"
types-pytz = "^2023.3.0.0"
black = "^23.1.0"
types-chardet = "^5.0.4.6"
mypy-protobuf = "^3.0.0"


[tool.autopep8]
max_line_length = 80
ignore = "E501,W6"  # or ["E501", "W6"]
in-place = true
recursive = true
aggressive = 3


[tool.ruff]
line-length = 70

[tool.black]
line-length = 70
target-version = ['py38']
preview = true


================================================
FILE: requirements.txt
================================================
torch


================================================
FILE: research/bench.py
================================================
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from loguru import logger
from typing import List, Tuple


class ScalingBenchmark:
    """
    Times forward passes of several models over growing input sizes and
    labels each model's empirical scaling as quadratic, linear or sub-linear.

    The label is derived from the slope of a straight-line fit to
    log(time) versus log(input size): a power law ``time ~ size**p`` has
    slope ``p`` on that plot.
    """

    # Slope thresholds on the log-log plot used to pick a label.
    QUADRATIC_MIN_SLOPE = 1.5
    LINEAR_MIN_SLOPE = 0.9

    def __init__(
        self,
        models: List[nn.Module],
        scaling_factor: float = 1.1,
        input_size_start: int = 16,
        num_tests: int = 10,
    ):
        """
        Initialize the benchmark.

        :param models: A list of models to test.
        :param scaling_factor: How much to increase input size each iteration.
        :param input_size_start: Starting size of input.
        :param num_tests: Number of tests to run.
        """
        logger.info(
            f"Initializing ScalingBenchmark with {len(models)} models"
        )

        self.models = models
        self.scaling_factor = scaling_factor
        self.input_size_start = input_size_start
        self.num_tests = num_tests

    def _generate_input(self, input_size: int) -> torch.Tensor:
        """
        Generates random input tensor of a given size.

        :param input_size: Size of the input tensor to generate.
        :return: Random tensor of shape (input_size, input_size).
        """
        logger.debug(f"Generating input tensor of size {input_size}")
        return torch.randn(input_size, input_size)

    def _test_model(
        self, model: nn.Module, input_size: int
    ) -> Tuple[float, float]:
        """
        Test a model with a specific input size and measure the forward pass time and output.

        Timing uses a wall-clock counter so it works without CUDA; when the
        model lives on a CUDA device the device is synchronized before and
        after the forward pass so queued kernels are included.

        :param model: The model to test.
        :param input_size: Size of the input tensor.
        :return: The time taken for the forward pass (milliseconds) and the model's output mean.
        """
        # Imported locally: this module has no file-level `time` import.
        import time

        logger.debug(f"Testing model with input size {input_size}")

        # Put the input on the same device as the model's parameters
        # (CPU when the model has no parameters).
        first_param = next(model.parameters(), None)
        device = (
            first_param.device
            if first_param is not None
            else torch.device("cpu")
        )
        input_tensor = self._generate_input(input_size).to(device)
        on_cuda = device.type == "cuda"

        model.eval()
        with torch.no_grad():
            if on_cuda:
                torch.cuda.synchronize(device)
            start = time.perf_counter()
            output = model(input_tensor)
            if on_cuda:
                # Waits for everything to finish running
                torch.cuda.synchronize(device)
            elapsed_time = (
                time.perf_counter() - start
            ) * 1000.0  # in milliseconds
            output_mean = output.mean().item()

        logger.debug(
            f"Model test completed: elapsed time {elapsed_time} ms, output mean {output_mean}"
        )

        return elapsed_time, output_mean

    def run_benchmark(self) -> None:
        """
        Run the scaling benchmark on all models.
        Categorizes the models as linear, quadratic, or sub-linear based on performance scaling.
        """
        logger.info("Starting benchmark tests")

        performance_data = {model: [] for model in self.models}

        for i in tqdm(range(self.num_tests), desc="Benchmarking"):
            current_input_size = int(
                self.input_size_start * (self.scaling_factor**i)
            )
            logger.info(
                f"Running test {i + 1}/{self.num_tests} with input size {current_input_size}"
            )

            for model in self.models:
                # Only the timing is needed for categorization.
                elapsed_time, _ = self._test_model(
                    model, current_input_size
                )
                performance_data[model].append(
                    (current_input_size, elapsed_time)
                )

        self._categorize_models(performance_data)

    def _categorize_models(self, performance_data: dict) -> None:
        """
        Categorize models based on how their performance scales with input size.

        Comparing residuals of a quadratic and a linear least-squares fit
        cannot separate the regimes (the quadratic fit is never worse), so
        the exponent is estimated from the log-log slope instead.

        :param performance_data: Dictionary mapping each model to a list of
            (input_size, elapsed_time) measurements.
        """
        logger.info("Categorizing models based on scaling behavior")

        for model, data in performance_data.items():
            model_name = model.__class__.__name__

            if len(data) < 2:
                logger.warning(
                    f"Model {model_name}: need at least 2 measurements to estimate scaling, got {len(data)}"
                )
                continue

            input_sizes, times = zip(*data)
            input_sizes = np.asarray(input_sizes, dtype=float)
            times = np.asarray(times, dtype=float)

            # log() is undefined for non-positive values; drop such points.
            usable = (input_sizes > 0) & (times > 0)
            input_sizes, times = input_sizes[usable], times[usable]
            if np.unique(input_sizes).size < 2:
                logger.warning(
                    f"Model {model_name}: not enough distinct positive measurements to estimate scaling"
                )
                continue

            # Slope of log(time) vs log(size) is the power-law exponent.
            slope = np.polyfit(
                np.log(input_sizes), np.log(times), 1
            )[0]

            logger.info(
                f"Model {model_name} fit results: log-log slope={slope:.4f}"
            )

            if slope > self.QUADRATIC_MIN_SLOPE:
                logger.success(
                    f"Model {model_name} scales quadratically."
                )
            elif slope >= self.LINEAR_MIN_SLOPE:
                logger.success(
                    f"Model {model_name} scales linearly."
                )
            else:
                logger.success(
                    f"Model {model_name} scales sub-linearly."
                )


================================================
FILE: research/sss_linear.py
================================================
import torch
import torch.nn as nn
from torch import Tensor
from loguru import logger
import time
from typing import List

logger.info("Setting up Sub-Sub-Linear LLM Model")


class SparseDynamicLayer(nn.Module):
    """
    Gating layer: each token is projected with a linear map, squashed with
    a sigmoid, and the resulting activations are passed through dropout.

    Attributes:
        input_dim (int): Size of each incoming token embedding.
        output_dim (int): Size of each produced embedding.
        dropout (nn.Dropout): Dropout applied to the sigmoid activations.
        fc (nn.Linear): Projection from input_dim to output_dim.
    """

    def __init__(
        self, input_dim: int, output_dim: int, dropout: float = 0.1
    ):
        super().__init__()
        self.input_dim, self.output_dim = input_dim, output_dim
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(
            in_features=input_dim, out_features=output_dim
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply projection, sigmoid and dropout to the input.

        Args:
            x (Tensor): Input tensor of shape (batch_size, sequence_length, input_dim).

        Returns:
            Tensor: Activations of shape (batch_size, sequence_length, output_dim).
        """
        # Sigmoid of the projection, one value per output feature
        gate = self.fc(x).sigmoid()
        selected_tokens = self.dropout(gate)

        total_tokens = x.shape[1]
        logger.info(
            f"Selected {selected_tokens.sum()} tokens for processing out of {total_tokens} total tokens."
        )
        return selected_tokens


class HierarchicalSubstructureLayer(nn.Module):
    """
    Splits the sequence into two halves, passes each half through the same
    linear layer, and concatenates the results back along the sequence axis.

    Attributes:
        input_dim (int): The input (and output) embedding dimension.
        fc (nn.Linear): Linear map shared by both halves.
    """

    def __init__(self, input_dim: int):
        super().__init__()
        self.input_dim = input_dim
        self.fc = nn.Linear(
            in_features=input_dim, out_features=input_dim
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Process the two halves of the sequence independently.

        Args:
            x (Tensor): Input tensor of shape (batch_size, sequence_length, input_dim).

        Returns:
            Tensor: Tensor of the same shape as ``x``.
        """
        _, seq_len, _ = x.size()
        logger.info(
            f"Processing {seq_len} tokens into hierarchical substructures."
        )

        # First half gets seq_len // 2 tokens, second half the remainder
        midpoint = seq_len // 2
        halves = (x[:, :midpoint, :], x[:, midpoint:, :])

        # Same linear layer on each half, then stitch back together
        return torch.cat([self.fc(half) for half in halves], dim=1)


class ProbabilisticMemoryCompressionLayer(nn.Module):
    """
    Applies a single linear projection to every token, mapping the feature
    dimension from input_dim to output_dim.

    Attributes:
        input_dim (int): The input embedding dimension.
        output_dim (int): The output embedding dimension (should match hidden_dim of next layer).
        fc (nn.Linear): The projection itself.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        self.input_dim, self.output_dim = input_dim, output_dim
        # Projects straight to the requested output size
        self.fc = nn.Linear(
            in_features=input_dim, out_features=output_dim
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Project the input features and log the token counts before and after.

        Args:
            x (Tensor): Input tensor of shape (batch_size, sequence_length, input_dim).

        Returns:
            Tensor: Projected tensor of shape (batch_size, sequence_length, output_dim).
        """
        _, seq_len, _ = x.shape
        logger.info(f"Compressing memory from {seq_len} tokens.")

        # Only the last (feature) dimension changes size
        compressed = self.fc(x)
        tokens_after = compressed.shape[1]

        logger.info(f"Memory compressed to {tokens_after} tokens.")
        return compressed


class SubSubLinearLLM(nn.Module):
    """
    Stack of three token-wise stages (sparse gating, hierarchical processing,
    compression) followed by mean pooling over the sequence and a final
    linear projection.

    Attributes:
        sparse_layer (SparseDynamicLayer): input_dim -> hidden_dim.
        hierarchical_layer (HierarchicalSubstructureLayer): hidden_dim -> hidden_dim.
        compression_layer (ProbabilisticMemoryCompressionLayer): hidden_dim -> hidden_dim.
        fc_output (nn.Linear): hidden_dim -> output_dim.
    """

    def __init__(
        self, input_dim: int, hidden_dim: int, output_dim: int
    ):
        super().__init__()
        self.sparse_layer = SparseDynamicLayer(input_dim, hidden_dim)
        self.hierarchical_layer = HierarchicalSubstructureLayer(
            hidden_dim
        )
        # Keeps the feature size at hidden_dim so fc_output can consume it
        self.compression_layer = ProbabilisticMemoryCompressionLayer(
            hidden_dim, hidden_dim
        )
        self.fc_output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: Tensor) -> Tensor:
        """
        Run the three stages in order, pool over the sequence, and project.

        Args:
            x (Tensor): Input tensor of shape (batch_size, sequence_length, input_dim).

        Returns:
            Tensor: Final output tensor of shape (batch_size, output_dim).
        """
        stages = (
            self.sparse_layer,  # Step 1: sparse dynamic selection
            self.hierarchical_layer,  # Step 2: hierarchical processing
            self.compression_layer,  # Step 3: memory compression
        )
        for stage in stages:
            x = stage(x)

        # Mean over dim=1 collapses the sequence: (batch_size, hidden_dim)
        pooled = x.mean(dim=1)
        return self.fc_output(pooled)


import matplotlib.pyplot as plt
from loguru import logger
import numpy as np
from scipy.stats import linregress


def benchmark_model(
    model: nn.Module,
    input_dim: int,
    seq_lengths: List[int],
    batch_size: int = 32,
    runs: int = 5,
):
    """
    Benchmark the model on different sequence lengths and log the results.

    Each forward pass is timed with ``time.perf_counter`` (a monotonic,
    high-resolution clock) so that short passes do not come out as 0.0,
    which would break a later log-log fit of the timings.

    Args:
        model (nn.Module): The model to benchmark. It is switched to eval mode.
        input_dim (int): Input dimensionality.
        seq_lengths (List[int]): List of sequence lengths to test.
        batch_size (int): Batch size for testing.
        runs (int): Number of runs to average for each sequence length.

    Returns:
        dict: A dictionary with sequence lengths as keys and average times
            (seconds) as values.

    Raises:
        ValueError: If ``runs`` is smaller than 1 (no average can be formed).
    """
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    model.eval()
    times = []

    for seq_len in seq_lengths:
        logger.info(f"Benchmarking sequence length {seq_len}")

        # Generate random input for the given sequence length
        x = torch.randn(batch_size, seq_len, input_dim)

        # Measure time for several runs and average
        elapsed_times = []
        with torch.no_grad():
            for _ in range(runs):
                start = time.perf_counter()
                model(x)
                elapsed_times.append(time.perf_counter() - start)

        avg_time = np.mean(elapsed_times)
        times.append(avg_time)

        logger.info(
            f"Average time for sequence length {seq_len}: {avg_time:.6f} seconds"
        )

    return dict(zip(seq_lengths, times))


def detect_scaling_regime(
    seq_lengths: List[int], times: List[float]
) -> float:
    """
    Estimate the scaling exponent as the slope of log(times) against
    log(seq_lengths), obtained by linear regression.

    Args:
        seq_lengths (List[int]): Sequence lengths.
        times (List[float]): Times corresponding to each sequence length.

    Returns:
        float: The slope of the log-log plot indicating the scaling regime.
    """
    # Regress log-time on log-length; only the slope is used
    fit = linregress(np.log(seq_lengths), np.log(times))
    slope = fit.slope

    logger.info(f"Slope of the log-log plot: {slope:.4f}")
    return slope


def plot_benchmark_results(results: dict, slope: float):
    """
    Draw average time against sequence length on log-log axes and show it.

    Args:
        results (dict): A dictionary with sequence lengths as keys and average times as values.
        slope (float): The slope of the log-log plot; shown in the legend.
    """
    xs = list(results)
    ys = list(results.values())
    legend_label = f"Slope: {slope:.2f}"

    plt.figure(figsize=(10, 6))
    plt.plot(xs, ys, marker="o", label=legend_label)

    # Titles and axis labels
    plt.title("Model Benchmark: Time vs Sequence Length")
    plt.xlabel("Sequence Length")
    plt.ylabel("Average Time (seconds)")
    plt.grid(True)

    # Log-log axes turn power-law relationships into straight lines
    plt.xscale("log")
    plt.yscale("log")

    plt.legend()
    plt.show()

    logger.info("Benchmark plot generated.")


if __name__ == "__main__":
    # Model dimensions
    input_dim, hidden_dim, output_dim = 512, 256, 128

    # Sequence lengths to sweep: 128 doubled four times (128 ... 2048)
    seq_lengths = [128 * 2**i for i in range(5)]
    batch_size = 32
    runs = 5  # Each sequence length is averaged over 5 runs

    model = SubSubLinearLLM(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        output_dim=output_dim,
    )

    # Time the model at every sequence length
    benchmark_results = benchmark_model(
        model, input_dim, seq_lengths, batch_size, runs
    )

    # Pull the lengths and timings back out of the result mapping
    seq_lengths = list(benchmark_results.keys())
    times = list(benchmark_results.values())

    # Slope of the log-log plot
    slope = detect_scaling_regime(seq_lengths, times)

    # Show the plot before reporting the regime
    plot_benchmark_results(benchmark_results, slope)

    # Map the slope onto a named regime; anything below 0.5 (or a
    # non-comparable slope) falls through to the last label
    if slope > 1.5:
        regime = "quadratically"
    elif slope >= 0.9:
        regime = "linearly"
    elif slope >= 0.5:
        regime = "sub-linearly"
    else:
        regime = "sub-sub-linearly"

    logger.info(
        f"The model scales **{regime}** (slope: {slope:.2f})"
    )


================================================
FILE: research/ssub.py
================================================
# research/ssub.py — Hierarchical Compressed Encoding Network (HCEN)

from typing import Optional

import torch
import torch.nn as nn
from loguru import logger
from torch import Tensor

logger.add("hcen.log", rotation="1 MB")  # Log file configuration


class EncodingFunction(nn.Module):
    """
    Encoding function f: averages a sequence over its length and projects
    the average to a fixed-size vector, so sequences of any length map to
    the same output dimension.
    """

    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.encoder = nn.Linear(
            in_features=input_dim, out_features=hidden_dim
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Mean-pool over the sequence axis, then apply the linear encoder.

        Args:
            x (Tensor): Input tensor of shape (batch_size, seq_len, input_dim).

        Returns:
            Tensor: Encoded tensor of shape (batch_size, hidden_dim).
        """
        pooled = torch.mean(x, dim=1)  # (batch_size, input_dim)
        return self.encoder(pooled)  # (batch_size, hidden_dim)


class ImportanceScoring(nn.Module):
    """
    Importance scoring function I(C_l): assigns one scalar score to each
    compressed representation via a linear layer.
    """

    def __init__(self, hidden_dim: int):
        super().__init__()
        self.scorer = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x: Tensor) -> Tensor:
        """
        Score every compressed representation.

        Args:
            x (Tensor): Tensor of shape (batch_size, num_segments, hidden_dim).

        Returns:
            Tensor: Importance scores of shape (batch_size, num_segments).
        """
        raw_scores = self.scorer(x)  # (batch_size, num_segments, 1)
        # Drop the trailing singleton dimension
        return torch.squeeze(raw_scores, dim=-1)


class AggregationFunction(nn.Module):
    """
    Aggregation function g: merges two compressed representations into one
    by concatenating them and projecting back to hidden_dim.
    """

    def __init__(self, hidden_dim: int):
        super().__init__()
        self.aggregator = nn.Linear(
            in_features=2 * hidden_dim, out_features=hidden_dim
        )

    def forward(self, x1: Tensor, x2: Tensor) -> Tensor:
        """
        Merge a pair of representations.

        Args:
            x1 (Tensor): Tensor of shape (batch_size, hidden_dim).
            x2 (Tensor): Tensor of shape (batch_size, hidden_dim).

        Returns:
            Tensor: Aggregated tensor of shape (batch_size, hidden_dim).
        """
        # Side-by-side along the feature axis: (batch_size, hidden_dim * 2)
        pair = torch.cat((x1, x2), dim=-1)
        return self.aggregator(pair)  # (batch_size, hidden_dim)


class OutputFunction(nn.Module):
    """
    Output function h: linear map from the root compressed representation
    to the final output.
    """

    def __init__(self, hidden_dim: int, output_dim: int):
        super().__init__()
        self.output_layer = nn.Linear(
            in_features=hidden_dim, out_features=output_dim
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Project the root representation to the output space.

        Args:
            x (Tensor): Root compressed representation of shape (batch_size, hidden_dim).

        Returns:
            Tensor: Final output tensor of shape (batch_size, output_dim).
        """
        return self.output_layer(x)  # (batch_size, output_dim)


class HCEN(nn.Module):
    """
    Hierarchical Compressed Encoding Network (HCEN).

    The input sequence is cut into segments and every segment is encoded to
    a fixed-size vector. The vectors are then reduced level by level: the
    top-``k`` vectors by importance score are kept and merged pairwise, until
    a single root vector remains, which is mapped to the output.

    Args:
        input_dim (int): Feature size of the input tokens.
        hidden_dim (int): Size of every compressed representation.
        output_dim (int): Size of the final output.
        k (int): Number of segments to select at each level.
        segment_size (Optional[int]): Tokens per initial segment. ``None``
            (the default) treats the whole sequence as one segment; the root
            is then just the encoding of the full sequence and no selection
            or aggregation takes place.

    Raises:
        ValueError: If ``segment_size`` is given but not positive, or if
            segmentation is enabled and ``k`` is smaller than 1.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        k: int,
        segment_size: Optional[int] = None,
    ):
        super(HCEN, self).__init__()
        if segment_size is not None:
            if segment_size < 1:
                raise ValueError(
                    f"segment_size must be positive, got {segment_size}"
                )
            if k < 1:
                # top-0 selection would leave nothing to aggregate
                raise ValueError(f"k must be at least 1, got {k}")

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.k = k  # Number of segments to select at each level
        self.segment_size = segment_size
        self.encoding_function = EncodingFunction(
            input_dim, hidden_dim
        )
        self.importance_scoring = ImportanceScoring(hidden_dim)
        self.aggregation_function = AggregationFunction(hidden_dim)
        self.output_function = OutputFunction(hidden_dim, output_dim)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass of HCEN.

        Args:
            x (Tensor): Input tensor of shape (batch_size, seq_len, input_dim).

        Returns:
            Tensor: Final output tensor of shape (batch_size, output_dim).

        Raises:
            ValueError: If ``x`` is not 3-D or its last dimension is not
                ``input_dim``.
        """
        if x.dim() != 3 or x.size(-1) != self.input_dim:
            raise ValueError(
                f"Unexpected segment size: {x.size()}"
            )
        logger.info(f"Input shape: {x.shape}")

        # Cut the sequence into raw segments along the token axis.
        if self.segment_size is None:
            raw_segments = [x]
        else:
            raw_segments = list(
                torch.split(x, self.segment_size, dim=1)
            )

        # Raw segments are encoded exactly once, here. Later levels only
        # ever hold encoded vectors, so no shape-based guessing is needed
        # (which would misfire when input_dim == hidden_dim).
        compressed_reps = torch.stack(
            [self.encoding_function(seg) for seg in raw_segments],
            dim=1,
        )  # Shape: (batch_size, num_segments, hidden_dim)

        level = 0
        while True:
            num_segments = compressed_reps.size(1)
            logger.info(
                f"Processing level {level} with {num_segments} segments"
            )

            # If only one compressed representation remains, we can stop
            if num_segments == 1:
                root_representation = compressed_reps.squeeze(
                    1
                )  # Shape: (batch_size, hidden_dim)
                break

            # Compute importance scores
            importance_scores = self.importance_scoring(
                compressed_reps
            )  # Shape: (batch_size, num_segments)
            logger.debug(
                f"Importance scores shape: {importance_scores.shape}"
            )

            # Select top-k segments based on importance scores
            k = min(self.k, num_segments)
            _, indices = torch.topk(importance_scores, k, dim=1)
            logger.info(f"Selected top-{k} segments at level {level}")

            # Gather the selected vectors; the index tensor comes from topk
            # and therefore already lives on the same device as the data.
            gather_index = indices.unsqueeze(-1).expand(
                -1, -1, compressed_reps.size(-1)
            )
            selected_reps = torch.gather(
                compressed_reps, 1, gather_index
            )  # Shape: (batch_size, k, hidden_dim)

            # Aggregate selected representations pairwise
            aggregated_reps = []
            for i in range(0, k, 2):
                left = selected_reps[:, i, :]
                if i + 1 < k:
                    right = selected_reps[:, i + 1, :]
                    aggregated_reps.append(
                        self.aggregation_function(left, right)
                    )
                else:
                    # Odd count: carry the last one forward unchanged
                    aggregated_reps.append(left)

            # ceil(k / 2) < num_segments whenever num_segments >= 2, so the
            # loop is guaranteed to reach a single root.
            compressed_reps = torch.stack(aggregated_reps, dim=1)
            level += 1

        # Final output
        output = self.output_function(
            root_representation
        )  # Shape: (batch_size, output_dim)
        logger.info(f"Output shape: {output.shape}")
        return output


# test_hcen.py

# import torch
# # from hcen import HCEN
# import time
# import matplotlib.pyplot as plt

# # def test_hcen_sublinear_scaling():
# #     """
# #     Test the HCEN model to verify sub-linear computational complexity.
# #     """
# #     input_dim = 128
# #     hidden_dim = 64
# #     output_dim = 10
# #     k = 5  # Number of segments to select at each level
# #     batch_size = 32

# #     sequence_lengths = [2 ** i for i in range(5, 15)]  # Sequence lengths from 32 to 16384
# #     times = []

# #     for seq_len in sequence_lengths:
# #         model = HCEN(input_dim, hidden_dim, output_dim, k)
# #         x = torch.randn(batch_size, seq_len, input_dim)

# #         start_time = time.time()
# #         output = model(x)
# #         end_time = time.time()

# #         elapsed_time = end_time - start_time
# #         times.append(elapsed_time)
# #         print(f"Sequence Length: {seq_len}, Time Taken: {elapsed_time:.6f} seconds")

# #     # Plotting the results
# #     plt.figure(figsize=(10, 6))
# #     plt.plot(sequence_lengths, times, marker='o')
# #     plt.xlabel('Sequence Length (N)')
# #     plt.ylabel('Time Taken (seconds)')
# #     plt.title('HCEN Computational Time vs Sequence Length')
# #     plt.xscale('log')
# #     plt.yscale('log')
# #     plt.grid(True)
# #     plt.show()

# # if __name__ == "__main__":
# #     test_hcen_sublinear_scaling()


# # # Transformer Model (Quadratic Scaling)
# # class TransformerModel(nn.Module):
# #     def __init__(self, input_dim: int, num_heads: int, num_layers: int, output_dim: int):
# #         super(TransformerModel, self).__init__()
# #         self.transformer = nn.Transformer(
# #             d_model=input_dim,
# #             nhead=num_heads,
# #             num_encoder_layers=num_layers,
# #             num_decoder_layers=num_layers,
# #             dim_feedforward=4 * input_dim,
# #             batch_first=True,
# #         )
# #         self.output_layer = nn.Linear(input_dim, output_dim)

# #     def forward(self, x: torch.Tensor) -> torch.Tensor:
# #         # Transformer requires both src and tgt; for simplicity, we'll use the same input
# #         output = self.transformer(x, x)
# #         # Take the mean across the sequence length
# #         output = output.mean(dim=1)
# #         output = self.output_layer(output)
# #         return output

# # # RNN Model (Linear Scaling)
# # class RNNModel(nn.Module):
# #     def __init__(self, input_dim: int, hidden_dim: int, num_layers: int, output_dim: int):
# #         super(RNNModel, self).__init__()
# #         self.rnn = nn.RNN(
# #             input_size=input_dim,
# #             hidden_size=hidden_dim,
# #             num_layers=num_layers,
# #             batch_first=True,
# #         )
# #         self.output_layer = nn.Linear(hidden_dim, output_dim)

# #     def forward(self, x: torch.Tensor) -> torch.Tensor:
# #         # RNN returns output and hidden state; we'll use the final hidden state
# #         _, hn = self.rnn(x)
# #         # hn shape: (num_layers, batch_size, hidden_dim)
# #         hn = hn[-1]  # Take the output from the last layer
# #         output = self.output_layer(hn)
# #         return output

# # def benchmark_models():
# #     """
# #     Benchmark HCEN, Transformer, and RNN models to compare computational scaling.
# #     """
# #     input_dim = 128
# #     hidden_dim = 64
# #     output_dim = 10
# #     k = 5  # Number of segments to select at each level in HCEN
# #     num_heads = 8
# #     num_layers = 2
# #     batch_size = 32

# #     sequence_lengths = [2 ** i for i in range(5, 14)]  # Sequence lengths from 32 to 8192
# #     hcen_times = []
# #     transformer_times = []
# #     rnn_times = []

# #     for seq_len in sequence_lengths:
# #         x = torch.randn(batch_size, seq_len, input_dim)

# #         # HCEN Model
# #         hcen_model = HCEN(input_dim, hidden_dim, output_dim, k)
# #         start_time = time.time()
# #         hcen_output = hcen_model(x)
# #         end_time = time.time()
# #         hcen_elapsed = end_time - start_time
# #         hcen_times.append(hcen_elapsed)

# #         # Transformer Model
# #         # transformer_model = TransformerModel(input_dim, num_heads, num_layers, output_dim)
# #         # start_time = time.time()
# #         # transformer_output = transformer_model(x)
# #         # end_time = time.time()
# #         # transformer_elapsed = end_time - start_time
# #         # transformer_times.append(transformer_elapsed)

# #         # RNN Model
# #         rnn_model = RNNModel(input_dim, hidden_dim, num_layers, output_dim)
# #         start_time = time.time()
# #         rnn_output = rnn_model(x)
# #         end_time = time.time()
# #         rnn_elapsed = end_time - start_time
# #         rnn_times.append(rnn_elapsed)

# #         print(f"Sequence Length: {seq_len}, HCEN Time: {hcen_elapsed:.6f}s, "
# #               f"RNN Time: {rnn_elapsed:.6f}s")

# #     # Plotting the results
# #     plt.figure(figsize=(12, 8))
# #     plt.plot(sequence_lengths, hcen_times, marker='o', label='HCEN (Sub-Linear)')
# #     # plt.plot(sequence_lengths, transformer_times, marker='o', label='Transformer (Quadratic)')
# #     plt.plot(sequence_lengths, rnn_times, marker='o', label='RNN (Linear)')

# #     # Reference lines for O(N), O(N log N), O(N^2)
# #     N = np.array(sequence_lengths)
# #     plt.plot(N, N / N.max() * max(hcen_times + transformer_times + rnn_times), 'k--', label='O(N)')
# #     plt.plot(N, np.log(N) / np.log(N.max()) * max(hcen_times + transformer_times + rnn_times), 'g--', label='O(log N)')
# #     plt.plot(N, (N ** 2) / (N.max() ** 2) * max(hcen_times + transformer_times + rnn_times), 'r--', label='O(N^2)')

# #     plt.xlabel('Sequence Length (N)')
# #     plt.ylabel('Time Taken (seconds)')
# #     plt.title('Model Computational Time vs Sequence Length')
# #     plt.xscale('log')
# #     plt.yscale('log')
# #     plt.legend()
# #     plt.grid(True)
# #     plt.show()

# # if __name__ == "__main__":
# #     benchmark_models()


================================================
FILE: research/sub_linear.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from typing import List, Tuple
from loguru import logger
from pydantic import BaseModel

# Import-time side effect: registers an additional loguru sink that writes
# records at DEBUG level and above to "model_log.log", one
# "<time> <level> <message>" line per record. The path is relative, so the
# file is created wherever the process is started from.
logger.add(
    "model_log.log", format="{time} {level} {message}", level="DEBUG"
)


# Helper class for managing configuration using Pydantic
class ModelConfig(BaseModel):
    """
    Hyperparameters for SDCIModel, validated by Pydantic.
    All fields are required; none has a default value.
    """

    # Feature size of each input token.
    input_dim: int
    # Not read by SDCIModel in this file.
    num_layers: int
    # Used twice by SDCIModel: as the fraction of tokens kept by
    # SparseInformationExtraction and as mask_fraction for
    # DynamicMaskingActivation.
    sparsity: float
    # Number of consecutive tokens merged into one cluster row.
    cluster_size: int
    # Output width of the DynamicMaskingActivation linear layer.
    hidden_dim: int
    # Not read by SDCIModel in this file; the number of clusters is derived
    # from the sequence length inside HierarchicalClustering.
    num_clusters: int
    # Output width of the final classification layer.
    num_classes: int
    # Width of the recurrent memory vector.
    memory_size: int


# Sparse Information Extraction Layer
class SparseInformationExtraction(nn.Module):
    """
    Keeps only the highest-magnitude tokens of each sequence.

    Tokens are ranked by the L2 norm of their embedding and the top
    ``sparsity`` fraction of the sequence is retained.
    """

    def __init__(self, input_dim: int, sparsity: float):
        """
        Initializes the sparse selection layer.
        Args:
            input_dim (int): Dimension of input tokens. Stored for reference;
                the forward pass uses the feature size of the tensor it is given.
            sparsity (float): Fraction of tokens to select (between 0 and 1).
        """
        super(SparseInformationExtraction, self).__init__()
        self.input_dim = input_dim
        self.sparsity = sparsity

    def forward(self, x: Tensor) -> Tensor:
        """
        Select a sparse subset of tokens based on their magnitudes.
        Args:
            x (Tensor): Input token embeddings of shape (batch_size, seq_len, input_dim).
        Returns:
            Tensor: Selected tokens of shape (batch_size, k, input_dim), where
                k = int(sparsity * seq_len) clamped to the range [1, seq_len]
                (k is 0 only when seq_len itself is 0).
        """
        logger.debug(f"Original input shape: {x.shape}")

        # Compute the L2 norm across the token embeddings
        token_norms = torch.norm(x, p=2, dim=-1)
        logger.debug(f"Token norms shape: {token_norms.shape}")

        # Number of tokens to keep. int() truncates, so short sequences or a
        # small sparsity would otherwise give k == 0 and an empty selection,
        # while sparsity > 1 would ask topk for more tokens than exist.
        seq_len = x.size(1)
        k = min(seq_len, max(1, int(self.sparsity * seq_len)))

        # NOTE: topk returns indices ordered by descending norm, so the
        # selected tokens are not in their original sequence order.
        _, topk_indices = torch.topk(token_norms, k, dim=1)

        # Gather the top-k tokens. The index is expanded to the tensor's
        # actual feature size so every feature of a selected token is kept.
        sparse_x = torch.gather(
            x,
            1,
            topk_indices.unsqueeze(-1).expand(-1, -1, x.size(-1)),
        )
        logger.debug(f"Sparse input shape: {sparse_x.shape}")

        return sparse_x


# Hierarchical Clustering Layer
class HierarchicalClustering(nn.Module):
    """
    Groups consecutive tokens into clusters by concatenating their features.

    No parameters are learned: every run of ``cluster_size`` adjacent tokens
    becomes a single row of width ``cluster_size * input_dim``.
    """

    def __init__(self, cluster_size: int):
        """
        Initializes the clustering layer.
        Args:
            cluster_size (int): Number of consecutive tokens merged into each cluster.
        """
        super(HierarchicalClustering, self).__init__()
        self.cluster_size = cluster_size

    def forward(self, x: Tensor) -> Tensor:
        """
        Cluster tokens by reshaping the sequence dimension into the feature dimension.
        Args:
            x (Tensor): Sparse input tokens of shape (batch_size, seq_len, input_dim).
        Returns:
            Tensor: Clustered tokens of shape
                (batch_size, seq_len // cluster_size, cluster_size * input_dim).
                Trailing tokens that do not fill a whole cluster are dropped.
        """
        logger.debug(f"Input before clustering: {x.shape}")
        batch_size, seq_len, input_dim = x.shape
        num_clusters = seq_len // self.cluster_size

        # A plain view() raises when seq_len is not a multiple of
        # cluster_size; keep only the tokens that form complete clusters.
        usable_len = num_clusters * self.cluster_size
        if usable_len != seq_len:
            logger.debug(
                f"Dropping {seq_len - usable_len} trailing tokens that do not fill a cluster"
            )
            x = x[:, :usable_len]

        # reshape() behaves like view() on contiguous input and falls back to
        # a copy for non-contiguous input instead of raising.
        x = x.reshape(
            batch_size, num_clusters, self.cluster_size * input_dim
        )
        logger.debug(f"Input after clustering: {x.shape}")
        return x


# Dynamic Activation Layer
class DynamicMaskingActivation(nn.Module):
    """
    Linear projection followed by a random element-wise mask and ReLU.

    Each projected element is kept independently with probability
    ``mask_fraction`` and zeroed otherwise. The mask is redrawn on every call,
    is applied regardless of training/eval mode, and kept values are not
    rescaled.
    """

    def __init__(
        self, input_dim: int, hidden_dim: int, mask_fraction: float
    ):
        """
        Initializes the dynamic activation layer.
        Args:
            input_dim (int): Dimension of input layer (matches the output of clustering layer).
            hidden_dim (int): Dimension of hidden layer.
            mask_fraction (float): Probability that each hidden element stays
                active; the expected, not exact, fraction of active neurons.
        """
        super(DynamicMaskingActivation, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.mask_fraction = mask_fraction
        self.fc = nn.Linear(
            input_dim, hidden_dim
        )  # Adjusted to take input_dim

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply dynamic masking to the hidden layer.
        Args:
            x (Tensor): Input tensor whose last dimension is input_dim,
                typically (batch_size, seq_len, input_dim).
        Returns:
            Tensor: Masked activation output with last dimension hidden_dim.
        """
        logger.debug(f"Input before dynamic masking: {x.shape}")

        # Apply linear transformation
        hidden = self.fc(x)

        # Draw an independent keep/drop decision for every element: 1 with
        # probability mask_fraction, 0 otherwise.
        mask = torch.zeros_like(hidden).bernoulli_(self.mask_fraction)
        hidden = hidden * mask
        logger.debug(f"Masked output shape: {hidden.shape}")

        return F.relu(hidden)


# Sparse Recursion-Based Memory Layer
class SparseMemory(nn.Module):
    """
    Recurrent memory cell: pools a sequence into one vector and folds it into
    a running memory state.
    """

    def __init__(self, input_dim: int, memory_size: int):
        """
        Builds the projection used to write into memory.
        Args:
            input_dim (int): Feature size of the incoming sequence elements.
            memory_size (int): Width of the memory vector.
        """
        super().__init__()
        self.memory_size = memory_size
        self.fc = nn.Linear(input_dim, memory_size)

    def forward(
        self, x: Tensor, memory: Tensor
    ) -> Tuple[Tensor, Tensor]:
        """
        Pool the sequence and produce the next memory state.
        Args:
            x (Tensor): Input of shape (batch_size, seq_len, input_dim).
            memory (Tensor): Previous memory of shape (batch_size, memory_size).
        Returns:
            Tuple[Tensor, Tensor]: The mean-pooled input of shape
                (batch_size, input_dim) and the new memory of shape
                (batch_size, memory_size).
        """
        logger.debug(f"Input before memory update: {x.shape}")

        # Average over the sequence axis; the feature width is unchanged.
        pooled = x.mean(dim=1)
        logger.debug(f"Compressed input shape: {pooled.shape}")

        # Project the pooled vector to memory width, add the previous state,
        # and rectify.
        projected = self.fc(pooled)
        next_memory = torch.relu(projected + memory)
        logger.debug(f"Updated memory shape: {next_memory.shape}")

        return pooled, next_memory


# Main SDCI Model Architecture
class SDCIModel(nn.Module):
    """
    End-to-end SDCI pipeline: token selection -> clustering -> masked
    activation -> memory update -> classification head.
    """

    def __init__(self, config: ModelConfig):
        """
        Builds every stage of the pipeline from a configuration object.
        Args:
            config (ModelConfig): Hyperparameters; reads input_dim, sparsity,
                cluster_size, hidden_dim, memory_size and num_classes.
        """
        super().__init__()

        # Each cluster row concatenates cluster_size tokens of input_dim features.
        clustered_width = config.cluster_size * config.input_dim

        self.sparse_extraction = SparseInformationExtraction(
            config.input_dim, config.sparsity
        )
        self.clustering = HierarchicalClustering(config.cluster_size)
        self.dynamic_activation = DynamicMaskingActivation(
            input_dim=clustered_width,
            hidden_dim=config.hidden_dim,
            mask_fraction=config.sparsity,
        )
        self.memory = SparseMemory(
            config.hidden_dim, config.memory_size
        )
        self.fc_out = nn.Linear(
            config.memory_size, config.num_classes
        )

    def forward(
        self, x: Tensor, memory: Tensor
    ) -> Tuple[Tensor, Tensor]:
        """
        Runs one step of the model.
        Args:
            x (Tensor): Input tensor of shape (batch_size, seq_len, input_dim).
            memory (Tensor): Memory tensor of shape (batch_size, memory_size).
        Returns:
            Tuple[Tensor, Tensor]: Class scores computed from the updated
                memory, and the updated memory itself.
        """
        logger.debug("Starting forward pass of the model.")

        selected = self.sparse_extraction(x)  # keep the strongest tokens
        clustered = self.clustering(selected)  # merge adjacent tokens
        activated = self.dynamic_activation(clustered)  # project, mask, ReLU

        # The pooled activations returned first are not used further; only
        # the new memory state feeds the output head.
        _, memory = self.memory(activated, memory)

        return self.fc_out(memory), memory


import time
import matplotlib.pyplot as plt

# Example configuration used by the benchmark below. Created at import time.
config = ModelConfig(
    input_dim=128,  # feature size of each input token
    num_layers=4,  # not read by SDCIModel
    sparsity=0.5,  # token keep-fraction and activation mask probability
    cluster_size=4,  # tokens per cluster -> clustered width 4 * 128 = 512
    hidden_dim=256,
    num_clusters=16,  # not read by SDCIModel
    num_classes=10,
    memory_size=128,
)

# Instantiate the model. The memory tensor is not created here; it is
# allocated inside benchmark_model.
model = SDCIModel(config)


# Function to benchmark the model with different input sizes
def benchmark_model(
    model: nn.Module, input_sizes: List[int], batch_size: int = 32
):
    """
    Times one gradient-free forward pass of ``model`` per sequence length.

    Reads the module-level ``config`` for ``memory_size`` and ``input_dim``.
    The same zero-initialised memory tensor is passed to every call; the
    memory returned by the model is discarded.

    Args:
        model (nn.Module): Callable as ``model(input_tensor, memory)``.
        input_sizes (List[int]): Sequence lengths to benchmark, in order.
        batch_size (int): Batch size of the random inputs and of the memory.
    Returns:
        List[float]: Elapsed seconds for each entry of ``input_sizes``.
    """
    times = []
    memory = torch.zeros(
        batch_size, config.memory_size
    )  # Initialize memory

    for input_size in input_sizes:
        input_tensor = torch.randn(
            batch_size, input_size, config.input_dim
        )  # Generate random input

        # Measure only the forward pass. perf_counter() is monotonic and
        # high-resolution, unlike time.time(), which follows the wall clock
        # and can jump if the system time is adjusted.
        with torch.no_grad():  # Disable gradients for benchmarking
            start_time = time.perf_counter()
            _ = model(input_tensor, memory)
            elapsed_time = time.perf_counter() - start_time

        times.append(elapsed_time)
        logger.info(
            f"Input size {input_size} - Elapsed time: {elapsed_time:.6f} seconds"
        )

    return times


# Define the range of input sizes to test
input_sizes = [128, 256, 512, 1024, 2048]

# Run the benchmark
execution_times = benchmark_model(model, input_sizes)

# Reference growth curves, rescaled so that both pass through the first
# measured point. Plotting raw N and N^2 on an axis measured in seconds
# would dwarf the measured timings and make the comparison unreadable.
base_size = input_sizes[0]
base_time = execution_times[0]
linear_reference = [
    base_time * (size / base_size) for size in input_sizes
]
quadratic_reference = [
    base_time * (size / base_size) ** 2 for size in input_sizes
]

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(
    input_sizes,
    execution_times,
    label="Model Execution Time",
    marker="o",
)
plt.plot(
    input_sizes,
    linear_reference,
    label="Linear Time (O(N))",
    linestyle="--",
)
plt.plot(
    input_sizes,
    quadratic_reference,
    label="Quadratic Time (O(N^2))",
    linestyle="--",
)
plt.xlabel("Input Sequence Length (N)")
plt.ylabel("Execution Time (seconds)")
plt.title("Benchmark: Model Execution Time vs Input Size")
plt.legend()
plt.grid(True)
plt.show()


================================================
FILE: scripts/code_quality.sh
================================================
#!/bin/bash
#
# Formats and lints the Python sources under one directory by running
# autopep8, black, ruff and yapf in that order. Each tool runs even if an
# earlier one fails.
#
# Usage: code_quality.sh [target_dir]
#   target_dir  Directory to process (default: package/)

# Navigate to the directory containing the 'package' folder
# cd /path/to/your/code/directory

target="${1:-package/}"

# Run autopep8 with two levels of aggressiveness and in-place modification
# on all Python files under the target directory, listing the applied fixes.
autopep8 --in-place --aggressive --aggressive --recursive --experimental --list-fixes "$target"

# Run black. Black has no aggressiveness level; the flag below opts in to its
# experimental string processing.
# NOTE(review): newer black releases deprecate this flag in favour of
# preview/unstable options - confirm against the installed version.
black --experimental-string-processing "$target"

# Run ruff on the target directory, applying safe and unsafe fixes. Fix
# options belong to the "check" subcommand and are spelled with hyphens.
ruff check --fix --unsafe-fixes "$target"

# YAPF
yapf --recursive --in-place --verbose --style=google --parallel "$target"


================================================
FILE: scripts/merge_all_prs.sh
================================================
#!/bin/bash
#
# Requests a merge (merge-commit strategy, auto-merge enabled) for every open
# pull request of the current repository using the GitHub CLI.
#
# Requires: git, and gh authenticated for this repository.
# Exit status: 1 when not inside a Git repository or when the PR list cannot
# be fetched; 0 otherwise, even if individual merges fail.

# Check if we are inside a Git repository
if ! git rev-parse --git-dir > /dev/null 2>&1; then
    echo "Error: Must be run inside a Git repository." >&2
    exit 1
fi

# Fetch all open pull requests. A failing gh call (missing binary, no
# authentication, network error) must not be mistaken for "no open PRs".
echo "Fetching open PRs..."
if ! prs=$(gh pr list --state open --json number --jq '.[].number'); then
    echo "Error: Failed to list open PRs." >&2
    exit 1
fi

# Check if there are PRs to merge
if [ -z "$prs" ]; then
    echo "No open PRs to merge."
    exit 0
fi

echo "Found PRs: $prs"

# Loop through each pull request number and merge it.
# $prs is a whitespace-separated list of integers, so word splitting is intended.
# shellcheck disable=SC2086
for pr in $prs; do
    echo "Attempting to merge PR #$pr"
    # Capture stderr as well: gh writes its error text there, and without
    # the redirect the failure message below would be empty.
    merge_output=$(gh pr merge "$pr" --auto --merge 2>&1)
    merge_status=$?
    if [ "$merge_status" -ne 0 ]; then
        echo "Failed to merge PR #$pr. Error: $merge_output" >&2
    else
        # With --auto, gh may only have enabled auto-merge; the PR is merged
        # once its requirements are met.
        echo "Successfully merged PR #$pr"
    fi
done

echo "Processing complete."


================================================
FILE: scripts/test_name.sh
================================================
#!/usr/bin/env bash
#
# Ensures every Python file under a tests directory is named test_*.py by
# prefixing "test_" to the files that lack it.
#
# Usage: test_name.sh [tests_dir]
#   tests_dir  Directory to scan recursively (default: ./tests)

#######################################
# Renames <dir>/<name>.py to <dir>/test_<name>.py for every regular *.py file
# whose name does not already start with "test_".
# Arguments: $1 - root directory to scan (default: ./tests)
# Outputs:   a warning on stderr for each file skipped because the target
#            name already exists
#######################################
rename_tests() {
  local root=${1:-./tests}
  local file filename dir target

  # NUL-delimited paths with "IFS= read -r" keep backslashes, leading or
  # trailing whitespace, and newlines in file names intact.
  while IFS= read -r -d '' file; do
    filename=${file##*/}
    dir=${file%/*}
    if [[ $filename == test_* ]]; then
      continue
    fi

    target="$dir/test_$filename"
    # Never clobber an existing test file with the renamed one.
    if [[ -e $target ]]; then
      printf 'Skipping %s: %s already exists\n' "$file" "$target" >&2
      continue
    fi

    mv -- "$file" "$target"
  done < <(find "$root" -name '*.py' -type f -print0)
}

rename_tests "$@"

================================================
FILE: scripts/tests.sh
================================================
#!/usr/bin/env bash
#
# Runs pytest on every Python file under ./tests.
#
# "-exec ... +" passes the files to pytest in batches and makes find exit
# non-zero when pytest does, so a test failure reaches the caller (CI).
# With "-exec ... \;" find exits 0 no matter what pytest returns.
find ./tests -name '*.py' -exec pytest {} +