Repository: karlicoss/cachew
Branch: master
Commit: 7e785aac758f
Files: 46
Total size: 251.7 KB
Directory structure:
gitextract_k61syhvn/
├── .ci/
│ ├── release
│ └── run
├── .gitattributes
├── .github/
│ └── workflows/
│ └── main.yml
├── .gitignore
├── .idea/
│ └── dictionaries/
│ └── karlicos.xml
├── LICENSE.txt
├── README.ipynb
├── README.md
├── benchmarks/
│ ├── 20230912-comparison-with-legacy.org
│ ├── 20230912.org
│ └── 20230917.org
├── doc/
│ ├── cachew_disable.md
│ ├── serialization.org
│ └── test_serialization.py
├── generate-readme
├── github-issues.org
├── misc/
│ ├── profile.py
│ └── test_redis/
│ ├── docker-compose.yml
│ └── test.py
├── mypy.ini
├── pyproject.toml
├── pytest.ini
├── ruff.toml
├── src/
│ └── cachew/
│ ├── __init__.py
│ ├── backend/
│ │ ├── common.py
│ │ ├── file.py
│ │ └── sqlite.py
│ ├── common.py
│ ├── compat.py
│ ├── experimental.py
│ ├── extra.py
│ ├── legacy.py
│ ├── logging_helper.py
│ ├── marshall/
│ │ ├── cachew.py
│ │ └── common.py
│ ├── py.typed
│ ├── pytest.py
│ ├── tests/
│ │ ├── marshall.py
│ │ ├── test_cachew.py
│ │ ├── test_future_annotations.py
│ │ ├── test_resolve_type_parameters.py
│ │ └── utils.py
│ └── utils.py
├── tox.ini
└── ty.toml
================================================
FILE CONTENTS
================================================
================================================
FILE: .ci/release
================================================
#!/usr/bin/env python3
'''
Deploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]].
- running manually
You'll need =UV_PUBLISH_TOKEN= env variable
- running on Github Actions
Instead of env variable, relies on configuring github as Trusted publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi
It's running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]].
Packages are deployed on:
- every master commit, onto test pypi
- every new tag, onto production pypi
'''
# Name of the env variable that must hold the PyPi auth token when running outside CI
UV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN'
import argparse
import os
from pathlib import Path
from subprocess import check_call
# Github Actions (and most CI providers) set the CI env variable for all jobs
is_ci = os.environ.get('CI') is not None
def main() -> None:
    """
    Build the package with uv and upload it to PyPi.

    Pass --use-test-pypi to upload to https://test.pypi.org instead of production PyPi.
    Outside CI, requires the UV_PUBLISH_TOKEN env variable; on CI, trusted publishing is used.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--use-test-pypi', action='store_true')
    args = p.parse_args()

    # uv publishes to production pypi by default; only need to override the url for test pypi
    publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else []

    # run from the repo root regardless of where the script was invoked from
    root = Path(__file__).absolute().parent.parent
    os.chdir(root)  # just in case

    check_call(['uv', 'build', '--clear'])

    if not is_ci:
        # CI relies on trusted publishers so doesn't need env variable
        # NOTE: explicit check instead of assert -- asserts are stripped under `python -O`
        if UV_PUBLISH_TOKEN not in os.environ:
            raise SystemExit(f'no {UV_PUBLISH_TOKEN} passed')

    check_call(['uv', 'publish', *publish_url])


if __name__ == '__main__':
    main()
================================================
FILE: .ci/run
================================================
#!/bin/bash
# CI entry point: runs the test suite via tox (through uv), handling OS-specific quirks.
set -eu
cd "$(dirname "$0")"
cd .. # git root
if ! command -v sudo; then
    # CI or Docker sometimes doesn't have it, so useful to have a dummy
    function sudo {
        "$@"
    }
fi
# --parallel-live to show outputs while it's running
tox_cmd='run-parallel --parallel-live'
if [ -n "${CI-}" ]; then
    # install OS specific stuff here
    # $OSTYPE is a bash builtin identifying the platform
    case "$OSTYPE" in
    darwin*)
        # macos
        :
        ;;
    cygwin* | msys* | win*)
        # windows
        # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that
        # so fall back to sequential runs there
        tox_cmd='run'
        ;;
    *)
        # must be linux?
        :
        ;;
    esac
fi
# NOTE: expects uv installed
# tox-uv makes tox use uv for creating environments/installing deps
uv tool run --with tox-uv tox $tox_cmd "$@"
================================================
FILE: .gitattributes
================================================
*.ipynb filter=nbstripout
*.ipynb diff=ipynb
================================================
FILE: .github/workflows/main.yml
================================================
# see https://github.com/karlicoss/pymplate for up-to-date reference
name: CI
on:
push:
branches: '*'
tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi
# Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug:
# Needed to trigger on others' PRs.
# Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them".
pull_request:
# Needed to trigger workflows manually.
workflow_dispatch:
inputs:
debug_enabled:
type: boolean
description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
required: false
default: false
schedule:
- cron: '31 18 * * 5' # run every Friday
jobs:
build:
strategy:
fail-fast: false
matrix:
platform: [ubuntu-latest, macos-latest] # windows-latest
python-version: ['3.12', '3.13', '3.14']
# vvv just an example of excluding stuff from matrix
# exclude: [{platform: macos-latest, python-version: '3.6'}]
runs-on: ${{ matrix.platform }}
# useful for 'optional' pipelines
# continue-on-error: ${{ matrix.platform == 'windows-latest' }}
steps:
# ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
- run: echo "$HOME/.local/bin" >> $GITHUB_PATH
- uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0 # nicer to have all git history when debugging/for tests
- uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- uses: astral-sh/setup-uv@v7
with:
enable-cache: false # we don't have lock files, so can't use them as cache key
- uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}
# explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd...
- run: bash .ci/run
env:
# only compute lxml coverage on ubuntu; it crashes on windows
CI_MYPY_COVERAGE: ${{ matrix.platform == 'ubuntu-latest' && '--cobertura-xml-report .coverage.mypy' || '' }}
- if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms
uses: codecov/codecov-action@v5
with:
fail_ci_if_error: true # default false
token: ${{ secrets.CODECOV_TOKEN }}
flags: mypy-${{ matrix.python-version }}
files: .coverage.mypy/cobertura.xml
pypi:
# Do not run it for PRs/cron schedule etc.
# NOTE: release tags are guarded by on: push: tags on the top.
if: github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') || (github.event.ref == format('refs/heads/{0}', github.event.repository.master_branch)))
# Ugh, I tried using matrix or something to explicitly generate only test pypi or prod pypi pipelines.
# But github actions is so shit, it's impossible to do any logic at all, e.g. doesn't support conditional matrix, if/else statements for variables etc.
needs: [build] # add all other jobs here
runs-on: ubuntu-latest
permissions:
# necessary for Trusted Publishing
id-token: write
steps:
# ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
- run: echo "$HOME/.local/bin" >> $GITHUB_PATH
- uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0 # pull all commits to correctly infer vcs version
- uses: actions/setup-python@v6
with:
python-version: '3.12'
- uses: astral-sh/setup-uv@v7
with:
enable-cache: false # we don't have lock files, so can't use them as cache key
- name: 'release to test pypi'
# always deploy merged master to test pypi
if: github.event.ref == format('refs/heads/{0}', github.event.repository.master_branch)
run: .ci/release --use-test-pypi
- name: 'release to prod pypi'
# always deploy tags to release pypi
if: startsWith(github.event.ref, 'refs/tags/')
run: .ci/release
================================================
FILE: .gitignore
================================================
# Created by https://www.gitignore.io/api/python,emacs
# Edit at https://www.gitignore.io/?templates=python,emacs
### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# End of https://www.gitignore.io/api/python,emacs
untracked/
================================================
FILE: .idea/dictionaries/karlicos.xml
================================================
cachew
dataclassish
pylint
typecheck
================================================
FILE: LICENSE.txt
================================================
The MIT License (MIT)
Copyright (c) 2019 Dima Gerasimov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"from pathlib import Path\n",
"\n",
"import jedi # ty: ignore[unresolved-import]\n",
"\n",
"\n",
"def git_root() -> Path:\n",
" import subprocess\n",
"\n",
" path_s = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], text=True).strip()\n",
" path = Path(path_s)\n",
" assert path.is_absolute(), path # just in case\n",
" return path\n",
"\n",
"\n",
"src_dir = git_root() / 'src'\n",
"assert src_dir.exists(), src_dir # seems like jedi is pretty quiet about missing dirs..\n",
"\n",
"project = jedi.Project(src_dir)\n",
"\n",
"\n",
"def _find(name: str):\n",
" # ugh. sometimes it returns exact dupes for no apparent reason??\n",
" completions = set(project.search(name, all_scopes=True))\n",
" assert len(completions) == 1, f\"Expected one completion for {name}, got {completions}\"\n",
" [c] = completions\n",
" [c] = c.goto() # todo what is this for?\n",
" return c\n",
"\n",
"\n",
"def rlink(name: str) -> str:\n",
" c = _find(name)\n",
" if c.module_path is None:\n",
" # TODO ugh raise an issue on tracker or something??\n",
" # seems to only happen for namespace packages..\n",
" assert c.description.startswith('namespace '), c\n",
" res = name.replace('.', '/')\n",
" assert (src_dir / res).exists(), res\n",
" return f'src/{res}'\n",
" else:\n",
" rpath = Path(c.module_path).relative_to(src_dir)\n",
" return f'src/{rpath}#L{c.line}'\n",
"\n",
"\n",
"# TODO ugh.. annoying, seems like Jedi can't get the functions source?\n",
"# maybe because it's doing partial parsing or something?\n",
"# there is c._get_module_context().code_lines, but it returns all lines in a source file??\n",
"def getsource(symbol: str) -> str:\n",
" c = _find(symbol)\n",
" p = Path(c.module_path)\n",
" # TODO check that it's a function?\n",
" function_name = symbol.split('.')[-1]\n",
" assert p.exists(), p\n",
" src = p.read_text()\n",
" src_lines = src.splitlines(keepends=True)\n",
" for x in ast.walk(ast.parse(src)):\n",
" if isinstance(x, ast.FunctionDef) and x.name == function_name:\n",
" break\n",
" else:\n",
" raise RuntimeError(f'Function not found: {symbol}')\n",
"\n",
" # ugh lineno is 1-indexed, and seems like a closed interval?\n",
" return ''.join(src_lines[x.lineno - 1 : x.end_lineno])\n",
"\n",
"\n",
"def getdoc(symbol: str) -> str:\n",
" c = _find(symbol)\n",
" doc = c.docstring()\n",
" assert doc is not None, symbol\n",
" return doc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO just get rid of this in favor of native markdown + rlink?\n",
"def flink(title: str, name: str | None = None) -> str:\n",
" if name is None:\n",
" name = title.replace('`', '') # meh\n",
" if name.startswith('tests'):\n",
" name = name.replace('tests', 'cachew.tests.test_cachew')\n",
" # FIXME just replace in code..\n",
"\n",
" return f\"[{title}]({rlink(name)})\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Markdown as md # ty: ignore[unresolved-import]\n",
"\n",
"dmd = lambda x: display(md(x.strip())) # ty: ignore[unresolved-reference]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"dmd('''\n",
"\n",
"''')"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# What is Cachew?\n",
"TLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.\n",
"Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.\n",
"\n",
"In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.\n",
"\n",
"A simple type is defined as:\n",
"\n",
"- primitive: `str`/`int`/`float`/`bool`\n",
"- JSON-like types (`dict`/`list`/`tuple`)\n",
"- `datetime`\n",
"- `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )\n",
"- [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)\n",
"- [dataclasses](https://docs.python.org/3/library/dataclasses.html)\n",
"\n",
"\n",
"That allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing.\n",
"Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.\n",
"\n",
"## Motivation\n",
"\n",
"I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting few bits I'm interested at. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.\n",
"\n",
"Conventional way of dealing with it is serializing results along with some sort of hash (e.g. md5) of input files,\n",
"comparing on the next run and returning cached data if nothing changed.\n",
"\n",
"Simple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task.\n",
"\n",
"\n",
"# Examples\n",
"## Processing Wikipedia\n",
"Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive.\n",
"Parsing it (`extract_links` function) takes hours, however, as long as the archive is same you will always get same results. So it would be nice to be able to cache the results somehow.\n",
"\n",
"\n",
"With this library you can achieve it through single `@cachew` decorator."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# FIXME hmm seems like this doesn't work if there are type annotations on cachew_impl? odd\n",
"# likely this? https://github.com/davidhalter/jedi/issues/2025\n",
"doc = getdoc('cachew_impl').split('Usage example:')[-1].lstrip()\n",
"dmd(f\"\"\"```python\n",
"{doc}\n",
"```\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.\n",
"\n",
"When you use newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Incremental data exports\n",
"This is my most common usecase of cachew, which I'll illustrate with example.\n",
"\n",
"I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.\n",
"Data is synchronized via bluetooth in the sqlite database, which is easy to access. However sensor has limited memory (e.g. 1000 latest measurements).\n",
"That means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.:\n",
"\n",
" ...\n",
" 20190715100026.db\n",
" 20190716100138.db\n",
" 20190717101651.db\n",
" 20190718100118.db\n",
" 20190719100701.db\n",
" ...\n",
"\n",
"To access **all** of historic temperature data, I have two options:\n",
"\n",
"- Go through all the data chunks every time I want to access them and 'merge' into a unified stream of measurements, e.g. something like:\n",
" \n",
" def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n",
" for chunk in chunks:\n",
" # read measurements from 'chunk' and yield unseen ones\n",
"\n",
" This is very **easy, but slow** and you waste CPU for no reason every time you need data.\n",
"\n",
"- Keep a 'master' database and write code to merge chunks in it.\n",
"\n",
" This is very **efficient, but tedious**:\n",
" \n",
" - requires serializing/deserializing data -- boilerplate\n",
" - requires manually managing sqlite database -- error prone, hard to get right every time\n",
" - requires careful scheduling, ideally you want to access new data without having to refresh cache\n",
"\n",
" \n",
"Cachew gives the best of two worlds and makes it both **easy and efficient**. The only thing you have to do is to decorate your function:\n",
"\n",
" @cachew \n",
" def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n",
" # ...\n",
" \n",
"- as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast\n",
"- you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. you got new data)\n",
"\n",
" All the complexity of handling database is hidden in `cachew` implementation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"link = rlink('composite_hash')\n",
"\n",
"dmd(f'''\n",
"# How it works\n",
"\n",
"- first your objects get {flink('converted', 'cachew.marshall.cachew.CachewMarshall')} into a simpler JSON-like representation\n",
"- after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).\n",
"\n",
"When the function is called, cachew [computes the hash of your function's arguments ]({link})\n",
"and compares it against the previously stored hash value.\n",
"\n",
"- If they match, it would deserialize and yield whatever is stored in the cache database\n",
"- If the hash mismatches, the original function is called and new data is stored along with the new hash\n",
"''')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"dmd('# Features')\n",
"types = [f'`{t}`' for t in ['str', 'int', 'float', 'bool', 'datetime', 'date', 'Exception']]\n",
"dmd(f\"\"\"\n",
"* automatic schema inference: {flink('1', 'tests.test_return_type_inference')}, {flink('2', 'tests.test_return_type_mismatch')}\n",
"* supported types:\n",
"\n",
" * primitive: {', '.join(types)}\n",
"\n",
" See {flink('tests.test_types')}, {flink('tests.test_primitive')}, {flink('tests.test_dates')}, {flink('tests.test_exceptions')}\n",
" * {flink('@dataclass and NamedTuple', 'tests.test_dataclass')}\n",
" * {flink('Optional', 'tests.test_optional')} types\n",
" * {flink('Union', 'tests.test_union')} types\n",
" * {flink('nested datatypes', 'tests.test_nested')}\n",
"\n",
"* detects {flink('datatype schema changes', 'tests.test_schema_change')} and discards old data automatically\n",
"\"\"\")\n",
"# * custom hash function TODO example with mtime?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Performance\n",
"Updating cache takes certain overhead, but that would depend on how complicated your datatype in the first place, so I'd suggest measuring if you're not sure.\n",
"\n",
"When reading from cache, all that happens is reading blobs from sqlite/decoding as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.\n",
"\n",
"It would almost certainly make your program faster if your computations take more than several seconds.\n",
"\n",
"You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"dmd(f\"\"\"\n",
"# Using\n",
"See {flink('docstring', 'cachew_impl')} for up-to-date documentation on parameters and return types.\n",
"You can also use {flink('extensive unit tests', 'tests')} as a reference.\n",
"\n",
"Some useful (but optional) arguments of `@cachew` decorator:\n",
"\n",
"* `cache_path` can be a directory, or a callable that {flink('returns a path', 'tests.test_callable_cache_path')} and depends on function's arguments.\n",
"\n",
" By default, `settings.DEFAULT_CACHEW_DIR` is used.\n",
"\n",
"* `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.\n",
"\n",
" By default it just uses string representation of the arguments, you can also specify a custom callable.\n",
"\n",
" For instance, it can be used to {flink('discard cache', 'tests.test_custom_hash')} if the input file was modified.\n",
"\n",
"* `cls` is the type that would be serialized.\n",
"\n",
" By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Installing\n",
"Package is available on [pypi](https://pypi.org/project/cachew/).\n",
"\n",
" pip3 install --user cachew\n",
" \n",
"## Developing\n",
"I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI."
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Implementation\n",
"\n",
"* why NamedTuples and dataclasses?\n",
" \n",
" `NamedTuple` and `dataclass` provide a very straightforward and self documenting way to represent data in Python.\n",
" Very compact syntax makes it extremely convenient even for one-off means of communicating between couple of functions.\n",
" \n",
" If you want to find out more why you should use more dataclasses in your code I suggest these links:\n",
" \n",
" - [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)\n",
" - [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)\n",
" \n",
"* why not `pandas.DataFrame`?\n",
"\n",
" DataFrames are great and can be serialised to csv or pickled.\n",
" They are good to have as one of the ways you can interface with your data, however hardly convenient to think about it abstractly due to their dynamic nature.\n",
" They also can't be nested.\n",
"\n",
"* why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?\n",
" \n",
" ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat an overkill for such a specific purpose.\n",
"\n",
" * E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires you using custom sqlalchemy specific types and inheriting a base class.\n",
" Also it doesn't support nested types.\n",
" \n",
"* why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?\n",
"\n",
" Pickling is kinda heavyweight for a plain data class, and it's slower than just using JSON. Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface.\n",
"\n",
" Marshmallow is a common way to map data into db-friendly format, but it requires explicit schema which is an overhead when you have it already in the form of type annotations. I've looked at existing projects to utilize type annotations, but didn't find them covering all I wanted:\n",
" \n",
" * https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api\n",
" * https://pypi.org/project/marshmallow-dataclass\n",
" \n",
" I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).\n",
" So far looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite.\n",
"\n",
"* why `sqlite` database for storage?\n",
"\n",
" It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.\n",
"\n",
" There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cache(backend='file')`, or via `settings.DEFAULT_BACKEND`.\n",
" It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.\n",
" \n",
" It would also be interesting to experiment with in-RAM storages.\n",
"\n",
" I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tips and tricks\n",
"## Optional dependency\n",
"You can benefit from `cachew` even if you don't want to bloat your app's dependencies. Just use the following snippet:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dmd(f\"\"\"```python\n",
"{getsource('cachew.extra.mcachew')}\n",
"```\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.\n",
"\n",
"## Settings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dmd(f'''\n",
"{flink('cachew.settings')} exposes some parameters that allow you to control `cachew` behaviour:\n",
"- `ENABLE`: set to `False` if you want to disable caching without removing the decorators (useful for testing and debugging).\n",
" You can also use {flink('cachew.extra.disabled_cachew')} context manager to do it temporarily.\n",
"- `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the \"user cache directory\" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).\n",
"- `THROW_ON_ERROR`: by default, cachew is defensive and simply attempts to call the original function on caching issues.\n",
" Set to `True` to catch errors earlier.\n",
"- `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).\n",
"\n",
"''')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Updating this readme\n",
"This is a literate readme, implemented as a Jupyter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use [generate-readme](generate-readme) script."
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "cachew",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
},
"name": "README.ipynb"
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: README.md
================================================
# What is Cachew?
TLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.
Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.
In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.
A simple type is defined as:
- primitive: `str`/`int`/`float`/`bool`
- JSON-like types (`dict`/`list`/`tuple`)
- `datetime`
- `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )
- [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)
- [dataclasses](https://docs.python.org/3/library/dataclasses.html)
That allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing.
Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.
## Motivation
I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting few bits I'm interested at. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.
Conventional way of dealing with it is serializing results along with some sort of hash (e.g. md5) of input files,
comparing on the next run and returning cached data if nothing changed.
Simple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task.
# Examples
## Processing Wikipedia
Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive.
Parsing it (`extract_links` function) takes hours, however, as long as the archive is the same you will always get the same results. So it would be nice to be able to cache the results somehow.
With this library you can achieve it through a single `@cachew` decorator.
```python
>>> from typing import NamedTuple, Iterator
>>> class Link(NamedTuple):
... url : str
... text: str
...
>>> @cachew
... def extract_links(archive_path: str) -> Iterator[Link]:
... for i in range(5):
... # simulate slow IO
... # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours
... import time; time.sleep(1)
... yield Link(url=f'http://link{i}.org', text=f'text {i}')
...
>>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run
[Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')]
>>> from timeit import Timer
>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1)
... # second run is cached, so should take less time
>>> print(f"call took {int(res)} seconds")
call took 0 seconds
>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1)
... # now file has changed, so the cache will be discarded
>>> print(f"call took {int(res)} seconds")
call took 5 seconds
```
When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.
When you use newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately.
## Incremental data exports
This is my most common usecase of cachew, which I'll illustrate with example.
I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.
Data is synchronized via bluetooth in the sqlite database, which is easy to access. However sensor has limited memory (e.g. 1000 latest measurements).
That means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.:
...
20190715100026.db
20190716100138.db
20190717101651.db
20190718100118.db
20190719100701.db
...
To access **all** of historic temperature data, I have two options:
- Go through all the data chunks every time I want to access them and 'merge' into a unified stream of measurements, e.g. something like:
def measurements(chunks: List[Path]) -> Iterator[Measurement]:
for chunk in chunks:
# read measurements from 'chunk' and yield unseen ones
This is very **easy, but slow** and you waste CPU for no reason every time you need data.
- Keep a 'master' database and write code to merge chunks in it.
This is very **efficient, but tedious**:
- requires serializing/deserializing data -- boilerplate
- requires manually managing sqlite database -- error prone, hard to get right every time
- requires careful scheduling, ideally you want to access new data without having to refresh cache
Cachew gives the best of two worlds and makes it both **easy and efficient**. The only thing you have to do is to decorate your function:
@cachew
def measurements(chunks: List[Path]) -> Iterator[Measurement]:
# ...
- as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast
- you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. you got new data)
All the complexity of handling database is hidden in `cachew` implementation.
# How it works
- first your objects get [converted](src/cachew/marshall/cachew.py#L29) into a simpler JSON-like representation
- after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).
When the function is called, cachew [computes the hash of your function's arguments ](src/cachew/__init__.py#L580)
and compares it against the previously stored hash value.
- If they match, it would deserialize and yield whatever is stored in the cache database
- If the hash mismatches, the original function is called and new data is stored along with the new hash
# Features
* automatic schema inference: [1](src/cachew/tests/test_cachew.py#L381), [2](src/cachew/tests/test_cachew.py#L395)
* supported types:
* primitive: `str`, `int`, `float`, `bool`, `datetime`, `date`, `Exception`
See [tests.test_types](src/cachew/tests/test_cachew.py#L682), [tests.test_primitive](src/cachew/tests/test_cachew.py#L720), [tests.test_dates](src/cachew/tests/test_cachew.py#L632), [tests.test_exceptions](src/cachew/tests/test_cachew.py#L1124)
* [@dataclass and NamedTuple](src/cachew/tests/test_cachew.py#L597)
* [Optional](src/cachew/tests/test_cachew.py#L524) types
* [Union](src/cachew/tests/test_cachew.py#L827) types
* [nested datatypes](src/cachew/tests/test_cachew.py#L440)
* detects [datatype schema changes](src/cachew/tests/test_cachew.py#L470) and discards old data automatically
# Performance
Updating cache takes certain overhead, but that would depend on how complicated your datatype in the first place, so I'd suggest measuring if you're not sure.
During reading cache all that happens is reading blobs from sqlite/decoding as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.
It would almost certainly make your program faster if your computations take more than several seconds.
You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py).
# Using
See [docstring](src/cachew/__init__.py#L279) for up-to-date documentation on parameters and return types.
You can also use [extensive unit tests](src/cachew/tests/test_cachew.py#L1) as a reference.
Some useful (but optional) arguments of `@cachew` decorator:
* `cache_path` can be a directory, or a callable that [returns a path](src/cachew/tests/test_cachew.py#L417) and depends on function's arguments.
By default, `settings.DEFAULT_CACHEW_DIR` is used.
* `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.
By default it just uses string representation of the arguments, you can also specify a custom callable.
For instance, it can be used to [discard cache](src/cachew/tests/test_cachew.py#L115) if the input file was modified.
* `cls` is the type that would be serialized.
By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.
# Installing
Package is available on [pypi](https://pypi.org/project/cachew/).
pip3 install --user cachew
## Developing
I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI.
# Implementation
* why NamedTuples and dataclasses?
`NamedTuple` and `dataclass` provide a very straightforward and self documenting way to represent data in Python.
Very compact syntax makes it extremely convenient even for one-off means of communicating between couple of functions.
If you want to find out more why you should use more dataclasses in your code I suggest these links:
- [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)
- [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)
* why not `pandas.DataFrame`?
DataFrames are great and can be serialised to csv or pickled.
They are good to have as one of the ways you can interface with your data, however hardly convenient to think about it abstractly due to their dynamic nature.
They also can't be nested.
* why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?
ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat an overkill for such a specific purpose.
* E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires you using custom sqlalchemy specific types and inheriting a base class.
Also it doesn't support nested types.
* why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?
  Pickling is kinda heavyweight for a plain data class, it's slower than just using JSON. Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface.
Marshmallow is a common way to map data into db-friendly format, but it requires explicit schema which is an overhead when you have it already in the form of type annotations. I've looked at existing projects to utilize type annotations, but didn't find them covering all I wanted:
* https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api
* https://pypi.org/project/marshmallow-dataclass
I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).
So far looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite.
* why `sqlite` database for storage?
It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.
There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cache(backend='file')`, or via `settings.DEFAULT_BACKEND`.
It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.
It would also be interesting to experiment with in-RAM storages.
I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.
# Tips and tricks
## Optional dependency
You can benefit from `cachew` even if you don't want to bloat your app's dependencies. Just use the following snippet:
```python
def mcachew(*args, **kwargs):
"""
Stands for 'Maybe cachew'.
Defensive wrapper around @cachew to make it an optional dependency.
"""
try:
import cachew
except ModuleNotFoundError:
import warnings
warnings.warn(
'cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew',
stacklevel=2,
)
return lambda orig_func: orig_func
else:
return cachew.cachew(*args, **kwargs)
```
Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.
## Settings
[cachew.settings](src/cachew/__init__.py#L55) exposes some parameters that allow you to control `cachew` behaviour:
- `ENABLE`: set to `False` if you want to disable caching without removing the decorators (useful for testing and debugging).
You can also use [cachew.extra.disabled_cachew](src/cachew/extra.py#L25) context manager to do it temporarily.
- `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the "user cache directory" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).
- `THROW_ON_ERROR`: by default, cachew is defensive and simply attempts to call the original function on caching issues.
Set to `True` to catch errors earlier.
- `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).
## Updating this readme
This is a literate readme, implemented as a Jupyter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use [generate-readme](generate-readme) script.
================================================
FILE: benchmarks/20230912-comparison-with-legacy.org
================================================
Running on @karlicoss desktop PC, =python3.10=.
This is basically to justify switching to the new serialization method
- old way, =legacy= used to 'flatten' the type into an sqlite row
- new way, =cachew=, just dumps it as a dict, then to bytes via =orjson= and stores in a single sqlite column
The numbers between legacy and cachew can't be directly compared though.
Legacy =serializing= step emits a tuple, which can be inserted directly into the database.
So to compare it with the new way, we need to compare with the sum of =serializing= + =json dump=.
That said this won't be exact comparison either, since legacy binder relied on sqlalchemy to dump custom types to sqlite types (e.g. =datetime= or =Exception=). So legacy will have a slight advantage this way, but it's fine.
So we can see that for:
- =test_union_str_dataclass=
- new implementation: =0.53 + 0.45s= to serialize; =0.29 + 0.48= to deserialize
- old implementation: =2.38s= to serialize; =1.92= to deserialize
- =test_nested_dataclass=
- new implementation: =1.05 + 0.26s= to serialize; =0.50 + 1.42= to deserialize
- old implementation: =1.92s= to serialize; =1.88= to deserialize
For both tests, serialization is quite a bit faster with the new implementation.
On the second test, they are on par for deserialization, but as I mention these numbers are in favor of the legacy implementation.
In addition, keeping everything in one column unlocks some other optimizations which wouldn't be possible with multiple columns.
#+begin_example
$ pytest --pyargs cachew.tests.marshall -k 'gc_off and 1000000 and not cattrs' -s
=========================================================== test session starts ============================================================
platform linux -- Python 3.10.12, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /code/cachew_jsonpickle
configfile: pytest.ini
plugins: anyio-3.6.2
collected 100 items / 95 deselected / 5 selected
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.34s
serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.53s
json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s
sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.08s
sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s
jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.18s
jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.13s
json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s
deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.48s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-legacy]
building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.35s
serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 2.38s
json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.22s
sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.06s
sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s
jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s
jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s
json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.23s
deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.92s
PASSED
src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-cachew]
building 1000000 objects of type .TE2'>: 0.58s
serializing 1000000 objects of type .TE2'>: 1.05s
json dump 1000000 objects of type .TE2'>: 0.26s
sqlite dump 1000000 objects of type .TE2'>: 1.03s
sqlite load 1000000 objects of type .TE2'>: 0.30s
jsonl dump 1000000 objects of type .TE2'>: 0.14s
jsonl load 1000000 objects of type .TE2'>: 0.14s
json load 1000000 objects of type .TE2'>: 0.50s
deserializing 1000000 objects of type .TE2'>: 1.42s
PASSED
src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-legacy]
building 1000000 objects of type .TE2'>: 0.56s
serializing 1000000 objects of type .TE2'>: 1.92s
json dump 1000000 objects of type .TE2'>: 0.21s
sqlite dump 1000000 objects of type .TE2'>: 0.99s
sqlite load 1000000 objects of type .TE2'>: 0.29s
jsonl dump 1000000 objects of type .TE2'>: 0.12s
jsonl load 1000000 objects of type .TE2'>: 0.12s
json load 1000000 objects of type .TE2'>: 0.24s
deserializing 1000000 objects of type .TE2'>: 1.88s
PASSED
#+end_example
================================================
FILE: benchmarks/20230912.org
================================================
Running on @karlicoss desktop PC, =python3.10=
- serializing/deserializing here refers to converting object to json-ish python dictionary (not actual json string!)
- json dump/json load refers to converting the dict above to a json string and back
- sqlite dump/jsonl dump refers to saving/loading these strings to a persistent storage
#+begin_example
$ pytest --pyargs --ignore-glob '*test_cachew*' -k marshall -s
=========================================================== test session starts ============================================================
platform linux -- Python 3.10.6, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3
cachedir: .pytest_cache
configfile: pytest.ini
plugins: anyio-3.6.2
collected 37 items / 8 deselected / 29 selected
src/cachew/marshall/cachew.py::test_serialize_and_deserialize PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
building 1000000 objects of type str | cachew.tests.marshall.Name: 0.60s
serializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.85s
json dump 1000000 objects of type str | cachew.tests.marshall.Name: 0.46s
sqlite dump 1000000 objects of type str | cachew.tests.marshall.Name: 1.11s
sqlite load 1000000 objects of type str | cachew.tests.marshall.Name: 0.31s
jsonl dump 1000000 objects of type str | cachew.tests.marshall.Name: 0.13s
jsonl load 1000000 objects of type str | cachew.tests.marshall.Name: 0.13s
json load 1000000 objects of type str | cachew.tests.marshall.Name: 1.04s
deserializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.86s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...)
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
building 5000000 objects of type str | cachew.tests.marshall.Name: 3.00s
serializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.38s
json dump 5000000 objects of type str | cachew.tests.marshall.Name: 2.14s
sqlite dump 5000000 objects of type str | cachew.tests.marshall.Name: 5.43s
sqlite load 5000000 objects of type str | cachew.tests.marshall.Name: 1.47s
jsonl dump 5000000 objects of type str | cachew.tests.marshall.Name: 0.62s
jsonl load 5000000 objects of type str | cachew.tests.marshall.Name: 0.64s
json load 5000000 objects of type str | cachew.tests.marshall.Name: 4.74s
deserializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.06s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...)
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...)
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
building 5000000 objects of type str | cachew.tests.marshall.Name: 1.77s
serializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.59s
json dump 5000000 objects of type str | cachew.tests.marshall.Name: 1.22s
sqlite dump 5000000 objects of type str | cachew.tests.marshall.Name: 5.28s
sqlite load 5000000 objects of type str | cachew.tests.marshall.Name: 1.58s
jsonl dump 5000000 objects of type str | cachew.tests.marshall.Name: 0.64s
jsonl load 5000000 objects of type str | cachew.tests.marshall.Name: 0.66s
json load 5000000 objects of type str | cachew.tests.marshall.Name: 1.53s
deserializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.60s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...)
src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
building 1000000 objects of type : 1.05s
serializing 1000000 objects of type : 1.28s
json dump 1000000 objects of type : 0.22s
sqlite dump 1000000 objects of type : 1.14s
sqlite load 1000000 objects of type : 0.30s
jsonl dump 1000000 objects of type : 0.14s
jsonl load 1000000 objects of type : 0.14s
json load 1000000 objects of type : 0.70s
deserializing 1000000 objects of type : 2.20s
PASSED
src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
building 5000000 objects of type : 5.08s
serializing 5000000 objects of type : 6.35s
json dump 5000000 objects of type : 1.13s
sqlite dump 5000000 objects of type : 5.58s
sqlite load 5000000 objects of type : 1.47s
jsonl dump 5000000 objects of type : 0.69s
jsonl load 5000000 objects of type : 0.70s
json load 5000000 objects of type : 6.85s
deserializing 5000000 objects of type : 11.10s
PASSED
src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
building 1000000 objects of type : 1.37s
serializing 1000000 objects of type : 1.25s
json dump 1000000 objects of type : 0.24s
sqlite dump 1000000 objects of type : 1.12s
sqlite load 1000000 objects of type : 0.29s
jsonl dump 1000000 objects of type : 0.14s
jsonl load 1000000 objects of type : 0.14s
json load 1000000 objects of type : 0.24s
deserializing 1000000 objects of type : 2.17s
PASSED
src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
building 5000000 objects of type : 5.10s
serializing 5000000 objects of type : 6.22s
json dump 5000000 objects of type : 1.17s
sqlite dump 5000000 objects of type : 5.43s
sqlite load 5000000 objects of type : 1.54s
jsonl dump 5000000 objects of type : 0.70s
jsonl load 5000000 objects of type : 0.71s
json load 5000000 objects of type : 1.22s
deserializing 5000000 objects of type : 10.97s
PASSED
src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
building 1000000 objects of type .TE2'>: 1.64s
serializing 1000000 objects of type .TE2'>: 1.43s
json dump 1000000 objects of type .TE2'>: 0.30s
sqlite dump 1000000 objects of type .TE2'>: 1.16s
sqlite load 1000000 objects of type .TE2'>: 0.30s
jsonl dump 1000000 objects of type .TE2'>: 0.15s
jsonl load 1000000 objects of type .TE2'>: 0.15s
json load 1000000 objects of type .TE2'>: 1.02s
deserializing 1000000 objects of type .TE2'>: 2.78s
PASSED
src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
building 1000000 objects of type .TE2'>: 1.88s
serializing 1000000 objects of type .TE2'>: 0.80s
json dump 1000000 objects of type .TE2'>: 0.31s
sqlite dump 1000000 objects of type .TE2'>: 1.39s
sqlite load 1000000 objects of type .TE2'>: 0.31s
jsonl dump 1000000 objects of type .TE2'>: 0.15s
jsonl load 1000000 objects of type .TE2'>: 0.15s
json load 1000000 objects of type .TE2'>: 1.03s
deserializing 1000000 objects of type .TE2'>: 2.61s
PASSED
src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
building 1000000 objects of type .TE2'>: 0.57s
serializing 1000000 objects of type .TE2'>: 1.08s
json dump 1000000 objects of type .TE2'>: 0.29s
sqlite dump 1000000 objects of type .TE2'>: 1.09s
sqlite load 1000000 objects of type .TE2'>: 0.30s
jsonl dump 1000000 objects of type .TE2'>: 0.15s
jsonl load 1000000 objects of type .TE2'>: 0.15s
json load 1000000 objects of type .TE2'>: 0.50s
deserializing 1000000 objects of type .TE2'>: 1.43s
PASSED
src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
building 1000000 objects of type .TE2'>: 0.57s
serializing 1000000 objects of type .TE2'>: 0.39s
json dump 1000000 objects of type .TE2'>: 0.29s
sqlite dump 1000000 objects of type .TE2'>: 1.16s
sqlite load 1000000 objects of type .TE2'>: 0.32s
jsonl dump 1000000 objects of type .TE2'>: 0.16s
jsonl load 1000000 objects of type .TE2'>: 0.15s
json load 1000000 objects of type .TE2'>: 0.50s
deserializing 1000000 objects of type .TE2'>: 1.29s
PASSED
============================================================ slowest durations =============================================================
44.87s call src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
38.76s call src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
28.65s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
20.05s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
9.82s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
9.51s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
8.37s call src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
8.20s call src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
6.45s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
5.93s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
5.78s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
3.98s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
0.01s call src/cachew/marshall/cachew.py::test_serialize_and_deserialize
(68 durations < 0.005s hidden. Use -vv to show these durations.)
========================================================= short test summary info ==========================================================
SKIPPED [6] src/cachew/tests/marshall.py:171: TODO need to adjust the handling of Union types..
SKIPPED [4] src/cachew/tests/marshall.py:194: TODO support datetime with pytz for cattrs
PASSED src/cachew/marshall/cachew.py::test_serialize_and_deserialize
PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
#+end_example
================================================
FILE: benchmarks/20230917.org
================================================
Running on @karlicoss desktop PC, =python3.10=
Just a comparison of =sqlite= and =file= backends.
#+begin_example
$ pytest --pyargs -k 'test_many and gc_off and 3000000' -s
src/cachew/tests/test_cachew.py::test_many[sqlite-gc_off-3000000] [INFO 2023-09-17 02:02:09,946 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many..iter_data: wrote 3000000 objects to cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many)
test_many: initial write to cache took 13.6s
test_many: cache size is 229.220352Mb
[INFO 2023-09-17 02:02:10,780 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many..iter_data: loading 3000000 objects from cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many)
test_many: reading from cache took 7.0s
PASSED
src/cachew/tests/test_cachew.py::test_many[file-gc_off-3000000] [INFO 2023-09-17 02:02:23,944 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many..iter_data: wrote 3000000 objects to cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many)
test_many: initial write to cache took 6.1s
test_many: cache size is 202.555667Mb
[INFO 2023-09-17 02:02:23,945 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many..iter_data: loading objects from cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many)
test_many: reading from cache took 5.4s
#+end_example
================================================
FILE: doc/cachew_disable.md
================================================
Can put this in the README.md once its been tested a bit
### Disable through Environment Variables
To disable a `cachew` function in some module, you can use the `CACHEW_DISABLE` environment variable. This is a colon-delimited (like a `$PATH`) list of modules to disable. It disables modules given some name recursively, and supports [unix-style globs](https://docs.python.org/3/library/fnmatch.html)
For example, say you were using [HPI](https://github.com/karlicoss/HPI) which internally uses a snippet like `mcachew` above. You may want to enable `cachew` for _most_ modules, but disable them for specific ones. For example take:
```
my/browser
├── active_browser.py
├── all.py
├── common.py
└── export.py
my/reddit
├── __init__.py
├── all.py
├── common.py
├── pushshift.py
└── rexport.py
```
To disable `cachew` in all of these files: `export CACHEW_DISABLE=my.browser:my.reddit` (disables for all submodules)
To disable just for a particular module: `export CACHEW_DISABLE='my.browser.export'`
Similarly to `$PATH` manipulations, you can do this in your shell configuration incrementally:
```
CACHEW_DISABLE='my.reddit.rexport'
if some condition...; then
CACHEW_DISABLE="my.browser.export:$CACHEW_DISABLE"
fi
export CACHEW_DISABLE
```
You can also use globs, e.g. `CACHEW_DISABLE='my.*.gdpr'`
To disable `cachew` everywhere, you could set `export CACHEW_DISABLE='*'`
================================================
FILE: doc/serialization.org
================================================
Cachew works kinda like =functools.lru_cache=, but it also works in-between program runs.
For that, it needs to somehow persist the objects on the disk (unlike =lru_cache= which just keeps references to the objects already in process memory).
While persisting objects to the cache, essentially cachew needs to map them into simpler types, i.e. ones you can keep in a database like strings/ints/binary blobs.
At the moment (as of =v0.13.0=), we use sqlite as the cache store, with =sqlalchemy= as the interface to interact with it.
The way cachew works now is, to save the object in cache:
- first it's "flattened out" to conform to the database row model, so individual fields (including recursive fields) become database columns
- python types are mapped into sqlalchemy types, with extra =sqlalchemy.TypeDecorator= instances to support custom types like =datetime= or =Exception=
You can find a more detailed example [[https://github.com/karlicoss/cachew/blob/175afade0a417bfd533ced174365d246b8a7dabc/src/cachew/__init__.py#L319-L353][here]].
A big problem is that in general it's not really possible to serialize, and especially to deserialize back an arbitrary object in Python, unless you resort to binary serialization like =pickle= (which is very slow and comes with its own hosts of issues).
However in cachew we require the user to supply the *type signature* for the functions that are cached, so we can benefit from it for serializing and deserializing.
Few years ago, when I implemented =cachew= at first, there weren't really many options for serialization driven by type signatures, so I implemented the custom code I mentioned above to support that. In 2023, however, more and more libraries are benefiting from type signatures, in particular for serializing stuff.
So I decided to give it another go, in hope of using some mature library, simplifying cachew's code, and possibly getting a performance boost.
It's possible that I missed some documentation so if you think the problems I am describing can actually be worked around, please don't hesitate to let me know.
* Comparison
In cachew the very minimum we're aiming to support are:
- all json-ish types, e.g. =int=/=str=/=dict=/=list= etc
- =dataclass= and =NamedTuple=
- =Optional= and =Union=
- custom types, e.g. =datetime=, =Exception= (e.g. at least preserve exception message)
See [[file:test_serialization.py]] for more specific examples and supporting evidence for my summary here.
** [[https://docs.python.org/3.10/library/pickle.html][pickle]]
Builtin pickle module can handle any objects, without even needing type annotations.
However, it's [[https://www.benfrederickson.com/dont-pickle-your-data/][famously very slow]], so I even didn't consider using it.
It's also not secure in general, although in our case we control the objects we save/load from cache, so it's not a big issue.
** [[https://github.com/jsonpickle/jsonpickle#readme][jsonpickle]]
Jsonpickle -- similar to pickle, can handle any types.
I [[https://github.com/karlicoss/cachew/commit/048df33e65560205d63845f022b027a27719ff48][gave it a go]] just in case, and it's an order of magnitude slower than custom serialization code I already had, which is a no-go.
** [[https://github.com/lidatong/dataclasses-json/#readme][dataclasses-json]]
# TODO link to code
- CON: requires annotating all dataclasses involved with =@dataclass_json=, recursively.
This is a blocker from using it in =cachew=.
- CON: requires the type to be a =@dataclass= to annotate
So if you have something simpler you'll have to wrap it into a dummy dataclass or something.
- PRO: supports =Union= correctly
** [[https://github.com/marshmallow-code/marshmallow][marshmallow]]
By default marshmallow doesn't support dataclasses or unions, but there are some extra packages
- for dataclasses https://github.com/lovasoa/marshmallow_dataclass
- PRO: doesn't require modifying the original class, handles recursion out of the box
- CON: doesn't handle =Union= correctly
This is a blocker for cachew.
In addition it has a custom implementation of Union handling (rather than e.g. relying on =python-marshmallow-union=).
- https://github.com/adamboche/python-marshmallow-union
I didn't even get to try it since if dataclasses don't work marshmallow is a no-go for me.
Plus for some reason =marshmallow_dataclass= has a custom Union handling implementation which is different from this one, so it's going to be a huge mess.
** [[https://github.com/pydantic/pydantic#readme][pydantic]]
- PRO: if you use =TypeAdapter=, you can serialize/deserialize arbitrary types without decorating/inheriting from =BaseModel=
- CON: doesn't handle =Union= correctly
Again, this is a bit blocker. I've created an issue on pydantic bug tracker here: https://github.com/pydantic/pydantic/issues/7391
Kind of sad, because otherwise pydantic seemed promising!
** [[https://github.com/python-attrs/cattrs#features][cattrs]]
- PRO: doesn't require modifying the classes you serialise
- PRO: rich feature set, clearly aiming to comply with standard python's typing annotations
- CON: there is an issue with handling =NamedTuple=
It isn't converted to a dictionary like =dataclass= does, [[https://github.com/python-attrs/cattrs/issues/425][likely a bug]]?
- =Union= types are supported, but require some extra configuration
Unions work, but you have to 'register' them first.
A bit annoying that this is necessary even for simple unions like =int | str=, although [[https://github.com/python-attrs/cattrs/issues/423][possible]] to workaround.
The plus side is that cattr has a builtin utility for Union type discrimination.
I guess for my application I could traverse the type and register all necessary Unions with =cattrs=?
# TODO create an issue to support opting in everywhere by default?
Since the above seems quite good, I did a quick cachew hack on [[https://github.com/karlicoss/cachew/tree/cattrs][cattrs branch]] to try and use it.
The pipeline is the following:
- serialize type to a dictionary with primitive types via =cattrs=
- serialize dictionary to a byte string via =orjson=
- persist the byte string as an sqlite database row
(for deserializing we just do the same in reverse)
You can find the results [[https://github.com/karlicoss/cachew/commit/82691b10cd1d4ced4862dff21cf038fb83f9525c][here]] -- cattrs proved to be quite a huge speedup over my custom serialization code!
It needs a bit more work and evaluation for use in =cachew=, however it's super promising!
# TODO https://catt.rs/en/stable/preconf.html#orjson
Some interesting reading about cattrs:
- https://threeofwands.com/why-cattrs-is-so-fast/#v2-the-genconverter
- https://threeofwands.com/why-i-use-attrs-instead-of-pydantic
* Verdict
The biggest shared issues are that most of these libraries:
- require modifying the original class definitions, either by inheriting or decorating
- don't handle =Union= at all or don't handle it correctly (usually relying on the structural equivalence rather than actual types)
So for most of them, I didn't even get to trying to support custom types and measuring performance with =cachew=.
Of all of them only =cattrs= stood out: it takes builtin python typing and performance very seriously, and is very configurable.
So if you need no bullshit serialization in python, I can definitely recommend it.
I might switch to it in [[https://github.com/karlicoss/promnesia][promnesia]] (where we have full control over the type we serialize in the database), and could potentially be used in HPI for [[https://github.com/karlicoss/HPI/blob/master/my/core/serialize.py][my.core.serialize]].
================================================
FILE: doc/test_serialization.py
================================================
#!/usr/bin/env python3
from dataclasses import dataclass
from typing import NamedTuple, Union
def test_dataclasses_json():
    # Reproduction script: documents dataclasses-json quirks found during cachew's
    # serialization library survey (see doc/serialization.org).
    # pip install dataclasses-json
    from dataclasses_json import dataclass_json

    @dataclass
    class Inner:
        value: int

    @dataclass
    class Outer:
        inner: Inner

    ### issue 1: requires @dataclass_json annotation on all involved dataclasses
    obj = Outer(inner=Inner(value=123))  # noqa: F841
    # we don't control the types that are passed to us, so we can't use the @dataclass_json
    # but we can just call the decorator directly
    # HOWEVER: this modifies the original class, Outer!!
    OuterJson = dataclass_json(Outer)  # noqa: F841
    # it adds 'from_dict', 'from_json', 'schema', 'to_dict', 'to_json' attributes to it
    # now if you try
    # print(OuterJson.schema().dump(obj))
    # you get a warning that it wants you to add annotations to Inner classes too.
    # this isn't really an option for us.
    ###

    ### issue 2: can't dump anything unless the top level type is a dataclass?
    ### could wrap into a dummy dataclass or something, but is wasteful in terms of performance
    ###

    ### nice thing: correctly serializes Union types, even if they share the same attributes
    @dataclass_json
    @dataclass
    class City:
        name: str

    @dataclass_json
    @dataclass
    class Country:
        name: str

    @dataclass_json
    @dataclass
    class WithUnion:
        union: Union[City, Country]  # noqa: UP007

    objs = [
        WithUnion(union=City(name='London')),
        WithUnion(union=Country(name='UK')),
    ]
    schema = WithUnion.schema()
    json = schema.dumps(objs, many=True)
    objs2 = schema.loads(json, many=True)
    print("objects ", objs)
    print("json ", json)
    # NOTE: it dumps [{"union": {"name": "London", "__type": "City"}}, {"union": {"name": "UK", "__type": "Country"}}]
    # so types are correctly distinguished
    print("restored ", objs2)
    assert objs == objs2, (objs, objs2)
    ###
def test_marshmallow_dataclass():
    # Reproduction script: documents marshmallow-dataclass quirks found during
    # cachew's serialization library survey (see doc/serialization.org).
    # pip3 install --user marshmallow-dataclass[union]
    import marshmallow_dataclass

    ### issue 1: the top level type has to be a dataclass?
    ### although possible that we could use regular marshmallow for that instead
    ###

    ### issue 2: doesn't handle unions correctly
    @dataclass
    class City:
        name: str

    @dataclass
    class Country:
        name: str

    @dataclass
    class WithUnion:
        union: Union[City, Country]  # noqa: UP007

    objs = [
        WithUnion(union=City(name="London")),
        WithUnion(union=Country(name="UK")),
    ]
    # NOTE: good, doesn't require adding annotations on the original classes
    schema = marshmallow_dataclass.class_schema(WithUnion)()
    json = schema.dumps(objs, many=True)
    objs2 = schema.loads(json, many=True)
    print("objects ", objs)
    print("json ", json)
    # NOTE: it dumps [{"union": {"value": 123}}, {"union": {"value": 123}}]
    # so it doesn't distinguish based on types => won't deserialize correctly
    print("restored ", objs2)
    # assert objs == objs2, (objs, objs2)
    # ^ this assert fails!
    ###
def test_pydantic():
    # Reproduction script: documents pydantic TypeAdapter quirks found during
    # cachew's serialization library survey (see doc/serialization.org).
    from pydantic import TypeAdapter

    ### issue: doesn't handle Unions correctly
    @dataclass
    class City:
        name: str

    @dataclass
    class Country:
        name: str

    @dataclass
    class WithUnion:
        union: Union[City, Country]  # noqa: UP007

    objs = [
        WithUnion(union=City(name="London")),
        WithUnion(union=Country(name="UK")),
    ]
    # NOTE: nice, doesn't require annotating the original classes with anything
    Schema = TypeAdapter(list[WithUnion])
    json = Schema.dump_python(
        objs,
        # round_trip: Whether to output the serialized data in a way that is compatible with deserialization
        # not sure, doesn't seem to impact anything..
        round_trip=True,
    )
    objs2 = Schema.validate_python(json)
    print("objects ", objs)
    print("json ", json)
    print("restored ", objs2)
    # assert objs == objs2, (objs, objs2)
    # ^ this assert fails!
    # created an issue https://github.com/pydantic/pydantic/issues/7391
    ###
def test_cattrs():
    # Reproduction script: documents cattrs behaviour found during cachew's
    # serialization library survey (see doc/serialization.org).
    from cattrs import Converter
    from cattrs.strategies import configure_tagged_union

    converter = Converter()

    ### issue: NamedTuples aren't unstructured? asked here https://github.com/python-attrs/cattrs/issues/425
    class X(NamedTuple):
        value: int

    d = converter.unstructure(X(value=123), X)  # noqa: F841
    # NOTE: this assert doesn't pass!
    # assert isinstance(d, dict)
    ###

    ### good: handles Union correctly (although some extra configuring required)
    @dataclass
    class City:
        name: str

    @dataclass
    class Country:
        name: str

    @dataclass
    class WithUnion:
        union: Union[City, Country]  # noqa: UP007

    objs = [
        WithUnion(union=City(name="London")),
        WithUnion(union=Country(name="UK")),
    ]
    configure_tagged_union(
        union=City | Country,
        converter=converter,
    )
    # NOTE: nice -- doesn't require decorating original classes
    json = converter.unstructure(objs, list[WithUnion])
    assert isinstance(json, list)
    objs2 = converter.structure(json, list[WithUnion])
    print("objects ", objs)
    # NOTE: dumps it as [{'union': {'name': 'London', '_type': 'City'}}, {'union': {'name': 'UK', '_type': 'Country'}}]
    print("json ", json)
    print("restored ", objs2)
    assert objs == objs2, (objs, objs2)
    ###

    ### issue: unions of simple types aren't supported?
    # see https://github.com/python-attrs/cattrs/issues/423
    mixed: list[int | str] = [
        123,
        'Jakarta',
    ]
    json = converter.unstructure(mixed, list[int | str])
    # NOTE: this fails
    # mixed2 = converter.structure(json , list[int | str])
    ###
# run all the experiments when executed as a script
test_dataclasses_json()
test_marshmallow_dataclass()
test_pydantic()
test_cattrs()
================================================
FILE: generate-readme
================================================
#!/bin/bash
# Regenerate README.md from README.ipynb (executes the notebook, exports to markdown).
set -eu
cd "$(dirname "$0")"

# --no-input seems to work well
# but if need more targeted approach, apparently can mark certain cells with tag and use '--TagRemovePreprocessor.remove_cell_tags={"noexport"}' ?
exec uvx --with jupyter --from jupyter-core jupyter nbconvert --execute --to markdown --no-input README.ipynb

# TODO run it on CI to make sure it renders and up to date?
================================================
FILE: github-issues.org
================================================
#+todo: OPEN | CLOSED
* Issues of cachew
:PROPERTIES:
:since:
:url: https://api.github.com/repos/karlicoss/cachew
:END:
** OPEN keep hash along each cached entity instead of separate table?
:PROPERTIES:
:tags: ("prio-B")
:id: 15
:date-modification: 2020-01-08T22:26:04+0000
:date-creation: 2020-01-08T22:26:04+0000
:author: "karlicoss"
:END:
: At the moment there are two separate tables: one for latest hash value, another for cached entities.
: It might be simpler and safer to keep a single table, with hash along with each cached entity.
:
** OPEN support multiple cached values?
:PROPERTIES:
:tags: ("prio-B")
:id: 14
:date-modification: 2020-01-08T22:26:03+0000
:date-creation: 2020-01-08T22:26:02+0000
:author: "karlicoss"
:END:
: At the moment it's LRU(1) cache, in some use cases it makes sense to cache more values though
:
** OPEN support pathlib.Path
:PROPERTIES:
:tags: ("prio-C")
:id: 13
:date-modification: 2020-01-08T22:26:02+0000
:date-creation: 2020-01-08T22:26:01+0000
:author: "karlicoss"
:END:
: Path is a trivial wrapper around str. I guess generally think of a good way to allow adhoc mapping of simple types.
: Perhaps current Exception makes sense.
:
** OPEN support defensive behaviour
:PROPERTIES:
:tags: ("prio-C")
:id: 12
:date-modification: 2020-01-08T22:26:01+0000
:date-creation: 2020-01-08T22:26:00+0000
:author: "karlicoss"
:END:
: E.g. if we can't serialize for some reason, bail the database but at least yield values anyway
:
** OPEN Add Redis support
:PROPERTIES:
:id: 9
:date-modification: 2020-01-06T00:48:59+0000
:date-creation: 2020-01-06T00:48:59+0000
:author: "softinio"
:END:
: Add Redis support as an alternative to sqlite
:
: This would be a great feature as it will make this solution easier to use in an enterprise production environment as getting a redis instance shared amongst multiple instances of your app is very easy and cost effective to use.
:
** OPEN better pytz support?
:PROPERTIES:
:tags: ("prio-C")
:id: 6
:date-modification: 2020-01-05T13:34:51+0000
:date-creation: 2020-01-05T13:33:25+0000
:author: "karlicoss"
:END:
** CLOSED Optional feature: Exception support
:PROPERTIES:
:id: 11
:date-modification: 2020-01-08T21:56:56+0000
:date-creation: 2020-01-08T21:34:03+0000
:author: "karlicoss"
:END:
** CLOSED Add doc on defensive/optional usage
:PROPERTIES:
:id: 10
:date-modification: 2020-01-06T23:48:54+0000
:date-creation: 2020-01-06T23:47:39+0000
:author: "karlicoss"
:END:
** CLOSED Safer concurrent writes handling
:PROPERTIES:
:id: 8
:date-modification: 2020-01-05T22:32:13+0000
:date-creation: 2020-01-05T22:08:24+0000
:author: "karlicoss"
:END:
** CLOSED Update readme
:PROPERTIES:
:id: 7
:date-modification: 2020-01-05T15:29:37+0000
:date-creation: 2020-01-05T15:24:38+0000
:author: "karlicoss"
:END:
** CLOSED support for dataclasses
:PROPERTIES:
:id: 1
:date-modification: 2020-01-05T13:34:50+0000
:date-creation: 2019-07-30T21:45:30+0100
:author: "karlicoss"
:END:
** CLOSED Fix Json support for python3.6
:PROPERTIES:
:id: 2
:date-modification: 2020-01-05T13:33:28+0000
:date-creation: 2019-12-08T12:21:58+0000
:author: "karlicoss"
:END:
** CLOSED Fix bug when default argument is explicitly specified
:PROPERTIES:
:id: 3
:date-modification: 2020-01-05T13:33:27+0000
:date-creation: 2019-12-08T17:56:51+0000
:author: "karlicoss"
:END:
** CLOSED Union types
:PROPERTIES:
:id: 4
:date-modification: 2020-01-05T13:33:27+0000
:date-creation: 2019-12-19T23:32:55+0000
:author: "karlicoss"
:END:
** CLOSED support top level primitive types
:PROPERTIES:
:id: 5
:date-modification: 2020-01-05T13:33:26+0000
:date-creation: 2019-12-20T00:09:00+0000
:author: "karlicoss"
:END:
================================================
FILE: misc/profile.py
================================================
#!/usr/bin/env python3
import sqlite3
from collections.abc import Iterator
from pathlib import Path
import sqlalchemy
from codetiming import Timer
from more_itertools import ilen
from cachew import cachew
# todo not sure it really helps much?
import gc  # isort: skip

# disable garbage collection for the whole run to reduce timing noise in the benchmarks
gc.disable()
def timer(name: str) -> Timer:
    """Build a context-manager timer that prints '<name>: <elapsed>s' on exit."""
    template = name + ': ' + '{:.2f}s'
    return Timer(name=name, text=template)
def test_ints() -> None:
    # Benchmark: cachew over a stream of ints, compared against reading the same
    # sqlite cache file directly (raw sqlite3 driver and sqlalchemy Core).
    N = 5_000_000

    base = Path('/tmp/cachew_profiling/')
    # shutil.rmtree(base)
    base.mkdir(exist_ok=True, parents=True)
    cache_path = base / 'ints'

    def fun_nocachew(n) -> Iterator[int]:
        yield from range(n)

    @cachew(cache_path=cache_path, force_file=True)
    def fun(n) -> Iterator[int]:
        yield from range(n)

    # with timer('no caching'):
    #     ilen(fun_nocachew(N))
    # with timer('initial call'):
    #     ilen(fun(N))
    # NOTE(review): assumes the cache was populated by a previous run (the
    # 'initial call' block above is commented out) -- confirm before profiling
    assert cache_path.exists()  # just in case

    # baseline 1: iterate all rows via the raw sqlite3 driver
    with timer('reading directly via sqlite'):
        total = 0
        with sqlite3.connect(cache_path) as conn:
            for (_x,) in conn.execute('SELECT * FROM cache'):
                total += 1
        assert total == N  # just in case

    # baseline 2: same rows, but through sqlalchemy Core
    with timer('reading directly via sqlalchemy'):
        total = 0
        engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}')

        from sqlalchemy import Column, MetaData, Table

        meta = MetaData()
        table_cache = Table('cache', meta, Column('_cachew_primitive', sqlalchemy.Integer))
        with engine.connect() as conn:
            with timer('sqlalchemy querying'):
                rows = conn.execute(table_cache.select())
                for (_x,) in rows:
                    total += 1
        engine.dispose()
        assert total == N  # just in case

    cache_size_mb = cache_path.stat().st_size / 10**6
    print(f'cache size: {cache_size_mb:.1f} Mb')

    # the measurement we actually care about: cachew reading from a warm cache
    with timer('subsequent call'):
        ilen(fun(N))


test_ints()
================================================
FILE: misc/test_redis/docker-compose.yml
================================================
services:
redis:
image: "redis:alpine"
# restart: always
command:
- "sh"
- "-euc"
- |
exec redis-server
# - |
# echo "requirepass '$$REDIS_PASSWORD'" > /etc/redis.conf
# exec redis-server /etc/redis.conf
# environment:
# REDIS_PASSWORD: "password"
ports:
- 6379:6379
volumes:
- "redis-cachew:/data:rw"
volumes:
redis-cachew:
================================================
FILE: misc/test_redis/test.py
================================================
#!/usr/bin/env python3
from time import time
import redis # ty: ignore[unresolved-import]
from loguru import logger # ty: ignore[unresolved-import]
from more_itertools import ilen
# connect to a local redis instance (see docker-compose.yml in this directory)
r = redis.Redis(host='localhost', port=6379, db=0)

# number of objects to write/read in the benchmark
N = 1_000_000
def items():
    """Yield the benchmark payload: N integers rendered as strings."""
    for i in range(N):
        yield str(i)
# redis key of the list tracking everything written by the benchmark
TAG = 'keys'
def reset():
    # drop the tracking list so each benchmark run starts from scratch
    r.delete(TAG)
def write():
    """Store each item as its own hash entry, recording the keys in a list."""
    for idx, payload in enumerate(items()):
        entry_key = f'obj:{idx}'
        r.hset(entry_key, 'data', payload)
        r.lpush(TAG, entry_key)
def read():
    """Fetch every tracked entry back one by one and count them."""
    tracked = r.lrange(TAG, 0, -1)
    fetched = (r.hget(k, 'data') for k in tracked)
    print('total', ilen(fetched))
# TODO could use lmove for atomic operations?
def write2():
    """Simpler variant: push the raw payloads straight onto a single redis list."""
    for payload in items():
        r.lpush(TAG, payload)
def read2():
    """Fetch the entire list with one lrange round-trip and count it."""
    everything = r.lrange(TAG, 0, -1)
    print('total', ilen(everything))
# benchmark driver: time the write phase and the read phase separately
reset()

a = time()
write2()
b = time()
logger.info(f'writing took {b - a:.1f}s')

a = time()
read2()
b = time()
logger.info(f'reading took {b - a:.1f}s')
# with read()/write()
# 100000 strings:
# 2023-09-09 01:50:23.498 | INFO | __main__::37 - writing took 13.1s
# 2023-09-09 01:50:30.052 | INFO | __main__::42 - reading took 6.6s
# hmm kinda slow..
# with read2/write2, writing about 7secs, and reading is instantaneous??
# for 1M objects, writing took 60 secs, and reading 0.2s?
# lol could be promising...
# I guess it's not iterative, but could retrieve items in batches?
================================================
FILE: mypy.ini
================================================
[mypy]
pretty = True
show_error_context = True
show_column_numbers = True
show_error_end = True
check_untyped_defs = True
# see https://mypy.readthedocs.io/en/stable/error_code_list2.html
warn_redundant_casts = True
strict_equality = True
warn_unused_ignores = True
enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable
# an example of suppressing
# [mypy-my.config.repos.pdfannots.pdfannots]
# ignore_errors = True
================================================
FILE: pyproject.toml
================================================
# see https://github.com/karlicoss/pymplate for up-to-date reference
[project]
dynamic = ["version"] # version is managed by build backend
name = "cachew"
dependencies = [
"platformdirs", # default cache dir
"sqlalchemy>=1.0", # cache DB interaction
"orjson", # fast json serialization
"typing-extensions",# for depreceated decorator
]
requires-python = ">=3.12"
## these need to be set if you're planning to upload to pypi
# description = "TODO"
license = {file = "LICENSE.txt"}
authors = [
{name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
]
maintainers = [
{name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
]
# keywords = []
# # see: http://pypi.python.org/pypi?%3Aaction=list_classifiers
# classifiers = [
# ]
[project.urls]
Homepage = "https://github.com/karlicoss/cachew"
##
[project.optional-dependencies]
optional = [
"colorlog",
]
[dependency-groups]
# TODO: not sure, on the one hand could just use 'standard' dev dependency group
# On the other hand, it's a bit annoying that it's always included by default?
# To make sure it's not included, need to use `uv run --exact --no-default-groups ...`
testing = [
"pytest>=9", # need version 9 for proper namespace package support
"ruff",
"pytz",
"more-itertools",
"patchy", # for injecting sleeps and testing concurrent behaviour
"enlighten", # used in logging helper, but not really required
"cattrs", # benchmarking alternative marshalling implementation
"pyinstrument", # for profiling from within tests
"codetiming", # Timer context manager
]
typecheck = [
{ include-group = "testing" },
"mypy",
"lxml", # for mypy html coverage
"ty>=0.0.3",
"types-pytz", # optional runtime only dependency
"cachew[optional]",
]
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"
# unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894
[tool.hatch.build.targets.wheel]
packages = ["src/cachew"]
[tool.hatch.version]
source = "vcs"
[tool.hatch.version.raw-options]
version_scheme = "python-simplified-semver"
local_scheme = "dirty-tag"
================================================
FILE: pytest.ini
================================================
[pytest]
# discover files that don't follow test_ naming. Useful to keep tests along with the source code
python_files = *.py
# this is necessary for --pyargs to discover implicit namespace packages correctly
consider_namespace_packages = true
# see https://docs.pytest.org/en/stable/reference/reference.html#confval-strict
# disable for now -- some macos tests ('file backend') are flaky
# strict = true
addopts =
# prevent pytest cache from being created... it craps into project dir and I never use it anyway
-p no:cacheprovider
# -rap to print tests summary even when they are successful
-rap
--verbose
# otherwise it won't discover doctests
--doctest-modules
# show all test durations (unless they are too short)
--durations=0
================================================
FILE: ruff.toml
================================================
line-length = 120 # impacts import sorting
lint.extend-select = [
"ALL",
]
lint.ignore = [
"D", # annoying nags about docstrings
"N", # pep naming
"TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING
"S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks
"DTZ", # datetimes checks -- complaining about missing tz and mostly false positives
"FIX", # complains about fixmes/todos -- annoying
"TD", # complains about todo formatting -- too annoying
"ANN", # missing type annotations? seems way to strict though
"EM" , # suggests assigning all exception messages into a variable first... pretty annoying
### too opinionated style checks
"E501", # too long lines
"E731", # assigning lambda instead of using def
"E741", # Ambiguous variable name: `l`
"E742", # Ambiguous class name: `O
"E401", # Multiple imports on one line
"F403", # import *` used; unable to detect undefined names
###
###
"E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing..
"F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew)
## might be nice .. but later and I don't wanna make it strict
"E402", # Module level import not at top of file
### these are just nitpicky, we usually know better
"PLR0911", # too many return statements
"PLR0912", # too many branches
"PLR0913", # too many function arguments
"PLR0915", # too many statements
"PLR1714", # consider merging multiple comparisons
"PLR2044", # line with empty comment
"PLR5501", # use elif instead of else if
"PLR2004", # magic value in comparison -- super annoying in tests
###
"PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check
"B009", # calling gettattr with constant attribute -- this is useful to convince mypy
"B010", # same as above, but setattr
"B017", # pytest.raises(Exception)
"B023", # seems to result in false positives?
# complains about useless pass, but has sort of a false positive if the function has a docstring?
# this is common for click entrypoints (e.g. in __main__), so disable
"PIE790",
# a bit too annoying, offers to convert for loops to list comprehension
# , which may hurt readability
"PERF401",
# suggests no using exception in for loops
# we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost"
"PERF203",
"RET504", # unnecessary assignment before returning -- that can be useful for readability
"RET505", # unnecessary else after return -- can hurt readability
"PLW0603", # global variable update.. we usually know why we are doing this
"PLW2901", # for loop variable overwritten, usually this is intentional
"PT011", # pytest raises is too broad
"COM812", # trailing comma missing -- mostly just being annoying with long multiline strings
"TRY003", # suggests defining exception messages in exception class -- kinda annoying
"TRY201", # raise without specifying exception name -- sometimes hurts readability
"TRY400", # a bit dumb, and results in false positives (see https://github.com/astral-sh/ruff/issues/18070)
"TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging
"TID252", # Prefer absolute imports over relative imports from parent modules
## too annoying
"T20", # just complains about prints and pprints (TODO maybe consider later?)
"Q", # flake quotes, too annoying
"C90", # some complexity checking
"G004", # logging statement uses f string
"ERA001", # commented out code
"SLF001", # private member accessed
"BLE001", # do not catch 'blind' Exception
"INP001", # complains about implicit namespace packages
"SIM102", # if statements collapsing, often hurts readability
"SIM103", # multiple conditions collapsing, often hurts readability
"SIM105", # suggests using contextlib.suppress instad of try/except -- this wouldn't be mypy friendly
"SIM108", # suggests using ternary operation instead of if -- hurts readability
"SIM110", # suggests using any(...) instead of for look/return -- hurts readability
"SIM117", # suggests using single with statement instead of nested -- doesn't work in tests
"RSE102", # complains about missing parens in exceptions
##
"PLC0415", # "imports should be at the top level" -- not realistic
"ISC001", # implicit string concatenation -- we do use it in tests
]
extend-exclude = [
"src/cachew/legacy.py", # TODO dunno, remove it for good?
]
================================================
FILE: src/cachew/__init__.py
================================================
import fnmatch
import functools
import importlib.metadata
import inspect
import json
import logging
import os
import stat
import warnings
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Literal,
cast,
get_args,
get_origin,
get_type_hints,
overload,
)
try:
    # orjson might not be available on some architectures, so let's make it defensive just in case
    from orjson import dumps as orjson_dumps
    from orjson import loads as orjson_loads
except:
    warnings.warn("orjson couldn't be imported. It's _highly_ recommended for better caching performance", stacklevel=2)

    def orjson_dumps(*args, **kwargs):  # type: ignore[misc]
        """Stdlib fallback mimicking orjson.dumps: serialize to a json bytestring."""
        # stdlib json produces str, but sqlite needs a blob -- encode it
        serialized = json.dumps(*args, **kwargs)
        return serialized.encode('utf8')

    # json.loads accepts bytes as well, so it's a drop-in replacement for orjson.loads
    orjson_loads = json.loads  # ty: ignore[invalid-assignment]
import platformdirs
from .backend.common import AbstractBackend
from .backend.file import FileBackend
from .backend.sqlite import SqliteBackend
from .common import CachewException, SourceHash, TypeNotSupported
from .logging_helper import make_logger
from .marshall.cachew import CachewMarshall, build_schema
from .utils import resolve_type_parameters
# in case of changes in the way cachew stores data, this should be changed to discard old caches
CACHEW_VERSION: str = importlib.metadata.version(__name__)

# anything that can be interpreted as a filesystem path
type PathIsh = Path | str

# name of the storage implementation used for the cache
Backend = Literal['sqlite', 'file']
class settings:
    '''
    Global settings, you can override them after importing cachew
    '''

    '''
    Toggle to disable caching
    '''
    ENABLE: bool = True

    # where cache databases live when the user doesn't pass an explicit cache path
    DEFAULT_CACHEW_DIR: PathIsh = Path(platformdirs.user_cache_dir('cachew'))

    '''
    Set to true if you want to fail early. Otherwise falls back to non-cached version
    '''
    THROW_ON_ERROR: bool = False

    # storage backend to use unless overridden per call site
    DEFAULT_BACKEND: Backend = 'sqlite'
def get_logger() -> logging.Logger:
    """Return the logger used by cachew, named after this package."""
    return make_logger(__name__)
# maps user-facing backend name to its implementation class
BACKENDS: dict[Backend, type[AbstractBackend]] = {
    'file': FileBackend,
    'sqlite': SqliteBackend,
}

# either a literal cache path, or a callable computing it from the wrapped function's arguments
type PathProvider[**P] = PathIsh | Callable[P, PathIsh]
# computes a hash of the wrapped function's arguments, used to decide when the cache is stale
type HashFunction[**P] = Callable[P, SourceHash]
def default_hash(*args, **kwargs) -> SourceHash:
    """
    Default hash function: stringify positional args followed by kwargs sorted by key.

    TODO eh, demand hash? it's not safe either... ugh
    can lead to weird consequences otherwise..
    """
    combined = (*args, *sorted(kwargs.items()))
    return str(combined)  # good enough??
# TODO give it as an example in docs
def mtime_hash(path: Path, *args, **kwargs) -> SourceHash:
    """Hash function folding the file's mtime in, so the cache is discarded when the file changes."""
    mtime = path.stat().st_mtime
    return default_hash(f'{path}.{mtime}', *args, **kwargs)
# deliberately a plain assignment rather than a `type` alias -- used in runtime type checks
Failure = str

# whether the cached function returns a single value or a stream of them
type Kind = Literal['single', 'multiple']
# (kind, type of the cached value(s)) as inferred from the return annotation
type Inferred = tuple[Kind, type[Any]]
def infer_return_type(func) -> Failure | Inferred:
    """
    Infer the type to cache (and whether func returns a single value or an iterable of them)
    from func's return type annotation; returns an error string (Failure) if inference fails.

    >>> def const() -> int:
    ...     return 123
    >>> infer_return_type(const)
    ('single', <class 'int'>)

    >>> from typing import Optional
    >>> def first_character(s: str) -> Optional[str]:
    ...     return None if len(s) == 0 else s[0]
    >>> kind, opt = infer_return_type(first_character)
    >>> # in 3.8, Optional[str] is printed as Union[str, None], so need to hack around this
    >>> (kind, opt == Optional[str])
    ('single', True)

    # tuple is an iterable.. but presumably should be treated as a single value
    >>> from typing import Tuple
    >>> def a_tuple() -> Tuple[int, str]:
    ...     return (123, 'hi')
    >>> infer_return_type(a_tuple)
    ('single', tuple[int, str])

    >>> from typing import Collection, NamedTuple
    >>> class Person(NamedTuple):
    ...     name: str
    ...     age: int
    >>> def person_provider() -> Collection[Person]:
    ...     return []
    >>> infer_return_type(person_provider)
    ('multiple', <class 'cachew.Person'>)

    >>> def single_str() -> str:
    ...     return 'hello'
    >>> infer_return_type(single_str)
    ('single', <class 'str'>)

    >>> def single_person() -> Person:
    ...     return Person(name="what", age=-1)
    >>> infer_return_type(single_person)
    ('single', <class 'cachew.Person'>)

    >>> from typing import Sequence
    >>> def int_provider() -> Sequence[int]:
    ...     return (1, 2, 3)
    >>> infer_return_type(int_provider)
    ('multiple', <class 'int'>)

    >>> from typing import Iterator
    >>> def union_provider() -> Iterator[str | int]:
    ...     yield 1
    ...     yield 'aaa'
    >>> infer_return_type(union_provider)
    ('multiple', str | int)

    >>> from typing import Iterator
    >>> type Str = str
    >>> type Int = int
    >>> type IteratorStrInt = Iterator[Str | Int]
    >>> def iterator_str_int() -> IteratorStrInt:
    ...     yield 1
    ...     yield 'aaa'
    >>> infer_return_type(iterator_str_int)
    ('multiple', str | int)

    # a bit of an edge case
    >>> from typing import Tuple
    >>> def empty_tuple() -> Iterator[Tuple[()]]:
    ...     yield ()
    >>> infer_return_type(empty_tuple)
    ('multiple', tuple[()])

    ... # doctest: +ELLIPSIS
    >>> def untyped():
    ...     return 123
    >>> infer_return_type(untyped)
    'no return type annotation...'

    >>> from typing import List
    >>> class Custom:
    ...     pass
    >>> def unsupported() -> Custom:
    ...     return Custom()
    >>> infer_return_type(unsupported)
    "can't infer type from <class 'cachew.Custom'>: can't cache <class 'cachew.Custom'>"

    >>> def unsupported_list() -> List[Custom]:
    ...     return [Custom()]
    >>> infer_return_type(unsupported_list)
    "can't infer type from list[cachew.Custom]: can't cache <class 'cachew.Custom'>"
    """
    try:
        hints = get_type_hints(func)
    except Exception as ne:
        # get_type_hints might fail if types are forward defined or missing
        # see test_future_annotation for an example
        return str(ne)
    rtype = hints.get('return', None)
    if rtype is None:
        return f"no return type annotation on {func}"
    # expand `type X = ...` aliases / generic parameters into concrete types
    rtype = resolve_type_parameters(rtype)

    def bail(reason: str) -> str:
        return f"can't infer type from {rtype}: " + reason

    # first we wanna check if the top level type is some sort of iterable that makes sense ot cache
    # e.g. List/Sequence/Iterator etc
    return_multiple = _returns_multiple(rtype)

    if return_multiple:
        # then the actual type to cache will be the argument of the top level one
        args = get_args(rtype)
        if args is None:
            return bail("has no __args__")
        if len(args) != 1:
            return bail(f"wrong number of __args__: {args}")
        (cached_type,) = args
    else:
        cached_type = rtype

    try:
        # check the type is actually serializable by cachew before committing to it
        build_schema(Type=cached_type)
    except TypeNotSupported as ex:
        return bail(f"can't cache {ex.type_}")

    return ('multiple' if return_multiple else 'single', cached_type)
def _returns_multiple(rtype) -> bool:
origin = get_origin(rtype)
if origin is None:
return False
if origin is tuple:
# usually tuples are more like single values rather than a sequence? (+ this works for namedtuple)
return False
try:
return issubclass(origin, Iterable)
except TypeError:
# that would happen if origin is not a 'proper' type, e.g. is a Union or something
# seems like exception is the easiest way to check
return False
# https://stackoverflow.com/questions/653368/how-to-create-a-python-decorator-that-can-be-used-either-with-or-without-paramet
def doublewrap(f):
    """
    Make decorator `f` usable both bare (@f) and with arguments (@f(...)).
    """

    @functools.wraps(f)
    def new_dec(*args, **kwargs):
        used_bare = len(args) == 1 and not kwargs and callable(args[0])
        if used_bare:
            # @decorator -- the decorated function is the sole argument
            return f(args[0])
        # @decorator(...) -- remember the arguments, wait for the actual function
        return lambda realf: f(realf, *args, **kwargs)

    return new_dec
def cachew_error(e: Exception, *, logger: logging.Logger) -> None:
    """
    Handle an error during cache setup: propagate it when settings.THROW_ON_ERROR is set,
    otherwise log it so the caller can fall back to the non-cached code path.
    """
    if not settings.THROW_ON_ERROR:
        logger.error("error while setting up cache, falling back to non-cached version")
        logger.exception(e)
        return
    # TODO would be nice to throw from the original code line -- maybe mess with the stack here?
    raise e
# sentinel for 'cache_path not passed' -- distinct from None, which means caching is disabled
use_default_path = cast(Path, object())
# using cachew_impl here just to use different signatures during type checking (see below)
@doublewrap
def cachew_impl[**P](
func=None, # TODO should probably type it after switch to python 3.10/proper paramspec
cache_path: PathProvider[P] | None = use_default_path,
*,
force_file: bool = False,
cls: type | tuple[Kind, type] | None = None,
depends_on: HashFunction[P] = default_hash,
logger: logging.Logger | None = None,
chunk_by: int = 100,
# NOTE: allowed values for chunk_by depend on the system.
# some systems (to be more specific, sqlite builds), it might be too large and cause issues
# ideally this would be more defensive/autodetected, maybe with a warning?
# you can use 'test_many' to experiment
# - too small values (e.g. 10) are slower than 100 (presumably, too many sql statements)
# - too large values (e.g. 10K) are slightly slower as well (not sure why?)
synthetic_key: str | None = None,
backend: Backend | None = None,
**kwargs,
):
r"""
Database-backed cache decorator. TODO more description?
# TODO use this doc in readme?
:param cache_path: if not set, `cachew.settings.DEFAULT_CACHEW_DIR` will be used.
:param force_file: if set to True, assume `cache_path` is a regular file (instead of a directory)
:param cls: if not set, cachew will attempt to infer it from return type annotation. See :func:`infer_return_type` and :func:`cachew.tests.test_cachew.test_return_type_inference`.
:param depends_on: hash function to determine whether the underlying . Can potentially benefit from the use of side effects (e.g. file modification time). TODO link to test?
:param logger: custom logger, if not specified will use logger named `cachew`. See :func:`get_logger`.
:return: iterator over original or cached items
Usage example:
>>> from typing import NamedTuple, Iterator
>>> class Link(NamedTuple):
... url : str
... text: str
...
>>> @cachew
... def extract_links(archive_path: str) -> Iterator[Link]:
... for i in range(5):
... # simulate slow IO
... # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours
... import time; time.sleep(1)
... yield Link(url=f'http://link{i}.org', text=f'text {i}')
...
>>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run
[Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')]
>>> from timeit import Timer
>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1)
... # second run is cached, so should take less time
>>> print(f"call took {int(res)} seconds")
call took 0 seconds
>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1)
... # now file has changed, so the cache will be discarded
>>> print(f"call took {int(res)} seconds")
call took 5 seconds
"""
if logger is None:
module_name = getattr(func, '__module__', None)
if module_name is not None and module_name in logging.Logger.manager.loggerDict:
# if logger for the function's module already exists, reuse it
logger = logging.getLogger(module_name)
else:
# rely on default cachew logger
logger = get_logger()
class AddFuncName(logging.LoggerAdapter):
def process(self, msg, kwargs):
extra = self.extra
assert extra is not None
func_name = extra['func_name']
return f'[{func_name}] {msg}', kwargs
assert func is not None
func_name = callable_name(func)
adapter = AddFuncName(logger, {'func_name': func_name})
logger = cast(logging.Logger, adapter)
hashf = kwargs.get('hashf')
if hashf is not None:
warnings.warn("'hashf' is deprecated. Please use 'depends_on' instead", stacklevel=2)
depends_on = hashf
# todo not very nice that ENABLE check is scattered across two places
if not settings.ENABLE or cache_path is None:
logger.debug('cache explicitly disabled (settings.ENABLE is False or cache_path is None)')
return func
if cache_path is use_default_path:
cache_path = settings.DEFAULT_CACHEW_DIR
logger.debug(f'no cache_path specified, using the default {cache_path}')
use_kind: Kind | None = None
use_cls: type | None = None
if cls is not None:
# defensive here since typing. objects passed as cls might fail on isinstance
try:
is_tuple = isinstance(cls, tuple)
except:
is_tuple = False
if is_tuple:
use_kind, use_cls = cls # type: ignore[misc]
else:
use_kind = 'multiple'
use_cls = cls # type: ignore[assignment]
# TODO fuzz infer_return_type, should never crash?
inference_res = infer_return_type(func)
if isinstance(inference_res, Failure):
msg = f"failed to infer cache type: {inference_res}. See https://github.com/karlicoss/cachew#features for the list of supported types."
if use_cls is None:
ex = CachewException(msg)
cachew_error(ex, logger=logger)
return func
else:
# it's ok, assuming user knows better
logger.debug(msg)
assert use_kind is not None
else:
(inferred_kind, inferred_cls) = inference_res
if use_cls is None:
logger.debug(f'using inferred type {inferred_kind} {inferred_cls}')
(use_kind, use_cls) = (inferred_kind, inferred_cls)
else:
assert use_kind is not None
if (use_kind, use_cls) != inference_res:
logger.warning(
f"inferred type {inference_res} mismatches explicitly specified type {(use_kind, use_cls)}"
)
# TODO not sure if should be more serious error...
if use_kind == 'single':
# pretend it's an iterable, this is just simpler for cachew_wrapper
@functools.wraps(func)
def _func(*args, **kwargs):
return [func(*args, **kwargs)]
else:
_func = func
assert use_cls is not None
ctx = Context(
func =_func,
cache_path =cache_path,
force_file =force_file,
cls_ =use_cls,
depends_on =depends_on,
logger =logger,
chunk_by =chunk_by,
synthetic_key=synthetic_key,
backend =backend,
) # fmt: skip
# hack to avoid extra stack frame (see test_recursive*)
@functools.wraps(func)
def binder(*args, **kwargs):
kwargs['_cachew_context'] = ctx
res = cachew_wrapper(*args, **kwargs)
if use_kind == 'single':
lres = list(res)
assert len(lres) == 1, lres # shouldn't happen
return lres[0]
return res
return binder
if TYPE_CHECKING:
    # we need two versions due to @doublewrap
    # this is when we just annotate as @cachew without any args
    @overload
    def cachew[F: Callable](fun: F) -> F: ...

    # NOTE: we won't really be able to make sure the args of cache_path are the same as args of the wrapped function
    # because when cachew() is called, we don't know anything about the wrapped function yet
    # but at least it works for checking that cachew_path and depdns_on have the same args :shrug:
    @overload
    def cachew[F, **P](
        cache_path: PathProvider[P] | None = ...,
        *,
        force_file: bool = ...,
        cls: type | tuple[Kind, type] | None = ...,
        depends_on: HashFunction[P] = ...,
        logger: logging.Logger | None = ...,
        chunk_by: int = ...,
        synthetic_key: str | None = ...,
        backend: Backend | None = ...,
    ) -> Callable[[F], F]: ...

    def cachew(*args, **kwargs):  # make ty happy
        raise NotImplementedError

else:
    # at runtime the public 'cachew' name is simply the implementation above
    cachew = cachew_impl
def callable_name(func: Callable) -> str:
# some functions don't have __module__
mod = getattr(func, '__module__', None) or ''
return f'{mod}:{getattr(func, "__qualname__")}'
def callable_module_name(func: Callable) -> str | None:
return getattr(func, '__module__', None)
# could cache this, but might be worth not to, so the user can change it on the fly?
def _parse_disabled_modules(logger: logging.Logger | None = None) -> list[str]:
# e.g. CACHEW_DISABLE=my.browser:my.reddit
if 'CACHEW_DISABLE' not in os.environ:
return []
disabled = os.environ['CACHEW_DISABLE']
if disabled.strip() == '':
return []
if ',' in disabled and logger:
logger.warning(
'CACHEW_DISABLE contains a comma, but this expects a $PATH-like, colon-separated list; '
f'try something like CACHEW_DISABLE={disabled.replace(",", ":")}'
)
# remove any empty strings incase did something like CACHEW_DISABLE=my.module:$CACHEW_DISABLE
return [p for p in disabled.split(':') if p.strip() != '']
def _matches_disabled_module(module_name: str, pattern: str) -> bool:
'''
>>> _matches_disabled_module('my.browser', 'my.browser')
True
>>> _matches_disabled_module('my.browser', 'my.*')
True
>>> _matches_disabled_module('my.browser', 'my')
True
>>> _matches_disabled_module('my.browser', 'my.browse*')
True
>>> _matches_disabled_module('my.browser.export', 'my.browser')
True
>>> _matches_disabled_module('mysomething.else', '*') # CACHEW_DISABLE='*' disables everything
True
>>> _matches_disabled_module('my.browser', 'my.br?????') # fnmatch supports unix-like patterns
True
>>> _matches_disabled_module('my.browser', 'my.browse')
False
>>> _matches_disabled_module('mysomething.else', 'my') # since not at '.' boundary, doesn't match
False
>>> _matches_disabled_module('mysomething.else', '')
False
>>> _matches_disabled_module('my.browser', 'my.browser.export')
False
'''
if module_name == pattern:
return True
module_parts = module_name.split('.')
pattern_parts = pattern.split('.')
# e.g. if pattern is 'module.submod.inner_module' and module is just 'module.submod'
# theres no possible way for it to match
if len(module_parts) < len(pattern_parts):
return False
for mp, pp in zip(module_parts, pattern_parts, strict=False):
if fnmatch.fnmatch(mp, pp):
continue
return False
return True
def _module_is_disabled(module_name: str, logger: logging.Logger) -> bool:
    """Whether caching is disabled for module_name via the CACHEW_DISABLE environment variable."""
    for pattern in _parse_disabled_modules(logger):
        if not _matches_disabled_module(module_name, pattern):
            continue
        logger.debug(
            f"caching disabled for {module_name} (matched '{pattern}' from 'CACHEW_DISABLE={os.environ['CACHEW_DISABLE']})'"
        )
        return True
    return False
# fmt: off
# special kwarg / hash-dict keys used by the 'synthetic key' cache-reuse machinery (see cachew_wrapper)
_CACHEW_CACHED = 'cachew_cached' # TODO add to docs
_SYNTHETIC_KEY = 'synthetic_key'
_SYNTHETIC_KEY_VALUE = 'synthetic_key_value'
_DEPENDENCIES = 'dependencies'
# fmt: on
@dataclass
class Context[**P]:
    """
    Bundle of per-decoration parameters, passed from cachew_impl to cachew_wrapper
    via the '_cachew_context' kwarg.
    """

    # fmt: off
    func         : Callable
    cache_path   : PathProvider[P]
    force_file   : bool
    cls_         : type
    depends_on   : HashFunction[P]
    logger       : logging.Logger
    chunk_by     : int
    synthetic_key: str | None
    backend      : Backend | None

    def composite_hash(self, *args, **kwargs) -> dict[str, Any]:
        """
        Compute the dict which (json-serialized) acts as the cache key:
        cachew version + schema of the cached type + user-supplied dependency hash
        (+ synthetic key name/value if configured).
        """
        fsig = inspect.signature(self.func)
        # defaults wouldn't be passed in kwargs, but they can be an implicit dependency (especially inbetween program runs)
        defaults = {
            k: v.default
            for k, v in fsig.parameters.items()
            if v.default is not inspect.Parameter.empty
        }
        # but only pass default if the user wants it in the hash function?
        hsig = inspect.signature(self.depends_on)
        defaults = {
            k: v
            for k, v in defaults.items()
            if k in hsig.parameters or 'kwargs' in hsig.parameters
        }
        kwargs = {**defaults, **kwargs}
        schema = str(self.cls_)
        hash_parts = {
            'cachew'      : CACHEW_VERSION,
            'schema'      : schema,
            _DEPENDENCIES : str(self.depends_on(*args, **kwargs)),
        }
        synthetic_key = self.synthetic_key
        if synthetic_key is not None:
            hash_parts[_SYNTHETIC_KEY      ] = synthetic_key
            hash_parts[_SYNTHETIC_KEY_VALUE] = kwargs[synthetic_key]
            # FIXME assert it's in kwargs in the first place?
            # FIXME support positional args too? maybe extract the name from signature somehow? dunno
            # need to test it
        return hash_parts
    # fmt: on
def cachew_wrapper[**P](
    *args,
    _cachew_context: Context[P],
    **kwargs,
):
    """
    The actual caching workhorse (a generator).

    Yields items from the cache when the stored hash matches the freshly computed one;
    otherwise runs the wrapped function, yielding its items while writing them to the backend.
    On any cachew-internal error it falls back to calling the wrapped function directly.
    """
    C = _cachew_context
    # fmt: off
    func          = C.func
    cache_path    = C.cache_path
    force_file    = C.force_file
    cls           = C.cls_
    logger        = C.logger
    chunk_by      = C.chunk_by
    synthetic_key = C.synthetic_key
    backend_name  = C.backend
    # fmt: on

    used_backend = backend_name or settings.DEFAULT_BACKEND

    func_name = callable_name(func)

    if not settings.ENABLE:
        logger.debug('cache explicitly disabled (settings.ENABLE is False)')
        yield from func(*args, **kwargs)
        return

    mod_name = callable_module_name(func)
    if mod_name is not None and _module_is_disabled(mod_name, logger):
        yield from func(*args, **kwargs)
        return

    def get_db_path() -> Path | None:
        # resolve cache_path (possibly a callable) to a concrete file path,
        # or None if caching is disabled for this particular call
        db_path: Path
        if callable(cache_path):
            pp = cache_path(*args, **kwargs)
            if pp is None:
                logger.debug('cache explicitly disabled (cache_path is None)')
                # early return, in this case we just yield the original items from the function
                return None
            else:
                db_path = Path(pp)
        else:
            db_path = Path(cache_path)

        db_path.parent.mkdir(parents=True, exist_ok=True)

        # need to be atomic here, hence calling stat() once and then just using the results
        try:
            # note: stat follows symlinks (which is what we want)
            st = db_path.stat()
        except FileNotFoundError:
            # doesn't exist. then it's controlled by force_file
            if force_file:
                # just use db_path as is
                pass
            else:
                db_path.mkdir(parents=True, exist_ok=True)
                db_path = db_path / func_name
        else:
            # already exists, so just use callable name if it's a dir
            if stat.S_ISDIR(st.st_mode):
                db_path = db_path / func_name

        logger.debug(f'using {used_backend}:{db_path} for cache')
        return db_path

    def try_use_synthetic_key() -> None:
        # if the old and new hashes differ only in the synthetic key values (and dependencies),
        # reuse the already cached items as a 'prefix' and ask func only for the missing keys
        if synthetic_key is None:
            return
        # attempt to use existing cache if possible, as a 'prefix'
        old_hash_d: dict[str, Any] = {}
        if old_hash is not None:
            try:
                old_hash_d = json.loads(old_hash)
            except json.JSONDecodeError:
                # possible if we used old cachew version (<=0.8.1), hash wasn't json
                pass
        hash_diffs = {
            k: new_hash_d.get(k) == old_hash_d.get(k)
            for k in (*new_hash_d.keys(), *old_hash_d.keys())
            # the only 'allowed' differences for hash, otherwise need to recompute (e.g. if schema changed)
            if k not in {_SYNTHETIC_KEY_VALUE, _DEPENDENCIES}
        }
        cache_compatible = all(hash_diffs.values())
        if not cache_compatible:
            return

        def missing_keys(cached: list[str], wanted: list[str]) -> list[str] | None:
            # FIXME assert both cached and wanted are sorted? since we rely on it
            # if not, then the user could use some custom key for caching (e.g. normalise filenames etc)
            # although in this case passing it into the function wouldn't make sense?
            if len(cached) == 0:
                # no point trying to reuse anything, cache should be empty?
                return None
            if len(wanted) == 0:
                # similar, no way to reuse cache
                return None
            if cached[0] != wanted[0]:
                # there is no common prefix, so no way to reuse cache really
                return None
            last_cached = cached[-1]
            # ok, now actually figure out which items are missing
            for i, k in enumerate(wanted):
                if k > last_cached:
                    # ok, rest of items are missing
                    return wanted[i:]
            # otherwise too many things are cached, and we seem to wante less
            return None

        new_values: list[str] = new_hash_d[_SYNTHETIC_KEY_VALUE]
        old_values: list[str] = old_hash_d[_SYNTHETIC_KEY_VALUE]
        missing = missing_keys(cached=old_values, wanted=new_values)
        if missing is not None:
            # can reuse cache
            kwargs[_CACHEW_CACHED] = cached_items()
            kwargs[synthetic_key] = missing

    early_exit = False

    def written_to_cache():
        # run the wrapped function, yielding its items while also writing them (in chunks) to the backend
        nonlocal early_exit

        datas = func(*args, **kwargs)

        if isinstance(backend, FileBackend):
            # FIXME uhhh.. this is a bit crap
            # but in sqlite mode we don't want to publish new hash before we write new items
            # maybe should use tmp table for hashes as well?
            backend.write_new_hash(new_hash)
        else:
            # happens later for sqlite
            pass

        flush_blobs = backend.flush_blobs

        chunk: list[Any] = []

        def flush() -> None:
            nonlocal chunk
            if len(chunk) > 0:
                flush_blobs(chunk=chunk)
                chunk = []

        total_objects = 0
        for obj in datas:
            try:
                total_objects += 1
                yield obj
            except GeneratorExit:
                # consumer stopped iterating early -- don't finalize the partially written cache
                early_exit = True
                return

            dct = marshall.dump(obj)
            blob = orjson_dumps(dct)
            chunk.append(blob)
            if len(chunk) >= chunk_by:
                flush()
        flush()

        backend.finalize(new_hash)
        logger.info(f'wrote {total_objects} objects to cachew ({used_backend}:{db_path})')

    def cached_items():
        # stream deserialized objects back from the backend
        total_cached = backend.cached_blobs_total()
        total_cached_s = '' if total_cached is None else f'{total_cached} '
        logger.info(f'loading {total_cached_s}objects from cachew ({used_backend}:{db_path})')

        for blob in backend.cached_blobs():
            j = orjson_loads(blob)
            obj = marshall.load(j)
            yield obj

    # NOTE: annoyingly huge try/catch ahead...
    # but it lets us save a function call, hence a stack frame
    # see test_recursive*
    try:
        db_path = get_db_path()
        if db_path is None:
            yield from func(*args, **kwargs)
            return

        BackendCls = BACKENDS[used_backend]

        new_hash_d = C.composite_hash(*args, **kwargs)
        new_hash: SourceHash = json.dumps(new_hash_d)
        logger.debug(f'new hash: {new_hash}')

        marshall: CachewMarshall[Any] = CachewMarshall(Type_=cls)

        with BackendCls(cache_path=db_path, logger=logger) as backend:
            old_hash = backend.get_old_hash()
            logger.debug(f'old hash: {old_hash}')

            if new_hash == old_hash:
                logger.debug('hash matched: loading from cache')
                yield from cached_items()
                return

            logger.debug('hash mismatch: computing data and writing to db')

            try_use_synthetic_key()

            got_write = backend.get_exclusive_write()
            if not got_write:
                # NOTE: this is the bit we really have to watch out for and not put in a helper function
                # otherwise it's causing an extra stack frame on every call
                # the rest (reading from cachew or writing to cachew) happens once per function call? so not a huge deal
                yield from func(*args, **kwargs)
                return

            # at this point we're guaranteed to have an exclusive write transaction
            yield from written_to_cache()
    except Exception as e:
        # sigh... see test_early_exit_shutdown...
        if early_exit and 'Cannot operate on a closed database' in str(e):
            return

        # todo hmm, kinda annoying that it tries calling the function twice?
        # but gonna require some sophisticated cooperation with the cached wrapper otherwise
        cachew_error(e, logger=logger)
        yield from func(*args, **kwargs)
__all__ = [
'CachewException',
'HashFunction',
'SourceHash',
'cachew',
'get_logger',
]
================================================
FILE: src/cachew/backend/common.py
================================================
import logging
from abc import abstractmethod
from collections.abc import Iterator, Sequence
from pathlib import Path
from ..common import SourceHash
class AbstractBackend:
    """
    Interface for cache storage backends.

    Lifecycle as driven by cachew_wrapper:
    __enter__ -> get_old_hash -> (on hash match) cached_blobs,
    or (on mismatch) get_exclusive_write -> [write_new_hash] -> flush_blobs* -> finalize -> __exit__.
    """

    # NOTE(review): this class doesn't inherit abc.ABC, so @abstractmethod isn't actually
    # enforced at instantiation time -- confirm whether that's deliberate
    @abstractmethod
    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
        raise NotImplementedError

    @abstractmethod
    def __enter__(self):
        raise NotImplementedError

    def __exit__(self, *args) -> None:
        raise NotImplementedError

    def get_old_hash(self) -> SourceHash | None:
        # hash stored during the previous run, or None if there is no usable cache yet
        raise NotImplementedError

    def cached_blobs_total(self) -> int | None:
        # number of cached blobs, or None if the backend can't tell cheaply
        raise NotImplementedError

    def cached_blobs(self) -> Iterator[bytes]:
        # serialized cached objects, one blob per object
        raise NotImplementedError

    def get_exclusive_write(self) -> bool:
        '''
        Returns whether it actually managed to get it
        '''
        raise NotImplementedError

    def write_new_hash(self, new_hash: SourceHash) -> None:
        # publish the new hash before data is written (used by FileBackend, see cachew_wrapper)
        raise NotImplementedError

    def flush_blobs(self, chunk: Sequence[bytes]) -> None:
        # append a chunk of serialized objects to the (temporary) cache storage
        raise NotImplementedError

    def finalize(self, new_hash: SourceHash) -> None:
        # promote the temporary storage to be the actual cache
        raise NotImplementedError
================================================
FILE: src/cachew/backend/file.py
================================================
import logging
from collections.abc import Iterator, Sequence
from pathlib import Path
from typing import (
BinaryIO,
)
from ..common import SourceHash
from .common import AbstractBackend
class FileBackend(AbstractBackend):
    """
    Cache backend storing data as a plain jsonl file: the first line is the hash,
    each following line is one serialized cached object.
    New data is written to '<cache>.tmp' which replaces the cache file on finalize().
    """

    jsonl: Path  # the actual cache file
    jsonl_tmp: Path  # temporary file new data is written to until finalize()
    jsonl_fr: BinaryIO | None  # read handle for the existing cache (None if it doesn't exist)
    jsonl_tmp_fw: BinaryIO | None  # write handle for the tmp file (None until get_exclusive_write succeeds)

    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
        self.logger = logger
        self.jsonl = cache_path
        self.jsonl_tmp = Path(str(self.jsonl) + '.tmp')
        self.jsonl_fr = None
        self.jsonl_tmp_fw = None

    def __enter__(self) -> 'FileBackend':
        try:
            self.jsonl_fr = self.jsonl.open('rb')
        except FileNotFoundError:
            # no cache file yet -- get_old_hash will return None and everything gets recomputed
            self.jsonl_fr = None
        return self

    def __exit__(self, *args) -> None:
        if self.jsonl_tmp_fw is not None:
            # might still exist in case of early exit
            self.jsonl_tmp.unlink(missing_ok=True)
            # NOTE: need to unlink first
            # otherwise possible that someone else might open the file before we unlink it
            self.jsonl_tmp_fw.close()
        if self.jsonl_fr is not None:
            self.jsonl_fr.close()

    def get_old_hash(self) -> SourceHash | None:
        # the hash is always the first line of the jsonl file
        if self.jsonl_fr is None:
            return None
        hash_line = self.jsonl_fr.readline().rstrip(b'\n')
        return hash_line.decode('utf8')

    def cached_blobs_total(self) -> int | None:
        # not really sure how to support that for a plaintext file?
        # could wc -l but it might be costly..
        return None

    def cached_blobs(self) -> Iterator[bytes]:
        assert self.jsonl_fr is not None  # should be guaranteed by get_old_hash
        # at this point the file position is just past the hash line (consumed by get_old_hash)
        yield from self.jsonl_fr  # yields line by line

    def get_exclusive_write(self) -> bool:
        # NOTE: opening in x (exclusive write) mode just in case, so it throws if file exists
        try:
            self.jsonl_tmp_fw = self.jsonl_tmp.open('xb')
        except FileExistsError:
            # another writer is active -- caller falls back to calling the function directly
            self.jsonl_tmp_fw = None
            return False
        else:
            return True

    def write_new_hash(self, new_hash: SourceHash) -> None:
        assert self.jsonl_tmp_fw is not None
        self.jsonl_tmp_fw.write(new_hash.encode('utf8') + b'\n')

    def flush_blobs(self, chunk: Sequence[bytes]) -> None:
        fw = self.jsonl_tmp_fw
        assert fw is not None
        for blob in chunk:
            fw.write(blob)
            fw.write(b'\n')

    def finalize(self, new_hash: SourceHash) -> None:  # noqa: ARG002
        # TODO defensive??
        # rename publishes the fully written tmp file as the new cache
        self.jsonl_tmp.rename(self.jsonl)
================================================
FILE: src/cachew/backend/sqlite.py
================================================
import logging
import sqlite3
import time
import warnings
from collections.abc import Iterator, Sequence
from pathlib import Path
import sqlalchemy
import sqlalchemy.exc
from sqlalchemy import Column, Table, event, text
from sqlalchemy.dialects import sqlite
from ..common import SourceHash
from .common import AbstractBackend
class SqliteBackend(AbstractBackend):
    """
    Cache backend storing blobs in a sqlite database (accessed via sqlalchemy).

    Layout: a single-row 'hash' table plus a 'cache' table with one BLOB column;
    new data goes into 'cache_tmp' which is renamed over 'cache' in finalize().
    """

    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
        self.logger = logger
        self.engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}', connect_args={'timeout': 0})
        # NOTE: timeout is necessary so we don't lose time waiting during recursive calls
        # by default, it's several seconds? you'd see 'test_recursive' test performance degrade

        @event.listens_for(self.engine, 'connect')
        def set_sqlite_pragma(dbapi_connection, connection_record):  # noqa: ARG001
            # without wal, concurrent reading/writing is not gonna work
            # ugh. that's odd, how are we supposed to set WAL if the very fact of setting wal might lock the db?
            while True:
                try:
                    dbapi_connection.execute('PRAGMA journal_mode=WAL')
                    break
                except sqlite3.OperationalError as oe:
                    if 'database is locked' not in str(oe):
                        # ugh, pretty annoying that exception doesn't include database path for some reason
                        raise RuntimeError(f'Error while setting WAL on {cache_path}') from oe
                    # locked by a concurrent connection -- retry until it lets go
                    time.sleep(0.1)

        self.connection = self.engine.connect()

        """
        Erm... this is pretty confusing.
        https://docs.sqlalchemy.org/en/13/dialects/sqlite.html#transaction-isolation-level
        Somehow without this thing sqlalchemy logs BEGIN (implicit) instead of BEGIN TRANSACTION which actually works in sqlite...
        Judging by sqlalchemy/dialects/sqlite/base.py, looks like some sort of python sqlite driver problem??
        test_transaction should check this behaviour
        """

        @event.listens_for(self.connection, 'begin')
        def do_begin(conn):
            # NOTE there is also BEGIN CONCURRENT in newer versions of sqlite. could use it later?
            conn.execute(text('BEGIN DEFERRED'))

        self.meta = sqlalchemy.MetaData()
        self.table_hash = Table('hash', self.meta, Column('value', sqlalchemy.String))

        # fmt: off
        # actual cache
        self.table_cache = Table('cache' , self.meta, Column('data', sqlalchemy.BLOB))
        # temporary table, we use it to insert and then (atomically?) rename to the above table at the very end
        self.table_cache_tmp = Table('cache_tmp', self.meta, Column('data', sqlalchemy.BLOB))
        # fmt: on

    def __enter__(self) -> 'SqliteBackend':
        # NOTE: deferred transaction
        self.transaction = self.connection.begin()
        # FIXME this is a bit crap.. is there a nicer way to use another ctx manager here?
        self.transaction.__enter__()
        return self

    def __exit__(self, *args) -> None:
        self.transaction.__exit__(*args)
        self.connection.close()
        self.engine.dispose()

    def get_old_hash(self) -> SourceHash | None:
        # first, try to do as much as possible read-only, benefiting from deferred transaction
        old_hashes: Sequence
        try:
            # not sure if there is a better way...
            cursor = self.connection.execute(self.table_hash.select())
        except sqlalchemy.exc.OperationalError as e:
            # meh. not sure if this is a good way to handle this..
            if 'no such table: hash' in str(e):
                # fresh database -- no cache yet
                old_hashes = []
            else:
                raise e
        else:
            old_hashes = cursor.fetchall()

        assert len(old_hashes) <= 1, old_hashes  # shouldn't happen

        old_hash: SourceHash | None
        if len(old_hashes) == 0:
            old_hash = None
        else:
            old_hash = old_hashes[0][0]  # returns a tuple...
        return old_hash

    def cached_blobs_total(self) -> int | None:
        # cheap COUNT(*) over the cache table, used only for the progress log message
        [(total,)] = self.connection.execute(sqlalchemy.select(sqlalchemy.func.count()).select_from(self.table_cache))
        return total

    def cached_blobs(self) -> Iterator[bytes]:
        rows = self.connection.execute(self.table_cache.select())
        # by default, sqlalchemy wraps all results into Row object
        # this can cause quite a lot of overhead if you're reading many rows
        # it seems that in principle, sqlalchemy supports just returning bare underlying tuple from the dbapi
        # but from browsing the code it doesn't seem like this functionality exposed
        # if you're looking for cues, see
        # - ._source_supports_scalars
        # - ._generate_rows
        # - ._row_getter
        # by using this raw iterator we speed up reading the cache quite a bit
        # asked here https://github.com/sqlalchemy/sqlalchemy/discussions/10350
        raw_row_iterator = getattr(rows, '_raw_row_iterator', None)
        if raw_row_iterator is None:
            warnings.warn(
                "CursorResult._raw_row_iterator method isn't found. This could lead to degraded cache reading performance.",
                stacklevel=2,
            )
            row_iterator = rows
        else:
            row_iterator = raw_row_iterator()

        for (blob,) in row_iterator:
            yield blob

    def get_exclusive_write(self) -> bool:
        # NOTE on recursive calls
        # somewhat magically, they should work as expected with no extra database inserts?
        # the top level call 'wins' the write transaction and once it's gathered all data, will write it
        # the 'intermediate' level calls fail to get it and will pass data through
        # the cached 'bottom' level is read only and will be yielded without a write transaction
        try:
            # first 'write' statement will upgrade transaction to write transaction which might fail due to concurrency
            # see https://www.sqlite.org/lang_transaction.html
            # NOTE: because of 'checkfirst=True', only the last .create will guarantee the transaction upgrade to write transaction
            self.table_hash.create(self.connection, checkfirst=True)

            # 'table' used to be old 'cache' table name, so we just delete it regardless
            # otherwise it might overinfalte the cache db with stale values
            self.connection.execute(text('DROP TABLE IF EXISTS `table`'))

            # NOTE: we have to use .drop and then .create (e.g. instead of some sort of replace)
            # since it's possible to have schema changes inbetween calls
            # checkfirst=True because it might be the first time we're using cache
            self.table_cache_tmp.drop(self.connection, checkfirst=True)
            self.table_cache_tmp.create(self.connection)
        except sqlalchemy.exc.OperationalError as e:
            if e.code == 'e3q8' and 'database is locked' in str(e):
                # someone else must be have won the write lock
                # not much we can do here
                # NOTE: important to close early, otherwise we might hold onto too many file descriptors during yielding
                # see test_recursive_deep
                # (normally connection is closed in SqliteBackend.__exit__)
                self.connection.close()
                # in this case all the callee can do is just to call the actual function
                return False
            else:
                raise e
        return True

    def flush_blobs(self, chunk: Sequence[bytes]) -> None:
        # uhh. this gives a huge speedup for inserting
        # since we don't have to create intermediate dictionaries
        # TODO move this to __init__?
        insert_into_table_cache_tmp_raw = str(
            self.table_cache_tmp.insert().compile(dialect=sqlite.dialect(paramstyle='qmark'))
        )
        # I also tried setting paramstyle='qmark' in create_engine, but it seems to be ignored :(
        # idk what benefit sqlalchemy gives at this point, seems to just complicate things
        self.connection.exec_driver_sql(insert_into_table_cache_tmp_raw, [(c,) for c in chunk])

    def finalize(self, new_hash: SourceHash) -> None:
        # delete hash first, so if we are interrupted somewhere, it mismatches next time and everything is recomputed
        self.connection.execute(self.table_hash.delete())

        # checkfirst is necessary since it might not have existed in the first place
        # e.g. first time we use cache
        self.table_cache.drop(self.connection, checkfirst=True)

        # meh https://docs.sqlalchemy.org/en/14/faq/metadata_schema.html#does-sqlalchemy-support-alter-table-create-view-create-trigger-schema-upgrade-functionality
        # also seems like sqlalchemy doesn't have any primitives to escape table names.. sigh
        self.connection.execute(text(f"ALTER TABLE `{self.table_cache_tmp.name}` RENAME TO `{self.table_cache.name}`"))
        self.connection.execute(self.table_hash.insert().values([{'value': new_hash}]))
================================================
FILE: src/cachew/common.py
================================================
from dataclasses import dataclass
# TODO better name to represent what it means?
# Opaque string fingerprint of the cached function's inputs; presumably a
# mismatch against the stored value invalidates the cache -- see backend usage.
type SourceHash = str
class CachewException(RuntimeError):
    """Base class for all errors raised by cachew."""
@dataclass
class TypeNotSupported(CachewException):
    """Raised when cachew can't build a serialization schema for a type."""

    type_: type  # the offending type
    reason: str  # human-readable explanation

    def __str__(self) -> str:
        features_url = 'https://github.com/karlicoss/cachew#features'
        return f"{self.type_} isn't supported by cachew: {self.reason}. See {features_url} for the list of supported types."
================================================
FILE: src/cachew/compat.py
================================================
import sys
if sys.version_info[:2] >= (3, 13):
    # warnings.deprecated is in the stdlib from python 3.13 (PEP 702)
    from warnings import deprecated
else:
    # fall back to the typing_extensions backport on older pythons
    from typing_extensions import deprecated

__all__ = ["deprecated"]
================================================
FILE: src/cachew/experimental.py
================================================
from typing import TYPE_CHECKING

if not TYPE_CHECKING:
    # NOTE(review): runtime-only import -- presumably so type checkers don't see
    # the decorator and flag call sites of these deprecated no-ops; confirm intent
    from .compat import deprecated


@deprecated("Exceptions are not an experimental feature anymore and enabled by default.")
def enable_exceptions() -> None:
    # kept for backwards compatibility; exception support is always on now
    pass


@deprecated("Exceptions are not an experimental feature anymore and enabled by default.")
def disable_exceptions() -> None:
    # kept for backwards compatibility; exception support is always on now
    pass
================================================
FILE: src/cachew/extra.py
================================================
# todo Ideally, needs doublewraps as well? also typing helpers
def mcachew(*args, **kwargs):
    """
    Stands for 'Maybe cachew'.

    Defensive wrapper around @cachew that makes it an optional dependency:
    if the library isn't installed, decorated functions simply run uncached.
    """
    try:
        import cachew
    except ModuleNotFoundError:
        import warnings

        warnings.warn(
            'cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew',
            stacklevel=2,
        )

        # no-op decorator: hand the function back unchanged
        def passthrough(orig_func):
            return orig_func

        return passthrough
    return cachew.cachew(*args, **kwargs)
from contextlib import contextmanager
@contextmanager
def disabled_cachew():
    """Temporarily turn cachew off (settings.ENABLE) for the duration of the block."""
    from . import settings

    saved = settings.ENABLE
    settings.ENABLE = False
    try:
        yield
    finally:
        # always restore the previous value, even if the body raised
        settings.ENABLE = saved
================================================
FILE: src/cachew/legacy.py
================================================
import typing
import warnings
from collections.abc import Iterable, Iterator, Sequence
from dataclasses import dataclass
from datetime import date, datetime
from itertools import chain, islice
from pathlib import Path
from typing import (
Any,
Generic,
NamedTuple,
Optional,
TypeVar,
Union,
)
import sqlalchemy
from sqlalchemy import Column
from .pytest import parametrize
from .common import CachewException
def get_union_args(cls) -> Optional[tuple[type, ...]]:
    """
    Return the non-None arguments of a typing.Union, or None if cls isn't one.

    NOTE: only recognises old-style typing.Union -- presumably PEP 604 `X | Y`
    has a different __origin__; confirm if that matters for legacy callers.
    """
    if getattr(cls, '__origin__', None) != Union:
        return None

    args = cls.__args__
    # NoneType is stripped, so Optional[X] yields just (X,)
    args = tuple(e for e in args if e is not type(None))
    assert len(args) > 0
    return args  # ty: ignore[invalid-return-type]
def is_union(cls) -> bool:
    """Whether cls is a typing.Union (as recognised by get_union_args)."""
    return get_union_args(cls) is not None
# Types/Values mirror each other: the type side and the value side of what
# the legacy binder can store in a sqlite column (checked by test_mypy_annotations)
Types = Union[
    type[str],
    type[int],
    type[float],
    type[bool],
    type[datetime],
    type[date],
    type[dict],
    type[list],
    type[Exception],
    type[NamedTuple],
]

Values = Union[
    str,
    int,
    float,
    bool,
    datetime,
    date,
    dict,
    list,
    Exception,
    NamedTuple,
]

# NamedTuple is deliberately absent: it's handled structurally (flattened), not as a primitive
PRIMITIVE_TYPES = {
    str,
    int,
    float,
    bool,
    datetime,
    date,
    dict,
    list,
    Exception,
}
def is_primitive(cls: type) -> bool:
    """
    Whether cls maps directly onto a single sqlite column (see PRIMITIVES).

    >>> from typing import Dict, Any
    >>> is_primitive(int)
    True
    >>> is_primitive(set)
    False
    >>> is_primitive(dict)
    True
    """
    return cls in PRIMITIVE_TYPES
class IsoDateTime(sqlalchemy.TypeDecorator):
    '''
    Stores datetimes as ISO strings, with the pytz zone name appended
    (space-separated) when the tzinfo is a pytz timezone.
    '''

    # in theory could use something more effecient? e.g. blob for encoded datetime and tz?
    # but practically, the difference seems to be pretty small, so perhaps fine for now
    impl = sqlalchemy.String

    cache_ok = True

    @property
    def python_type(self):
        return datetime

    def process_literal_param(self, value, dialect):
        raise NotImplementedError()  # make pylint happy

    def process_bind_param(self, value: Optional[datetime], dialect) -> Optional[str]:  # noqa: ARG002
        if value is None:
            return None
        # ok, it's a bit hacky... attempt to preserve pytz infromation
        iso = value.isoformat()
        tz = getattr(value, 'tzinfo', None)
        if tz is None:
            return iso
        try:
            import pytz
        except ImportError:
            self.warn_pytz()
            return iso
        else:
            if isinstance(tz, pytz.BaseTzInfo):
                zone = tz.zone
                # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
                assert zone is not None, tz
                return iso + ' ' + zone
            else:
                # non-pytz tzinfo: the offset is already embedded in the iso string
                return iso

    def process_result_value(self, value: Optional[str], dialect) -> Optional[datetime]:  # noqa: ARG002
        if value is None:
            return None
        spl = value.split(' ')
        dt = datetime.fromisoformat(spl[0])
        if len(spl) <= 1:
            # no zone suffix was stored
            return dt
        zone = spl[1]
        # else attempt to decypher pytz tzinfo
        try:
            import pytz
        except ImportError:
            self.warn_pytz()
            return dt
        else:
            tz = pytz.timezone(zone)
            return dt.astimezone(tz)

    def warn_pytz(self) -> None:
        warnings.warn('install pytz for better timezone support while serializing with cachew', stacklevel=2)
# a bit hacky, but works... reuses the datetime machinery and truncates on the way out
class IsoDate(IsoDateTime):
    impl = sqlalchemy.String

    cache_ok = True

    @property
    def python_type(self):
        return date

    def process_literal_param(self, value, dialect):
        raise NotImplementedError()  # make pylint happy

    def process_result_value(self, value: Optional[str], dialect) -> Optional[date]:  # type: ignore[override]
        res = super().process_result_value(value, dialect)
        if res is None:
            return None
        return res.date()
# JSON-representable primitive types: exception args of these types survive round-trip as-is
jtypes = (int, float, bool, type(None))
class ExceptionAdapter(sqlalchemy.TypeDecorator):
    '''
    Enables support for caching Exceptions. Exception is treated as JSON and serialized.

    It's useful for defensive error handling, in case of cachew in particular for preserving error state.

    I elaborate on it here: [mypy-driven error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss).
    '''

    impl = sqlalchemy.JSON

    cache_ok = True

    @property
    def python_type(self):
        return Exception

    def process_literal_param(self, value, dialect):
        raise NotImplementedError()  # make pylint happy

    def process_bind_param(self, value: Optional[Exception], dialect) -> Optional[list[Any]]:  # noqa: ARG002
        if value is None:
            return None
        # serialize exception args to a JSON-safe list (lossy for rich types)
        sargs: list[Any] = []
        for a in value.args:
            if any(isinstance(a, t) for t in jtypes):
                sargs.append(a)
            elif isinstance(a, date):
                sargs.append(a.isoformat())
            else:
                sargs.append(str(a))
        return sargs

    def process_result_value(self, value: Optional[list[Any]], dialect) -> Optional[Exception]:  # noqa: ARG002
        # value is the JSON-decoded list produced by process_bind_param above
        if value is None:
            return None
        # sadly, can't do much to convert back from the strings? Unless I serialize the type info as well?
        return Exception(*value)
# maps each supported primitive python type to its sqlalchemy column type
# fmt: off
PRIMITIVES = {
    str      : sqlalchemy.String,
    int      : sqlalchemy.Integer,
    float    : sqlalchemy.Float,
    bool     : sqlalchemy.Boolean,
    datetime : IsoDateTime,
    date     : IsoDate,
    dict     : sqlalchemy.JSON,
    list     : sqlalchemy.JSON,
    Exception: ExceptionAdapter,
}
# fmt: on

# keep the column mapping and the is_primitive() check in sync
assert set(PRIMITIVES.keys()) == PRIMITIVE_TYPES
def strip_optional(cls) -> tuple[type, bool]:
    """
    Split off Optional-ness: returns (underlying type, was_optional).

    >>> from typing import Optional, NamedTuple
    >>> strip_optional(Optional[int]) == (int, True)
    True
    >>> class X(NamedTuple):
    ...     x: int
    >>> strip_optional(X) == (X, False)
    True
    """
    is_opt: bool = False

    args = get_union_args(cls)
    if args is not None and len(args) == 1:
        # Optional[X] is Union[X, None]; get_union_args already dropped NoneType
        cls = args[0]  # meh
        is_opt = True

    return (cls, is_opt)
def strip_generic(tp):
    """
    Erase a generic alias down to its runtime origin; non-generics pass through.

    >>> from typing import List
    >>> strip_generic(List[int]) is list
    True
    >>> strip_generic(str) is str
    True
    """
    # typing._GenericAlias is private; fetched via getattr so linters don't complain
    alias_type = getattr(typing, '_GenericAlias')
    return tp.__origin__ if isinstance(tp, alias_type) else tp
# type var for the (named)tuple type a binder handles
NT = TypeVar('NT')
# sadly, bound=NamedTuple is not working yet in mypy
# https://github.com/python/mypy/issues/685
# also needs to support dataclasses?
@dataclass
class NTBinder(Generic[NT]):
    """
    Helper class for interacting with the sqlite database: maps (possibly nested)
    NamedTuples onto flat rows and back.

    >>> class Job(NamedTuple):
    ...    company: str
    ...    title: Optional[str]
    >>> class Person(NamedTuple):
    ...     name: str
    ...     age: int
    ...     job: Optional[Job]

    The hierarchy is flattened into one row; a nested Optional gets an extra
    '_is_null' marker column:

    >>> binder = NTBinder.make(Person)
    >>> [c.name for c in binder.columns]
    ['name', 'age', '_job_is_null', 'job_company', 'job_title']

    >>> person = Person(name='alan', age=40, job=None)

    to_row converts object to a sql-friendly tuple. job=None, so we end up with True in _job_is_null field
    >>> tuple(binder.to_row(person))
    ('alan', 40, True, None, None)

    from_row does reverse conversion
    >>> binder.from_row(('alan', 40, True, None, None))
    Person(name='alan', age=40, job=None)

    >>> binder.from_row(('ann', 25, True, None, None, 'extra'))
    Traceback (most recent call last):
    ...
    cachew.common.CachewException: unconsumed items in iterator ['extra']
    """

    name: Optional[str]  # None means toplevel
    type_: Types
    span: int  # number of columns this node occupies; not sure if span should include optional col?
    primitive: bool
    optional: bool
    union: Optional[type]  # helper, which isn't None if type is Union
    fields: Sequence[Any]  # child binders; mypy can't handle cyclic definition at this point :(

    @staticmethod
    def make(tp: type[NT], name: Optional[str] = None) -> 'NTBinder[NT]':
        # recursively build the binder tree for tp
        tp, optional = strip_optional(tp)  # ty: ignore[invalid-assignment]
        union: Optional[type]
        fields: tuple[Any, ...]
        primitive: bool

        union_args = get_union_args(tp)
        if union_args is not None:
            # a Union is represented as a synthetic NamedTuple with one Optional
            # field per variant; exactly one of them is populated at a time
            CachewUnion = NamedTuple('_CachewUnionRepr', [(x.__name__, Optional[x]) for x in union_args])  # type: ignore[misc]
            union = CachewUnion
            primitive = False
            fields = (NTBinder.make(tp=CachewUnion, name='_cachew_union_repr'),)
            span = 1
        else:
            union = None
            tp = strip_generic(tp)
            primitive = is_primitive(tp)

            if primitive:
                if name is None:
                    name = '_cachew_primitive'  # meh. presumably, top level
            if primitive:
                fields = ()
                span = 1
            else:
                annotations = typing.get_type_hints(tp)
                if annotations == {}:
                    raise CachewException(
                        f"{tp} (field '{name}'): doesn't look like a supported type to cache. See https://github.com/karlicoss/cachew#features for the list of supported types."
                    )
                fields = tuple(NTBinder.make(tp=ann, name=fname) for fname, ann in annotations.items())
                # optional nodes get one extra '_is_null' column on top of the children
                span = sum(f.span for f in fields) + (1 if optional else 0)
        return NTBinder(
            name=name,
            type_=tp,  # type: ignore[arg-type]
            span=span,
            primitive=primitive,
            optional=optional,
            union=union,
            fields=fields,
        )

    @property
    def columns(self) -> list[Column]:
        return list(self.iter_columns())

    # TODO not necessarily namedtuple? could be primitive type
    def to_row(self, obj: NT) -> tuple[Optional[Values], ...]:
        return tuple(self._to_row(obj))

    def from_row(self, row: Iterable[Any]) -> NT:
        riter = iter(row)
        res = self._from_row(riter)
        # the row must have been consumed exactly; peek one item to verify
        remaining = list(islice(riter, 0, 1))
        if len(remaining) != 0:
            raise CachewException(f'unconsumed items in iterator {remaining}')
        assert res is not None  # nosec # help mypy; top level will not be None
        return res

    def _to_row(self, obj) -> Iterator[Optional[Values]]:
        if self.primitive:
            yield obj
        elif self.union is not None:
            CachewUnion = self.union
            (uf,) = self.fields
            # TODO assert only one of them matches??
            union = CachewUnion(**{f.name: obj if isinstance(obj, f.type_) else None for f in uf.fields})
            yield from uf._to_row(union)
        else:
            if self.optional:
                is_none = obj is None
                yield is_none
            else:
                is_none = False
                assert obj is not None  # TODO hmm, that last assert is not very symmetric...

            if is_none:
                # pad with NULLs so the row stays fixed-width
                for _ in range(self.span - 1):
                    yield None
            else:
                yield from chain.from_iterable(f._to_row(getattr(obj, f.name)) for f in self.fields)

    def _from_row(self, row_iter):
        if self.primitive:
            return next(row_iter)
        elif self.union is not None:
            CachewUnion = self.union  # noqa: F841
            (uf,) = self.fields
            # TODO assert only one of them is not None?
            union_params = [r for r in uf._from_row(row_iter) if r is not None]
            assert len(union_params) == 1, union_params
            return union_params[0]
        else:
            if self.optional:
                is_none = next(row_iter)
            else:
                is_none = False

            if is_none:
                # consume (and sanity-check) the NULL padding emitted by _to_row
                for _ in range(self.span - 1):
                    x = next(row_iter)
                    assert x is None, x  # huh. assert is kinda opposite of producing value
                return None
            else:
                return self.type_(*(f._from_row(row_iter) for f in self.fields))

    # TODO not sure if we want to allow optionals on top level?
    def iter_columns(self) -> Iterator[Column]:
        used_names: set[str] = set()

        def col(name: str, tp) -> Column:
            # disambiguate clashes between nested and flattened names by
            # prepending underscores (see test_unique_columns)
            while name in used_names:
                name = '_' + name
            used_names.add(name)
            return Column(name, tp)

        if self.primitive:
            if self.name is None:
                raise AssertionError
            yield col(self.name, PRIMITIVES[self.type_])
        else:
            prefix = '' if self.name is None else self.name + '_'
            if self.optional:
                yield col(f'_{prefix}is_null', sqlalchemy.Boolean)
            for f in self.fields:
                for c in f.iter_columns():
                    yield col(f'{prefix}{c.name}', c.type)

    def __str__(self):
        # NOTE(review): the trailing f' ' looks truncated (probably meant to
        # include the field type) -- confirm against upstream history
        lines = [' ' * level + str(x.name) + ('?' if x.optional else '') + f' ' for level, x in self.flatten()]
        return '\n'.join(lines)

    def __repr__(self):
        return str(self)

    def flatten(self, level=0):
        # depth-first traversal yielding (nesting level, binder) pairs
        yield (level, self)
        for f in self.fields:
            yield from f.flatten(level=level + 1)
def test_mypy_annotations() -> None:
    """Check that Types and Values stay in sync (has to be dynamic; mypy won't handle it)."""
    collected = []
    for t in Types.__args__:  # type: ignore[attr-defined]
        (arg,) = t.__args__  # each entry is type[X]; unpack X
        collected.append(arg)

    def ordered(ts):
        return sorted(ts, key=lambda t: str(t))

    assert ordered(collected) == ordered(Values.__args__)  # type: ignore[attr-defined]

    for p in PRIMITIVE_TYPES:
        assert p in Values.__args__  # type: ignore[attr-defined]
@parametrize(
    ('tp', 'val'),
    [
        (int, 22),
        (bool, False),
        (Optional[str], 'abacaba'),
        (Union[str, int], 1),
    ],
)
def test_ntbinder_primitive(tp, val) -> None:
    # round-trip a primitive/optional/union value through the binder
    b = NTBinder.make(tp, name='x')
    row = b.to_row(val)
    vv = b.from_row(list(row))
    assert vv == val
def test_unique_columns(tmp_path: Path) -> None:  # noqa: ARG001
    """A flattened nested field (job.title -> job_title) may clash with a real
    field of the same name; the binder disambiguates with a leading underscore."""

    class Job(NamedTuple):
        company: str
        title: Optional[str]

    class Breaky(NamedTuple):
        job_title: int
        job: Optional[Job]

    expected = [
        'job_title',
        '_job_is_null',
        'job_company',
        '_job_title',
    ]
    assert [c.name for c in NTBinder.make(Breaky).columns] == expected
================================================
FILE: src/cachew/logging_helper.py
================================================
from __future__ import annotations
import logging
import os
import warnings
from functools import lru_cache
from typing import TYPE_CHECKING
def test() -> None:
    """Interactive demo contrasting stdlib logging defaults with make_logger (run this file directly)."""
    import sys
    from collections.abc import Callable

    # helper to print narration to stderr, next to the log output
    M: Callable[[str], None] = lambda s: print(s, file=sys.stderr)

    ## prepare exception for later
    try:
        None.whatever  # type: ignore[attr-defined] # noqa: B018
    except Exception as e:
        ex = e
    ##

    M(" Logging module's defaults are not great:")
    l = logging.getLogger('default_logger')
    l.error(
        "For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level"
    )

    M("\n The reason is that you need to remember to call basicConfig() first. Let's do it now:")
    logging.basicConfig()
    l.error(
        "OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number"
    )

    M(
        "\n Also exception logging is kinda lame, doesn't print traceback by default unless you remember to pass exc_info:"
    )
    l.exception(ex)  # type: ignore[possibly-undefined]

    M(
        "\n\n With make_logger you get a reasonable logging format, colours (via colorlog library) and other neat things:"
    )

    ll = make_logger('test')  # No need for basicConfig!
    ll.info("default level is INFO")
    ll.debug("... so this shouldn't be displayed")
    ll.warning("warnings are easy to spot!")

    M("\n Exceptions print traceback by default now:")
    ll.exception(ex)

    M(
        "\n You can (and should) use it via regular logging.getLogger after that, e.g. let's set logging level to DEBUG now"
    )
    logging.getLogger('test').setLevel(logging.DEBUG)
    ll.debug("... now debug messages are also displayed")
DEFAULT_LEVEL = 'INFO'
# {start}/{end} placeholders are filled with colour escapes (or empty strings) later
FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)-4d]{end} %(message)s'
FORMAT_NOCOLOR = FORMAT.format(start='', end='')

Level = int
# a level given as an int, a level name like 'DEBUG', or None for "use the default"
LevelIsh = Level | str | None
def mklevel(level: LevelIsh) -> Level:
    """Normalise a LevelIsh (int / name string / None) to a numeric logging level."""
    if isinstance(level, int):
        return level
    if level is None:
        return logging.NOTSET
    # level name, e.g. 'debug' -> logging.DEBUG
    return getattr(logging, level.upper())
def get_collapse_level() -> Level | None:
    """Collapse threshold from the environment, or None when collapsing is off."""
    # TODO not sure if should be specific to logger name?
    configured = os.environ.get('LOGGING_COLLAPSE', None)
    if configured is not None:
        return mklevel(configured)
    # legacy name, maybe deprecate?
    if os.environ.get('COLLAPSE_DEBUG_LOGS', None) is not None:
        return logging.DEBUG
    return None
def get_env_level(name: str) -> Level | None:
    """Per-logger level override from the environment, or None if not set."""
    PREFIX = 'LOGGING_LEVEL_'  # e.g. LOGGING_LEVEL_my_hypothesis=debug
    # shell doesn't allow using dots in var names without escaping, so also support underscore syntax
    dotted = os.environ.get(PREFIX + name, None)
    underscored = os.environ.get(PREFIX + name.replace('.', '_'), None)
    # NOTE: 'or' (not 'is None') on purpose -- an empty value falls through, matching original behaviour
    lvl = dotted or underscored
    if lvl is None:
        return None
    return mklevel(lvl)
def setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> None:
    """
    Wrapper to simplify logging setup.
    """
    if isinstance(logger, str):
        logger = logging.getLogger(logger)

    # an env-provided level always takes precedence over the argument
    env_level = get_env_level(logger.name)
    if env_level is not None:
        lvl = env_level
    else:
        lvl = mklevel(level if level is not None else DEFAULT_LEVEL)

    # only apply when the logger is still unset -- if the user already picked a
    # level, respect it
    if logger.level == logging.NOTSET:
        logger.setLevel(lvl)

    _setup_handlers_and_formatters(name=logger.name)
# cached since this should only be done once per logger instance
@lru_cache(None)
def _setup_handlers_and_formatters(name: str) -> None:
    """Attach the stream handler (collapsing if configured) and formatter to the logger."""
    logger = logging.getLogger(name)
    logger.addFilter(AddExceptionTraceback())

    collapse_level = get_collapse_level()
    # FIX: previously an extra logging.StreamHandler() was constructed here and
    # immediately overwritten by the line below -- removed as dead code
    ch = logging.StreamHandler() if collapse_level is None else CollapseLogsHandler(maxlevel=collapse_level)

    # default level for handler is NOTSET, which will make it process all messages
    # we rely on the logger to actually accept/reject log msgs
    logger.addHandler(ch)

    # this attribute is set to True by default, which causes log entries to be passed to root logger (e.g. if you call basicConfig beforehand)
    # even if log entry is handled by this logger ... not sure what's the point of this behaviour??
    logger.propagate = False

    try:
        # try colorlog first, so user gets nice colored logs
        import colorlog
    except ModuleNotFoundError:
        warnings.warn("You might want to 'pip install colorlog' for nice colored logs", stacklevel=2)
        formatter = logging.Formatter(FORMAT_NOCOLOR)
    else:
        # log_color/reset are specific to colorlog
        FORMAT_COLOR = FORMAT.format(start='%(log_color)s', end='%(reset)s')
        # colorlog should detect tty in principle, but doesn't handle everything for some reason
        # see https://github.com/borntyping/python-colorlog/issues/71
        if ch.stream.isatty():
            formatter = colorlog.ColoredFormatter(FORMAT_COLOR)
        else:
            formatter = logging.Formatter(FORMAT_NOCOLOR)
    ch.setFormatter(formatter)
# by default, logging.exception isn't logging traceback unless called inside of the exception handler
# which is a bit annoying since we have to pass exc_info explicitly
# also see https://stackoverflow.com/questions/75121925/why-doesnt-python-logging-exception-method-log-traceback-by-default
# todo also amend by post about defensive error handling?
class AddExceptionTraceback(logging.Filter):
    """When an ERROR record's message is itself an exception, attach its traceback as exc_info."""

    def filter(self, record: logging.LogRecord) -> bool:
        if record.levelname != 'ERROR':
            return True
        exc = record.msg
        has_info = record.exc_info is not None and record.exc_info != (None, None, None)
        if isinstance(exc, BaseException) and not has_info:
            record.exc_info = (type(exc), exc, exc.__traceback__)
        return True  # never suppress the record
# todo also save full log in a file?
class CollapseLogsHandler(logging.StreamHandler):
    '''
    Collapses subsequent debug log lines and redraws on the same line.
    Hopefully this gives both a sense of progress and doesn't clutter the terminal as much?
    '''

    last: bool = False  # whether the previous record was collapsed (cursor is mid-line)

    maxlevel: Level = logging.DEBUG  # everything with less or equal level will be collapsed

    def __init__(self, *args, maxlevel: Level, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.maxlevel = maxlevel

    def emit(self, record: logging.LogRecord) -> None:
        try:
            msg = self.format(record)
            # multiline messages can't be redrawn on a single line, so never collapse them
            cur = record.levelno <= self.maxlevel and '\n' not in msg
            if cur:
                if self.last:
                    self.stream.write('\033[K' + '\r')  # clear line + return carriage
            else:
                if self.last:
                    self.stream.write('\n')  # clean up after the last line
            self.last = cur
            columns, _ = os.get_terminal_size(0)
            # ugh. the columns thing is meh. dunno I guess ultimately need curses for that
            # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keywords (INFO/DEBUG/etc)
            self.stream.write(msg + ' ' * max(0, columns - len(msg)) + ('' if cur else '\n'))
            self.flush()
        # FIX: was a bare 'except:', which would also swallow KeyboardInterrupt/SystemExit;
        # narrowed to Exception, matching stdlib logging.Handler.emit convention
        except Exception:
            self.handleError(record)
def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger:
    """Fetch a logger by name with cachew's handler/formatter setup applied."""
    lgr = logging.getLogger(name)
    setup_logger(lgr, level=level)
    return lgr
# ughh. hacky way to have a single enlighten instance per interpreter, so it can be shared between modules
# not sure about this. I guess this should definitely be behind some flag
# OK, when stdout is not a tty, enlighten doesn't log anything, good
def get_enlighten():
    """Shared enlighten progress-bar manager; returns a Mock stub when disabled or unavailable."""
    # TODO could add env variable to disable enlighten for a module?
    from unittest.mock import (
        Mock,  # Mock to return stub so clients don't have to think about it
    )

    # for now hidden behind the flag since it's a little experimental
    if os.environ.get('ENLIGHTEN_ENABLE', None) is None:
        return Mock()

    try:
        import enlighten  # type: ignore[import-untyped]
    except ModuleNotFoundError:
        warnings.warn("You might want to 'pip install enlighten' for a nice progress bar", stacklevel=2)
        return Mock()

    # dirty, but otherwise a bit unclear how to share enlighten manager between packages that call each other
    shared = getattr(enlighten, 'INSTANCE', None)
    if shared is None:
        shared = enlighten.get_manager()
        setattr(enlighten, 'INSTANCE', shared)
    return shared
if __name__ == '__main__':
    # running this module directly shows the interactive logging demo
    test()


## legacy/deprecated methods for backwards compatibility
if not TYPE_CHECKING:
    LazyLogger = make_logger
    logger = make_logger
##
================================================
FILE: src/cachew/marshall/cachew.py
================================================
from __future__ import annotations
import types
from abc import abstractmethod
from collections import abc
from collections.abc import Sequence
from dataclasses import dataclass, is_dataclass
from datetime import UTC, date, datetime
from numbers import Real
from typing import ( # noqa: UP035
Any,
Dict,
List,
NamedTuple,
Optional,
Tuple,
Union,
get_args,
get_origin,
get_type_hints,
)
from zoneinfo import ZoneInfo
from ..common import TypeNotSupported
from ..utils import is_namedtuple, resolve_type_parameters
from .common import AbstractMarshall, Json
class CachewMarshall[T](AbstractMarshall[T]):
    """Marshall that (de)serializes values of T via a Schema built once from the type."""

    def __init__(self, Type_: type[T]) -> None:
        self.schema = build_schema(Type_)

    def dump(self, obj: T) -> Json:
        return self.schema.dump(obj)

    def load(self, dct: Json) -> T:
        return self.schema.load(dct)
# NOTE: using slots gives a small speedup (maybe 5%?)
# I suppose faster access to fields or something..
@dataclass(slots=True)
class Schema:
    # base for all (de)serialization schemas; 'type' is the runtime type this node handles
    type: Any

    @abstractmethod
    def dump(self, obj):
        raise NotImplementedError

    @abstractmethod
    def load(self, dct):
        raise NotImplementedError
@dataclass(slots=True)
class SPrimitive(Schema):
    # primitives (and Any) pass through untouched: the actual encoding is left
    # to the json serializer downstream
    def dump(self, obj):
        # NOTE: returning here directly (instead of calling identity lambda) gives about 20% speedup
        # I think custom types should have their own Schema subclass
        return obj
        # prim = primitives_to.get(self.type)
        # assert prim is not None
        # return prim(o)

    def load(self, dct):
        return dct
        # prim = primitives_from.get(self.type)
        # assert prim is not None
        # return prim(d)
@dataclass(slots=True)
class SDataclass(Schema):
    # schema for dataclasses and NamedTuples, serialized as {field: value} dicts
    # using list of tuples instead of dict gives about 5% speedup
    fields: tuple[tuple[str, Schema], ...]

    def dump(self, obj):
        # TODO would be nice if we didn't create a dictionary here
        # considering it is going to be serialized to json anyway
        # maybe we need to yield json bits actually?
        return {
            # would be kinda nice if we didn't have to use getattr here
            # but I think for dataclass this is actually the fastest way
            # TODO for NamedTuples could just use them as tuples.. think about separating
            k: ks.dump(getattr(obj, k))
            for k, ks in self.fields
        }

    def load(self, dct):
        # dict comprehension is meh, but not sure if there is a faster way?
        return self.type(**{
            k: ks.load(dct[k])
            for k, ks in self.fields
        })  # fmt: skip
@dataclass(slots=True)
class SUnion(Schema):
    # union values are stored as (variant_index, payload) pairs
    # it's a bit faster to cache indices here, gives about 15% speedup
    args: tuple[tuple[int, Schema], ...]

    def dump(self, obj):
        if obj is None:
            # if it's a None, then doesn't really matter how to serialize and deserialize it
            return (0, None)
        # TODO could do a bit of magic here and remember the last index that worked?
        # that way if some objects dominate the Union, the first isinstance would always work
        for tidx, a in self.args:
            if isinstance(obj, a.type):  # this takes quite a lot of time (sort of expected?)
                # using lists instead of dicts gives a bit of a speedup (about 15%)
                # so probably worth it even though a bit cryptic
                # also could add a tag or something?
                # NOTE: using tuple instead of list gives a tiiny speedup
                jj = a.dump(obj)
                return (tidx, jj)
                # {
                #     '__union_index__': tidx,
                #     '__value__': jj,
                # }
        raise RuntimeError(f"shouldn't happen: {self.args} {obj}")

    def load(self, dct):
        # tidx = d['__union_index__']
        # s = self.args[tidx]
        # return s.load(d['__value__'])
        tidx, val = dct
        if val is None:
            # counterpart for None handling in .dump method
            return None
        _, s = self.args[tidx]
        return s.load(val)
@dataclass(slots=True)
class SList(Schema):
    """Schema for list[X]: serialized as a tuple, loaded back into a list."""

    arg: Schema

    def dump(self, obj):
        return tuple(map(self.arg.dump, obj))

    def load(self, dct):
        return [self.arg.load(item) for item in dct]
@dataclass(slots=True)
class STuple(Schema):
    """Schema for fixed-shape tuple[X, Y, ...]; element count must match exactly (strict zip)."""

    args: tuple[Schema, ...]

    def dump(self, obj):
        pairs = zip(self.args, obj, strict=True)
        return tuple(schema.dump(value) for schema, value in pairs)

    def load(self, dct):
        pairs = zip(self.args, dct, strict=True)
        return tuple(schema.load(value) for schema, value in pairs)
@dataclass(slots=True)
class SSequence(Schema):
    """Schema for homogeneous Sequence[X]; round-trips as a tuple on both sides."""

    arg: Schema

    def dump(self, obj):
        return tuple(map(self.arg.dump, obj))

    def load(self, dct):
        return tuple(map(self.arg.load, dct))
@dataclass(slots=True)
class SDict(Schema):
    """Schema for dict[K, V] where K is primitive; only values go through a sub-schema."""

    ft: SPrimitive  # key schema (primitive, keys pass through)
    tt: Schema  # value schema

    def dump(self, obj):
        dump_value = self.tt.dump
        return {key: dump_value(value) for key, value in obj.items()}

    def load(self, dct):
        load_value = self.tt.load
        return {key: load_value(value) for key, value in dct.items()}
# TODO unify with primitives?
# types whose values can be embedded in json as-is (used for exception args)
JTypes = {int, str, type(None), float, bool}
def _exc_helper(args):
    """Yield a JSON-safe representation of each exception arg (lossy for rich types)."""
    for arg in args:
        arg_type = type(arg)
        if arg_type in JTypes:
            yield arg
        elif issubclass(arg_type, date):
            # TODO would be nice to restore datetime from cache too
            # maybe generally save exception as a union? or intact and let orjson save it?
            yield arg.isoformat()
        else:
            yield str(arg)  # not much we can do..
@dataclass(slots=True)
class SException(Schema):
    # exceptions round-trip via their args tuple (JSON-safe-ified by _exc_helper)
    def dump(self, obj: Exception) -> Json:
        return tuple(_exc_helper(obj.args))

    def load(self, dct: Json):
        return self.type(*dct)
try:
    # defensive to avoid dependency on pytz when we switch to python >= 3.9
    import pytz
except ModuleNotFoundError:
    # dummy, this is only needed for isinstance check below
    class pytz_BaseTzInfo:
        zone: str

    def make_tz_pytz(zone: str):
        # pytz-tagged cache entries can't be deserialized without pytz installed
        raise RuntimeError(f"Install pytz to deserialize {zone}")

else:
    pytz_BaseTzInfo = pytz.BaseTzInfo  # type: ignore[misc,assignment]

    make_tz_pytz = pytz.timezone
# tags distinguishing which timezone library produced the stored zone name;
# just ints to avoid inflating db size
# for now, we try to preserve actual timezone object just in case since they do have somewhat incompatible apis
_TZTAG_ZONEINFO = 1
_TZTAG_PYTZ = 2
@dataclass(slots=True)
class SDatetime(Schema):
    # datetimes are stored as (iso string, zone name or None, zone tag or None)
    def dump(self, obj: datetime) -> Json:
        iso = obj.isoformat()
        tz = obj.tzinfo
        if tz is None:
            return (iso, None, None)
        if isinstance(tz, ZoneInfo):
            return (iso, tz.key, _TZTAG_ZONEINFO)
        elif isinstance(tz, pytz_BaseTzInfo):
            zone = tz.zone
            # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
            assert zone is not None, (obj, tz)
            return (iso, zone, _TZTAG_PYTZ)
        else:
            # unknown tzinfo flavour: the fixed offset survives in the iso string
            return (iso, None, None)

    def load(self, dct: tuple):
        iso, zone, zone_tag = dct
        dt = datetime.fromisoformat(iso)
        if zone is None:
            return dt
        # reconstruct with the same tz library that produced the entry
        make_tz = ZoneInfo if zone_tag == _TZTAG_ZONEINFO else make_tz_pytz
        tz = make_tz(zone)
        return dt.astimezone(tz)
@dataclass(slots=True)
class SDate(Schema):
    # plain dates round-trip as ISO-8601 strings
    def dump(self, obj: date) -> Json:
        return obj.isoformat()

    def load(self, dct: str):
        return date.fromisoformat(dct)
# maps a python type to the type used for isinstance checks in SUnion/SPrimitive
PRIMITIVES = {
    # int and float are handled a bit differently to allow implicit casts
    # isinstance(.., Real) works both for int and for float
    # Real can't be serialized back, but if you look in SPrimitive, it leaves the values intact anyway
    # since the actual serialization of primitives is handled by orjson
    int: Real,
    float: Real,
    str: str,
    type(None): type(None),
    bool: bool,
    # if type is Any, there isn't much we can do to dump it -- just dump into json and rely on the best
    # so in this sense it works exacly like primitives
    Any: Any,
}
def build_schema(Type) -> Schema:
    """
    Recursively build the Schema tree for Type.

    Dispatch order matters: unions before containers, and datetime before date
    (datetime is a date subclass).
    """
    # just to avoid confusion in case of weirdness with stringish type annotations
    assert not isinstance(Type, str), Type

    Type = resolve_type_parameters(Type)

    ptype = PRIMITIVES.get(Type)
    if ptype is not None:
        return SPrimitive(type=ptype)

    origin = get_origin(Type)

    # origin is 'unsubscripted/erased' version of type
    # if origin is NOT None, it's some sort of generic type
    if origin is None:
        if issubclass(Type, Exception):
            return SException(type=Type)

        if issubclass(Type, datetime):
            return SDatetime(type=Type)

        if issubclass(Type, date):
            return SDate(type=Type)

        if not (is_dataclass(Type) or is_namedtuple(Type)):
            raise TypeNotSupported(type_=Type, reason='unknown type')

        try:
            hints = get_type_hints(Type)
        except TypeError as te:
            # this can happen for instance on 3.9 if pipe syntax was used for Union types
            # would be nice to provide a friendlier error though
            raise TypeNotSupported(type_=Type, reason='failed to get type hints') from te

        fields = tuple((k, build_schema(t)) for k, t in hints.items())
        return SDataclass(
            type=Type,
            fields=fields,
        )

    args = get_args(Type)

    # covers both typing.Union and the 3.10+ `X | Y` form
    is_union = origin is Union or origin is types.UnionType
    if is_union:
        # We 'erasing' types (since generic types don't work with isinstance checks).
        # So we need to make sure the types are unique to make sure we can deserialise them.
        schemas = [build_schema(a) for a in args]
        union_types = [s.type for s in schemas if s.type is not Real]
        if len(set(union_types)) != len(union_types):
            raise TypeNotSupported(type_=Type, reason=f'runtime union arguments are not unique: {union_types}')
        return SUnion(
            type=origin,
            args=tuple(
                (tidx, s)
                for tidx, s in enumerate(schemas)
            ),
        )  # fmt: skip

    is_listish = origin is list
    if is_listish:
        (t,) = args
        return SList(
            type=origin,
            arg=build_schema(t),
        )

    # hmm check for is typing.Sequence doesn't pass for some reason
    # perhaps because it's a deprecated alias?
    is_tuplish = origin is tuple or origin is abc.Sequence
    if is_tuplish:
        if origin is tuple:
            # this is for Tuple[()], which is the way to represent empty tuple
            # before python 3.11, get_args for that gives ((),) instead of an empty tuple () as one might expect
            if args == ((),):
                args = ()
            return STuple(
                type=origin,
                args=tuple(build_schema(a) for a in args),
            )
        else:
            (t,) = args
            return SSequence(
                type=origin,
                arg=build_schema(t),
            )

    is_dictish = origin is dict
    if is_dictish:
        (ft, tt) = args
        fts = build_schema(ft)
        tts = build_schema(tt)
        # keys must be primitive so they can stay as plain json object keys
        assert isinstance(fts, SPrimitive)
        return SDict(
            type=origin,
            ft=fts,
            tt=tts,
        )

    raise RuntimeError(f"unsupported: {Type=} {origin=} {args=}")
######### tests
def _test_identity(obj, Type_, expected=None):
    """
    Round-trip *obj* through CachewMarshall for Type_ and assert the result
    matches *expected* (defaults to *obj* itself). Returns (dumped_json, restored).
    """
    if expected is None:
        expected = obj
    marshall = CachewMarshall(Type_)
    dumped = marshall.dump(obj)
    restored = marshall.load(dumped)

    def _comparable(value):
        # Exceptions don't define structural equality, so compare (type, args) instead
        if isinstance(value, Exception):
            return (type(value), value.args)
        if type(value) is list:
            return [(type(item), item.args) if isinstance(item, Exception) else item for item in value]
        return value

    assert _comparable(expected) == _comparable(restored), (expected, restored)
    return (dumped, restored)
## this is used for test below...
# however if we define this inside the test function, it fails if from __future__ import annotations is present on the file..
# NOTE: these are PEP 695 (python 3.12+) type alias statements, exercised by test_serialize_and_deserialize
type _IntType = int
type _StrIntType = str | int
##
# TODO customise with cattrs
# TODO customise with cattrs
def test_serialize_and_deserialize() -> None:
    """Round-trips a wide variety of types through CachewMarshall via _test_identity."""
    import pytest

    helper = _test_identity

    # primitives
    helper(1, int)
    helper('aaa', str)
    helper(None, type(None))
    # TODO emit other value as none type? not sure what should happen

    # implicit casts, simple version
    helper(None, int)
    helper(None, str)
    helper(1, float)

    # implicit casts, inside other types
    # technically not type safe, but might happen in practice
    # doesn't matter how to deserialize None anyway so let's allow this
    helper(None, str | int)
    # old syntax
    helper(None, Union[str, int])  # noqa: UP007
    # even though 1 is not isinstance(float), often it ends up as float in data
    # see https://github.com/karlicoss/cachew/issues/54
    helper(1, float | str)
    helper(2, float | int)
    helper(2.0, float | int)
    helper((1, 2), tuple[int, float])

    # optionals
    helper('aaa', str | None)
    helper(None, str | None)
    # old syntax
    helper('aaa', Optional[str])  # noqa: UP045
    helper('aaa', Union[str, None])  # noqa: UP007
    helper(None, Union[str, None])  # noqa: UP007

    # lists/tuples/sequences
    # TODO test with from __future__ import annotations..
    helper([1, 2, 3], list[int])
    helper([1, 2, 3], Optional[List[int]])  # noqa: UP006,UP045
    helper([1, 2, 3], Sequence[int], expected=(1, 2, 3))
    helper((1, 2, 3), Sequence[int])
    helper((1, 2, 3), tuple[int, int, int])
    # old syntax
    helper([1, 2, 3], List[int])  # noqa: UP006
    helper((1, 2, 3), Tuple[int, int, int])  # noqa: UP006
    helper((1, 2, 3), Optional[tuple[int, int, int]])  # noqa: UP045

    # dicts
    helper({'a': 'aa', 'b': 'bb'}, dict[str, str])
    helper({'a': None, 'b': 'bb'}, dict[str, str | None])
    helper({'a': 'aa', 'b': 'bb'}, dict[str, str])
    # old syntax
    helper({'a': None, 'b': 'bb'}, Dict[str, Optional[str]])  # noqa: UP006,UP045

    # unions
    helper('aaa', str | int)
    # old syntax
    helper(1, Union[str, int])  # noqa: UP007

    # compounds of simple types
    helper(['1', 2, '3'], list[str | int])
    # old syntax
    helper(['1', 2, '3'], list[Union[str, int]])  # noqa: UP007

    # TODO need to add test for equivalent dataclasses

    @dataclass
    class Point:
        x: int
        y: int

    # dataclasses
    helper(Point(x=1, y=2), Point)

    # Namedtuple
    class NT(NamedTuple):
        first: str
        last: str

    helper(NT(first='aaa', last='bbb'), NT)

    @dataclass
    class WithJson:
        id: int
        raw_data: dict[str, Any]

    ## type aliases including new 3.12 type aliases
    # this works..
    StrInt = str | int
    helper('aaa', StrInt)
    helper('aaa', _StrIntType)
    helper([1, 2, 3], list[_IntType])

    @dataclass
    class TestTypeAlias:
        x: _IntType
        value: _StrIntType

    helper(TestTypeAlias(x=1, value='aaa'), TestTypeAlias)
    ##

    # json-ish stuff
    helper({}, dict[str, Any])
    helper(WithJson(id=123, raw_data={'payload': 'whatever', 'tags': ['a', 'b', 'c']}), WithJson)
    helper([], list[Any])

    # exceptions
    helper(RuntimeError('whatever!'), RuntimeError)
    # fmt: off
    helper([
        RuntimeError('I', 'am', 'exception', 123),
        Point(x=1, y=2),
        Point(x=11, y=22),
        RuntimeError('more stuff'),
        RuntimeError(),
    ], list[RuntimeError | Point])
    exc_with_datetime = Exception('I happenned on', datetime.fromisoformat('2021-04-03T10:11:12'))
    exc_with_datetime_exp = Exception('I happenned on', '2021-04-03T10:11:12')
    helper(exc_with_datetime, Exception, expected=exc_with_datetime_exp)
    # fmt: on

    # datetimes
    import pytz

    tz_london = pytz.timezone('Europe/London')
    dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S')
    dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S')
    dwinter_tz = tz_london.localize(dwinter)
    dsummer_tz = tz_london.localize(dsummer)
    dates_tz = [
        dwinter_tz,
        dsummer_tz,
    ]
    tz_sydney = ZoneInfo('Australia/Sydney')
    ## these will have same local time (2025-04-06 02:01:00) in Sydney due to DST shift!
    ## the second one will have fold=1 set to disambiguate
    utc_before_shift = datetime.fromisoformat('2025-04-05T15:01:00+00:00')
    utc_after__shift = datetime.fromisoformat('2025-04-05T16:01:00+00:00')
    ##
    sydney_before = utc_before_shift.astimezone(tz_sydney)
    sydney__after = utc_after__shift.astimezone(tz_sydney)
    dates_tz.extend([sydney_before, sydney__after])
    dates = [
        *dates_tz,
        dwinter,
        dsummer,
        dsummer.replace(tzinfo=UTC),
    ]
    for d in dates:
        _jj, dd = helper(d, datetime)
        assert str(d) == str(dd)
        # test that we preserve zone names
        if d in dates_tz:
            # this works both with pytz and zoneinfo without getting .zone or .key attributes
            assert str(d.tzinfo) == str(dd.tzinfo)

    # also pin the exact serialized representation (value, zone name, tz backend tag)
    assert helper(dsummer_tz, datetime)[0] == ('2020-08-03T01:02:03+01:00', 'Europe/London', _TZTAG_PYTZ)
    assert helper(dwinter, datetime)[0] == ('2020-02-03T01:02:03', None, None)
    assert helper(sydney_before, datetime)[0] == ('2025-04-06T02:01:00+11:00', 'Australia/Sydney', _TZTAG_ZONEINFO)
    assert helper(sydney__after, datetime)[0] == ('2025-04-06T02:01:00+10:00', 'Australia/Sydney', _TZTAG_ZONEINFO)
    assert helper(dwinter.date(), date)[0] == '2020-02-03'

    # unsupported types
    class NotSupported:
        pass

    with pytest.raises(RuntimeError, match=r".*NotSupported.* isn't supported by cachew"):
        helper([NotSupported()], list[NotSupported])

    # edge cases
    helper((), tuple[()])

    # unions of generic sequences and such
    # these don't work because the erased type of both is just 'list'..
    # so there is no way to tell which one we need to construct :(
    with pytest.raises(TypeNotSupported, match=r".*runtime union arguments are not unique"):
        helper([1, 2, 3], list[int] | list[Exception])
    with pytest.raises(TypeNotSupported, match=r".*runtime union arguments are not unique"):
        helper([1, 2, 3], list[Exception] | list[int])
================================================
FILE: src/cachew/marshall/common.py
================================================
from abc import abstractmethod
from typing import Any
# JSON-compatible value shape produced/consumed by marshall implementations
type Json = dict[str, Any] | tuple[Any, ...] | str | float | int | bool | None
class AbstractMarshall[T]:
    """Interface for converting objects of type T to and from a Json representation."""

    @abstractmethod
    def dump(self, obj: T) -> Json:
        """Serialize *obj* into a Json-compatible value."""
        raise NotImplementedError

    @abstractmethod
    def load(self, dct: Json) -> T:
        """Reconstruct an object of type T from its Json representation."""
        raise NotImplementedError
================================================
FILE: src/cachew/py.typed
================================================
================================================
FILE: src/cachew/pytest.py
================================================
"""
Helpers to prevent depending on pytest in runtime
"""
import sys
import typing
# detect pytest via sys.modules instead of importing it, to avoid a hard runtime dependency
under_pytest = 'pytest' in sys.modules

if typing.TYPE_CHECKING or under_pytest:
    import pytest

    parametrize = pytest.mark.parametrize
else:
    # pytest not available at runtime: substitute a no-op decorator factory
    def parametrize(*_args, **_kwargs):
        def wrapper(f):
            return f

        return wrapper
================================================
FILE: src/cachew/tests/marshall.py
================================================
# ruff: noqa: ARG001 # ruff thinks pytest fixtures are unused arguments
import shutil
import sqlite3
import sys
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Literal
import orjson
import pytest
from ..marshall.cachew import CachewMarshall
from ..marshall.common import Json
from .utils import (
gc_control, # noqa: F401
profile,
running_on_ci,
timer,
)
# which (de)serialization implementation a benchmark run exercises
Impl = Literal[
    'cachew',  # our custom deserialization
    'cattrs',
    'legacy',  # our legacy deserialization
]
# don't include legacy by default, it's only here just for the sake of comparing once before switch
Impls: list[Impl] = ['cachew', 'cattrs']
def do_test(*, test_name: str, Type, factory, count: int, impl: Impl = 'cachew') -> None:
    """
    Benchmark helper: runs `count` objects produced by `factory` through the chosen
    `impl`, timing/profiling each stage separately (serialize, json dump/load,
    sqlite dump/load, jsonl dump/load, deserialize) and sanity-checking round-trips.
    """
    if count > 100 and running_on_ci:
        pytest.skip("test too heavy for CI, only meant to run manually")

    to_json: Any
    from_json: Any
    if impl == 'cachew':
        marshall = CachewMarshall(Type_=Type)
        to_json = marshall.dump
        from_json = marshall.load
    elif impl == 'legacy':
        from ..legacy import NTBinder

        # NOTE: legacy binder emits a tuple which can be inserted directly into the database
        # so 'json dump' and 'json load' should really be disregarded for this flavor
        # if you're comparing with implementation, you should compare
        # legacy serializing as the sum of serializing + json dump
        # that said, this way legacy will have a bit of an advantage since custom types (e.g. datetime)
        # would normally be handled by sqlalchemy instead
        binder = NTBinder.make(Type)
        to_json = binder.to_row
        from_json = binder.from_row
    elif impl == 'cattrs':
        from cattrs import Converter

        converter = Converter()

        from typing import get_args

        # TODO use later
        # from typing import Union, get_origin
        # import types
        # def is_union(type_) -> bool:
        #     origin = get_origin(type_)
        #     return origin is Union or origin is types.UnionType

        def union_structure_hook_factory(_):
            def union_hook(data, type_):
                args = get_args(type_)
                if data is None:  # we don't try to coerce None into anything
                    return None
                for t in args:
                    try:
                        res = converter.structure(data, t)
                    except Exception:
                        continue
                    else:
                        return res
                raise ValueError(f"Could not cast {data} to {type_}")

            return union_hook

        # borrowed from https://github.com/python-attrs/cattrs/issues/423
        # uhh, this doesn't really work straightaway...
        # likely need to combine what cattr does with configure_tagged_union
        # converter.register_structure_hook_factory(is_union, union_structure_hook_factory)
        # configure_tagged_union(
        #     union=Type,
        #     converter=converter,
        # )

        # NOTE: this seems to give a bit of speedup... maybe raise an issue or something?
        # fmt: off
        unstruct_func = converter._unstructure_func.dispatch(Type)  # type: ignore[call-arg, misc]  # about 20% speedup
        struct_func = converter._structure_func .dispatch(Type)  # type: ignore[call-arg, misc]  # TODO speedup
        # fmt: on
        to_json = unstruct_func
        # todo would be nice to use partial? but how do we bind a positional arg?
        from_json = lambda x: struct_func(x, Type)
    else:
        raise RuntimeError(impl)

    print(file=sys.stderr)  # kinda annoying, pytest starts printing on the same line as test name

    with profile(test_name + ':baseline'), timer(f'building {count} objects of type {Type}'):
        objects = list(factory(count=count))

    # lists are pre-sized and filled by index so allocation doesn't skew the timed loops
    jsons: list[Json] = [None for _ in range(count)]
    with profile(test_name + ':serialize'), timer(f'serializing {count} objects of type {Type}'):
        for i in range(count):
            jsons[i] = to_json(objects[i])  # ty: ignore[invalid-assignment]

    strs: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]
    with profile(test_name + ':json_dump'), timer(f'json dump {count} objects of type {Type}'):
        for i in range(count):
            # TODO any orjson options to speed up?
            strs[i] = orjson.dumps(jsons[i])

    db = Path('/tmp/cachew_test/db.sqlite')
    if db.parent.exists():
        shutil.rmtree(db.parent)
    db.parent.mkdir()
    with profile(test_name + ':sqlite_dump'), timer(f'sqlite dump {count} objects of type {Type}'):
        with sqlite3.connect(db) as conn:
            conn.execute('CREATE TABLE data (value BLOB)')
            conn.executemany('INSERT INTO data (value) VALUES (?)', [(s,) for s in strs])
        # the with-block only commits; close the connection explicitly
        conn.close()

    strs2: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]
    with profile(test_name + ':sqlite_load'), timer(f'sqlite load {count} objects of type {Type}'):
        with sqlite3.connect(db) as conn:
            i = 0
            for (value,) in conn.execute('SELECT value FROM data'):
                strs2[i] = value
                i += 1
        conn.close()

    cache = db.parent / 'cache.jsonl'
    with profile(test_name + ':jsonl_dump'), timer(f'jsonl dump {count} objects of type {Type}'):
        with cache.open('wb') as fw:
            for s in strs:
                fw.write(s + b'\n')

    strs3: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]
    with profile(test_name + ':jsonl_load'), timer(f'jsonl load {count} objects of type {Type}'):
        i = 0
        with cache.open('rb') as fr:
            for l in fr:
                l = l.rstrip(b'\n')
                strs3[i] = l
                i += 1

    assert strs2[:100] + strs2[-100:] == strs3[:100] + strs3[-100:]  # just in case

    jsons2: list[Json] = [None for _ in range(count)]
    with profile(test_name + ':json_load'), timer(f'json load {count} objects of type {Type}'):
        for i in range(count):
            # TODO any orjson options to speed up?
            jsons2[i] = orjson.loads(strs2[i])

    objects2 = [None for _ in range(count)]
    with profile(test_name + ':deserialize'), timer(f'deserializing {count} objects of type {Type}'):
        for i in range(count):
            objects2[i] = from_json(jsons2[i])  # ty: ignore[invalid-argument-type]

    assert objects[:100] + objects[-100:] == objects2[:100] + objects2[-100:]
@dataclass
class Name:
    # minimal dataclass payload used in the union benchmark below
    first: str
    last: str
@pytest.mark.parametrize('impl', Impls)
@pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_union_str_dataclass(impl: Impl, count: int, gc_control, request) -> None:
    """Benchmark (de)serialization of a `str | Name` union payload."""
    # NOTE: previously was union_str_namedtuple, but adapted to work with cattrs for now
    # perf difference between datacalss/namedtuple here seems negligible so old benchmark results should apply
    if impl == 'cattrs':
        pytest.skip('TODO need to adjust the handling of Union types..')

    def factory(count: int):
        # alternate between the two union branches
        objects: list[str | Name] = []
        for i in range(count):
            if i % 2 == 0:
                objects.append(str(i))
            else:
                objects.append(Name(first=f'first {i}', last=f'last {i}'))
        return objects

    do_test(test_name=request.node.name, Type=str | Name, factory=factory, count=count, impl=impl)
# OK, performance with calling this manually (not via pytest) is the same
# do_test_union_str_dataclass(count=1_000_000, test_name='adhoc')
@pytest.mark.parametrize('impl', Impls)
@pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_datetimes(impl: Impl, count: int, gc_control, request) -> None:
    """Benchmark (de)serialization of tz-aware datetimes (pytz + stdlib UTC)."""
    if impl == 'cattrs':
        pytest.skip('TODO support datetime with pytz for cattrs')

    import pytz

    def factory(*, count: int):
        # cycle through several tzinfo flavors across an evenly-spaced date range
        tzs = [
            pytz.timezone('Europe/Berlin'),
            UTC,
            pytz.timezone('America/New_York'),
        ]
        start = datetime.fromisoformat('1990-01-01T00:00:00')
        end = datetime.fromisoformat('2030-01-01T00:00:00')
        step = (end - start) / count
        for i in range(count):
            dt = start + step * i
            tz = tzs[i % len(tzs)]
            yield dt.replace(tzinfo=tz)

    do_test(test_name=request.node.name, Type=datetime, factory=factory, count=count, impl=impl)
@pytest.mark.parametrize('impl', Impls)
@pytest.mark.parametrize('count', [99, 1_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_nested_dataclass(impl: Impl, count: int, gc_control, request) -> None:
    """Benchmark (de)serialization of dataclasses nested one level deep."""
    # NOTE: was previously named test_many_from_cachew
    @dataclass
    class UUU:
        xx: int
        yy: int

    @dataclass
    class TE2:
        value: int
        uuu: UUU
        value2: int

    def factory(*, count: int):
        for i in range(count):
            yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i)

    do_test(test_name=request.node.name, Type=TE2, factory=factory, count=count, impl=impl)
# TODO next test should probs be runtimeerror?
================================================
FILE: src/cachew/tests/test_cachew.py
================================================
# ruff: noqa: ARG001 # ruff thinks pytest fixtures are unused arguments
import hashlib
import inspect
import platform
import string
import sys
import time
import timeit
from collections.abc import Iterable, Iterator, Sequence
from concurrent.futures import ProcessPoolExecutor
from contextlib import nullcontext
from dataclasses import asdict, dataclass
from datetime import UTC, date, datetime
from itertools import chain, islice
from pathlib import Path
from random import Random
from subprocess import check_call, check_output, run
from time import sleep
from typing import (
Any,
NamedTuple,
cast,
)
import patchy
import pytest
from more_itertools import ilen, last, one, unique_everseen
from .. import (
Backend,
CachewException,
cachew,
callable_name,
get_logger,
settings,
)
from .utils import (
gc_control, # noqa: F401
running_on_ci,
)
logger = get_logger()
@pytest.fixture(autouse=True)
def set_default_cachew_dir(tmp_path: Path):
    # point the default cache dir at a per-test temporary location
    tpath = tmp_path / 'cachew_default'
    settings.DEFAULT_CACHEW_DIR = tpath
@pytest.fixture(autouse=True)
def throw_on_errors():
    # NOTE: in tests we always throw on errors, it's a more reasonable default for testing.
    # we still check defensive behaviour in test_defensive
    settings.THROW_ON_ERROR = True
# TODO restore it?
@pytest.fixture(autouse=True, params=['sqlite', 'file'])
def set_backend(restore_settings, request):
    # run every test against both supported storage backends
    backend = request.param
    settings.DEFAULT_BACKEND = backend
# TODO restore it??
@pytest.fixture
def restore_settings():
    """Snapshot cachew settings before the test and restore them afterwards."""
    saved = {name: value for name, value in settings.__dict__.items() if not name.startswith('__')}
    try:
        yield
    finally:
        for name, value in saved.items():
            setattr(settings, name, value)
class UUU(NamedTuple):
    # simple two-field record used as a payload throughout the tests
    xx: int
    yy: int
def test_simple() -> None:
    # just make sure all the high level cachew stuff is working
    @cachew
    def fun() -> Iterable[UUU]:
        yield from []

    list(fun())
def test_string_annotation_old() -> None:
    """
    For some reason collections.abc.Iterable doesn't seem to work here on python <= 3.11
    , it only sees 'UUU' as a string

    Keeping this just as a demonstration, probably not worth trying to support as it's fairly esoteric combo.
    """
    from typing import Iterable as typing_Iterable  # noqa: UP035

    @cachew
    def fun() -> typing_Iterable['UUU']:
        yield from []

    # should properly infer UUU type
    list(fun())
def test_string_annotation_new() -> None:
    """Forward-referenced ('UUU') return annotations should still be resolvable."""

    @cachew
    def fun() -> Iterable['UUU']:
        yield from []

    # should properly infer UUU type
    list(fun())
def test_custom_hash(tmp_path: Path) -> None:
    """
    Demo of using argument's modification time to determine if underlying data changed
    """
    src = tmp_path / 'source'
    src.write_text('0')

    entities = [
        UUU(xx=1, yy=1),
        UUU(xx=2, yy=2),
        UUU(xx=3, yy=3),
    ]
    calls = 0  # counts how many times the underlying function actually ran

    def get_path_version(path: Path):
        ns = path.stat().st_mtime_ns
        # hmm, this might be unreliable, sometimes mtime doesn't change even after modifications?
        # I suppose it takes some time for them to sync or something...
        # so let's compute md5 or something in addition..
        md5 = hashlib.md5(path.read_bytes()).digest()
        return str((ns, md5))

    @cachew(
        cache_path=tmp_path,
        depends_on=get_path_version,  # when path is updated, underlying cache would be discarded
    )
    def data(path: Path) -> Iterable[UUU]:
        nonlocal calls
        calls += 1
        count = int(path.read_text())
        return entities[:count]

    ldata = lambda: list(data(path=src))

    # file contains '0' -> no entities; repeated calls hit the cache
    assert len(ldata()) == 0
    assert len(ldata()) == 0
    assert len(ldata()) == 0
    assert calls == 1

    src.write_text('1')
    assert ldata() == entities[:1]
    assert ldata() == entities[:1]
    assert calls == 2

    src.write_text('3')
    assert ldata() == entities
    assert ldata() == entities
    assert calls == 3
def test_caching(tmp_path: Path) -> None:
    """First call pays the artificial sleeps; the second should be served from cache."""

    @cachew(tmp_path)
    def data() -> Iterator[UUU]:
        time.sleep(1)
        for i in range(5):
            yield UUU(xx=i, yy=i)
            time.sleep(1)

    # custom timeit template so timeit returns the statement's value as well
    # https://stackoverflow.com/a/40385994/706389
    template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""
    timeit.template = template  # type: ignore[attr-defined]
    timer = timeit.Timer(lambda: len(list(data())))

    t, cnt = cast(tuple[float, int], timer.timeit(number=1))
    assert cnt == 5
    assert t > 5.0, 'should take at least 5 seconds'

    t, cnt = cast(tuple[float, int], timer.timeit(number=1))
    assert cnt == 5
    assert t < 2.0, 'should be pretty much instantaneous'
def test_error(tmp_path: Path) -> None:
    '''
    Test behaviour when the first time cache is initialized it ends up with an error
    '''
    cache_file = tmp_path / 'cache'
    assert not cache_file.exists(), cache_file  # just precondition

    should_raise = True

    @cachew(cache_file, force_file=True)
    def fun() -> Iterator[str]:
        yield 'string1'
        if should_raise:
            raise RuntimeError('oops')
        yield 'string2'

    with pytest.raises(RuntimeError, match='oops'):
        list(fun())

    # vvv this would be nice but might be tricky because of the way sqlite works (i.e. wal mode creates a file)
    # assert not cache_file.exists(), cache_file
    # perhaps doesn't hurt either way as long this vvv works properly

    # shouldn't cache anything and crach again
    with pytest.raises(RuntimeError, match='oops'):
        list(fun())

    should_raise = False
    assert list(fun()) == ['string1', 'string2']
def test_cache_path(tmp_path: Path) -> None:
    '''
    Tests various ways of specifying cache path
    '''
    calls = 0  # counts actual invocations of the wrapped function

    def orig() -> Iterable[int]:
        nonlocal calls
        yield 1
        yield 2
        calls += 1

    # nonexistent path: should be created (as a directory by default)
    fun = cachew(tmp_path / 'non_existent_dir' / 'cache_dir')(orig)
    assert list(fun()) == [1, 2]
    assert calls == 1
    assert list(fun()) == [1, 2]
    assert calls == 1

    # dir by default
    cdir = tmp_path / 'non_existent_dir' / 'cache_dir'
    assert cdir.is_dir()
    cfile = one(cdir.glob('*'))
    assert cfile.name.startswith('cachew.tests.test_cachew:test_cache_path.')

    # treat None as "don't cache"
    fun = cachew(cache_path=None)(orig)
    assert list(fun()) == [1, 2]
    assert calls == 2
    assert list(fun()) == [1, 2]
    assert calls == 3

    # a pre-existing file is used as the cache file directly
    f = tmp_path / 'a_file'
    f.touch()
    fun = cachew(cache_path=f)(orig)
    assert list(fun()) == [1, 2]
    assert calls == 4
    assert list(fun()) == [1, 2]
    assert calls == 4

    fun = cachew(tmp_path / 'name', force_file=True)(orig)
    assert list(fun()) == [1, 2]
    assert calls == 5
    assert list(fun()) == [1, 2]
    assert calls == 5
    # if passed force_file, also treat as file
    assert (tmp_path / 'name').is_file()

    # treat None as "don't cache" ('factory')
    # hmm not sure why mypy complains here.. might better if we get to use ParamSpec?
    fun = cachew(cache_path=lambda *args: None)(orig)  # type: ignore[arg-type]  # noqa: ARG005
    assert list(fun()) == [1, 2]
    assert calls == 6
    assert list(fun()) == [1, 2]
    assert calls == 7
# TODO this won't work at the moment
# f.write_text('garbage')
# not sure... on the one hand could just delete the garbage file and overwrite with db
# on the other hand, wouldn't want to delete some user file by accident
class UGood(NamedTuple):
    # supported by cachew: a plain annotated NamedTuple
    x: int
class UBad:
    # deliberately unsupported: no annotated fields for cachew to introspect
    pass
def test_unsupported_class(tmp_path: Path) -> None:
    """Types cachew can't serialize should raise CachewException at decoration time."""
    with pytest.raises(CachewException, match=r'.*failed to infer cache type.*'):

        @cachew(cache_path=tmp_path)
        def fun() -> list[UBad]:
            return [UBad()]

    with pytest.raises(CachewException, match=r".*can't infer type from.*"):

        @cachew(cache_path=tmp_path)
        def fun2() -> Iterable[UGood | UBad]:
            yield UGood(x=1)
            yield UBad()
            yield UGood(x=2)
class TE2(NamedTuple):
    # nested record (contains UUU) used by the test_many benchmark
    value: int
    uuu: UUU
    value2: int
# you can run one specific test (e.g. to profile) by passing it as -k to pytest
# e.g. -k 'test_many[500000-False]'
@pytest.mark.parametrize('count', [99, 500_000, 1_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_many(count: int, tmp_path: Path, gc_control) -> None:
    """Benchmark-ish test: write and read `count` entries through the cache, printing timings."""
    if count > 99 and running_on_ci:
        pytest.skip("test would be too slow on CI, only meant to run manually")

    # should be a parametrized test perhaps
    src = tmp_path / 'source'
    src.touch()

    cache_path = tmp_path / 'test_many'

    @cachew(cache_path=cache_path, force_file=True)
    def iter_data() -> Iterator[TE2]:
        for i in range(count):
            # TODO also profile datetimes?
            yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i)

    a = time.time()
    assert ilen(iter_data()) == count  # initial
    b = time.time()
    print(f'test_many: initial write to cache took {b - a:.1f}s', file=sys.stderr)
    print(f'test_many: cache size is {cache_path.stat().st_size / 10**6}Mb', file=sys.stderr)

    a = time.time()
    assert ilen(iter_data()) == count  # hitting cache
    b = time.time()
    print(f'test_many: reading from cache took {b - a:.1f}s', file=sys.stderr)

    assert last(iter_data()) == TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1)
# serializing to db
# in-memory: 16 seconds
# without transaction: 22secs
# without transaction and size 100 chunks -- some crazy amount of time, as expected
# with transaction:
# about 17 secs to write 1M entries (just None)
# chunking by 20K doesn't seem to help
# chunking by 100 also gives same perf
# with to_row binding: 21 secs for dummy NamedTuple with None inside, 22 for less trivial class
# deserializing from db:
# initially, took 20 secs to load 1M entries (TE2)
# 9 secs currently
# 6 secs if we instantiate namedtuple directly via indices
# 3.5 secs if we just return None from row
class BB(NamedTuple):
    # inner record, optionally nested inside AA
    xx: int
    yy: int
class AA(NamedTuple):
    # outer record holding an optional BB
    value: int
    b: BB | None
    value2: int
def test_return_type_inference(tmp_path: Path) -> None:
    """
    Tests that return type (BB) is inferred from the type annotation
    """

    @cachew(tmp_path)
    def data() -> Iterator[BB]:
        yield BB(xx=1, yy=2)
        yield BB(xx=3, yy=4)

    assert len(list(data())) == 2
    assert len(list(data())) == 2
def test_return_type_mismatch(tmp_path: Path) -> None:
    # even though user got invalid type annotation here, they specified correct type, and it's the one that should be used
    @cachew(tmp_path, cls=AA)
    def data2() -> list[BB]:
        return [  # ty: ignore[invalid-return-type]
            AA(value=1, b=None, value2=123),  # type: ignore[list-item]
        ]

    # TODO hmm, this is kinda a downside that it always returns
    # could preserve the original return type, but too much trouble for now
    assert list(data2()) == [AA(value=1, b=None, value2=123)]  # type: ignore[comparison-overlap]
def test_return_type_none(tmp_path: Path) -> None:
    # with no annotation and no explicit cls, cachew can't infer the cache type
    with pytest.raises(CachewException):

        @cachew(tmp_path)
        def data():
            return []
def test_callable_cache_path(tmp_path: Path) -> None:
    """
    Cache path can be function dependent on wrapped function's arguments
    """
    called: set[str] = set()  # each kind should only ever be computed once

    @cachew(cache_path=lambda kind: tmp_path / f'{kind}.cache')
    def get_data(kind: str) -> Iterator[BB]:
        assert kind not in called
        called.add(kind)
        if kind == 'first':
            yield BB(xx=1, yy=1)
        else:
            yield BB(xx=2, yy=2)

    # fmt: off
    assert list(get_data('first'))  == [BB(xx=1, yy=1)]
    assert list(get_data('second')) == [BB(xx=2, yy=2)]
    assert list(get_data('first'))  == [BB(xx=1, yy=1)]
    assert list(get_data('second')) == [BB(xx=2, yy=2)]
    # fmt: on
def test_nested(tmp_path: Path) -> None:
    """Nested records (AA containing an optional BB) should round-trip through the cache."""
    d1 = AA(
        value=1,
        b=BB(xx=2, yy=3),
        value2=4,
    )
    d2 = AA(
        value=3,
        b=None,
        value2=5,
    )

    def data():
        yield d1
        yield d2

    @cachew(cache_path=tmp_path, cls=AA)
    def get_data():
        yield from data()

    assert list(get_data()) == [d1, d2]
    assert list(get_data()) == [d1, d2]
class BBv2(NamedTuple):
    # 'schema evolution' of BB: same fields plus an extra zz
    xx: int
    yy: int
    zz: float
def test_schema_change(tmp_path: Path) -> None:
    """
    Should discard cache on schema change (BB to BBv2) in this example
    """
    b = BB(xx=2, yy=3)

    @cachew(cache_path=tmp_path, cls=BB)
    def get_data():
        return [b]

    assert list(get_data()) == [b]

    # TODO make type part of key?
    b2 = BBv2(xx=3, yy=4, zz=5.0)

    @cachew(cache_path=tmp_path, cls=BBv2)
    def get_data_v2():
        return [b2]

    assert list(get_data_v2()) == [b2]
def test_transaction(tmp_path: Path) -> None:
    """
    Should keep old cache and not leave it in some broken state in case of errors
    """
    # logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

    class TestError(Exception):
        pass

    # chunk_by=1 so partial data is flushed before the error fires
    @cachew(cache_path=tmp_path, cls=BB, chunk_by=1)
    def get_data(version: int):
        for i in range(3):
            yield BB(xx=2, yy=i)
            if version == 2:
                raise TestError

    exp = [BB(xx=2, yy=0), BB(xx=2, yy=1), BB(xx=2, yy=2)]
    assert list(get_data(1)) == exp
    assert list(get_data(1)) == exp

    # TODO test that hash is unchanged?
    with pytest.raises(TestError):
        list(get_data(2))

    # the failed run must not have clobbered the previously cached data
    assert list(get_data(1)) == exp
class Job(NamedTuple):
    company: str
    title: str | None  # optional on purpose, to exercise Optional support
def test_optional(tmp_path: Path) -> None:
    """
    Tests support for typing.Optional
    """

    @cachew(tmp_path)
    def data() -> Iterator[Job]:
        # fmt: off
        yield Job('google'      , title='engineed')
        yield Job('selfemployed', title=None)
        # fmt: on

    list(data())  # trigger cachew
    # fmt: off
    assert list(data()) == [
        Job('google'      , title='engineed'),
        Job('selfemployed', title=None),
    ]
    # fmt: on
# TODO add test for optional for misleading type annotation
class Person(NamedTuple):
    # randomly generated record used by make_people_data / test_stats
    name: str
    secondname: str
    age: int
    job: Job | None
def make_people_data(count: int) -> Iterator[Person]:
    """Yield *count* pseudo-random Person records; fixed seed (124) keeps output deterministic."""
    rng = Random(124)
    alphabet = string.ascii_uppercase + string.ascii_lowercase

    # NOTE: RNG call order matters for determinism, so keep it identical across refactors
    def rand_str(length: int) -> str:
        return ''.join(rng.choices(alphabet, k=length))

    for _ in range(count):
        job: Job | None = None
        if rng.choice([True, False]):
            job = Job(company=rand_str(12), title=rand_str(8))
        yield Person(
            name=rand_str(5),
            secondname=rand_str(10),
            age=rng.randint(20, 60),
            job=job,
        )
def test_stats(tmp_path: Path) -> None:
    """Prints estimated vs actual cache db size for a largish number of entries."""
    cache_file = tmp_path / 'cache'
    # rough per-entry size estimate; 4-byte overheads + string lengths
    # (renamed from `one`, which shadowed more_itertools.one imported at module level)
    entry_size = (4 + 5) + (4 + 10) + 4 + (4 + 12 + 4 + 8)
    N = 10000

    @cachew(cache_path=cache_file, cls=Person)
    def get_people_data() -> Iterator[Person]:
        yield from make_people_data(count=N)

    list(get_people_data())
    print(
        f"Cache db size for {N} entries: estimated size {entry_size * N // 1024} Kb, actual size {cache_file.stat().st_size // 1024} Kb;"
    )
@dataclass
class Test:
    # minimal dataclass payload for test_dataclass
    field: int
def test_dataclass(tmp_path: Path) -> None:
    """Plain dataclasses should be cacheable just like namedtuples."""

    @cachew(tmp_path)
    def get_dataclasses() -> Iterator[Test]:
        yield from [Test(field=i) for i in range(5)]

    assert list(get_dataclasses()) == [Test(field=i) for i in range(5)]
    assert list(get_dataclasses()) == [Test(field=i) for i in range(5)]
def test_inner_class(tmp_path: Path) -> None:
    """Dataclasses defined inside a function should still be usable as cache types."""
    # NOTE: this doesn't work at the moment if from __future__ import annotations is used in client code (e.g. on top of this test)
    # see test_future_annotations for more info
    @dataclass
    class InnerDataclass:
        field: int

    @cachew(tmp_path)
    def fun() -> Iterator[InnerDataclass]:
        yield from []

    # should manage to infer type and not crash at least
    list(fun())
    list(fun())
@dataclass
class Dates:
    # mix of tz-aware and naive datetimes, exercised by test_dates
    d1: datetime
    d2: datetime
    d3: datetime
    d4: datetime
    d5: datetime
def test_dates(tmp_path: Path) -> None:
    """tzinfo (zoneinfo/naive/UTC) should survive the cache round-trip, not just the offsets."""
    from zoneinfo import ZoneInfo

    tz = ZoneInfo('Europe/London')
    dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S')
    dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S')
    x = Dates(
        d1=dwinter.replace(tzinfo=tz),
        d2=dsummer.replace(tzinfo=tz),
        d3=dwinter,
        d4=dsummer,
        d5=dsummer.replace(tzinfo=UTC),
    )

    @cachew(tmp_path)
    def fun() -> Iterable[Dates]:
        yield x

    assert one(fun()) == x
    assert one(fun()) == x

    # make sure the actuall tzinfo is preserved... otherwise we might end up with raw offsets and lose some info
    r = one(fun())
    assert str(r.d1.tzinfo) == str(x.d1.tzinfo)
    assert str(r.d2.tzinfo) == str(x.d2.tzinfo)
    assert r.d3.tzname() is None
    assert r.d4.tzname() is None
    assert r.d5.tzinfo is UTC
# fmt: off
@dataclass
class AllTypes:
    # one field per supported primitive/compound serialization type
    a_str   : str
    an_int  : int
    a_float : float
    a_bool  : bool
    a_dt    : datetime
    a_date  : date
    a_dict  : dict[str, Any]
    a_list  : list[Any]
    a_tuple : tuple[float, str]
    an_exc  : Exception
    an_opt  : str | None
# fmt: on
# TODO support vararg tuples?
def test_types(tmp_path: Path) -> None:
    """Round-trips an object exercising every supported field type at once."""
    import pytz

    tz = pytz.timezone('Europe/Berlin')
    # fmt: off
    obj = AllTypes(
        a_str   = 'abac',
        an_int  = 1123,
        a_float = 3.131,
        a_bool  = True,
        a_dt    = datetime.now(tz=tz),
        a_date  = datetime.now().replace(year=2000).date(),
        a_dict  = {'a': True, 'x': {'whatever': 3.14}},
        a_list  = ['aba', 123, None],
        a_tuple = (1.23, '3.2.1'),
        an_exc  = RuntimeError('error!', 123),
        an_opt  = 'hello',
    )
    # fmt: on

    @cachew(tmp_path)
    def get() -> Iterator[AllTypes]:
        yield obj

    def helper(t: AllTypes):
        # Exceptions can't be directly compared.. so this kinda helps
        d = asdict(t)
        d['an_exc'] = d['an_exc'].args
        return d

    assert helper(one(get())) == helper(obj)
    assert helper(one(get())) == helper(obj)
# TODO if I do perf tests, look at this https://docs.sqlalchemy.org/en/13/_modules/examples/performance/large_resultsets.html
# TODO should be possible to iterate anonymous tuples too? or just sequences of primitive types?
def test_primitive(tmp_path: Path) -> None:
    """Bare primitive (str) values should be cacheable without a wrapper type."""

    @cachew(tmp_path)
    def fun() -> Iterator[str]:
        yield 'aba'
        yield 'caba'

    assert list(fun()) == ['aba', 'caba']
    assert list(fun()) == ['aba', 'caba']
def test_single_value(tmp_path: Path) -> None:
    """Non-iterator return values are cached as a single item (including Optional results)."""

    @cachew(tmp_path)
    def fun_int() -> int:
        return 123

    assert fun_int() == 123
    assert fun_int() == 123

    # explicit ('single', type) spec instead of a return annotation
    @cachew(tmp_path, cls=('single', str))
    def fun_str():
        return 'whatever'

    assert fun_str() == 'whatever'
    assert fun_str() == 'whatever'

    @cachew(tmp_path)
    def fun_opt_namedtuple(none: bool) -> UUU | None:  # noqa: FBT001
        if none:
            return None
        else:
            return UUU(xx=1, yy=2)

    assert fun_opt_namedtuple(none=False) == UUU(xx=1, yy=2)
    assert fun_opt_namedtuple(none=False) == UUU(xx=1, yy=2)
    assert fun_opt_namedtuple(none=True) is None
    assert fun_opt_namedtuple(none=True) is None
class O(NamedTuple):
    # minimal single-field namedtuple used as a payload in the tests below
    x: int
class _HackHash:
def __init__(self, x: int) -> None:
self.x = x
def __repr__(self):
return repr(self.x)
def test_default_arguments(tmp_path: Path) -> None:
    """
    Checks how default argument values participate in cache key computation / invalidation.
    """
    hh = _HackHash(1)
    calls = 0

    def orig(a: int, param: _HackHash = hh) -> Iterator[O]:
        yield O(hh.x)
        nonlocal calls
        calls += 1

    def depends_on(a: int, param: _HackHash) -> str:
        # hmm. in principle this should be str according to typing
        # on practice though we always convert hash to str, so maybe type should be changed to Any?
        return (a, param.x)  # type: ignore[return-value]

    # explicit depends_on: hash includes param.x
    fun = cachew(tmp_path, depends_on=depends_on)(orig)

    list(fun(123))
    assert list(fun(123)) == [O(1)]
    assert calls == 1

    # now, change hash. That should cause the composite hash to invalidate and recompute
    hh.x = 2
    assert list(fun(123)) == [O(2)]
    assert calls == 2

    # should be ok with explicitly passing
    assert list(fun(123, param=_HackHash(2))) == [O(2)]
    assert calls == 2

    # we don't have to handle the default param in the default hash key
    fun = cachew(tmp_path)(fun)
    assert list(fun(456)) == [O(2)]
    assert calls == 3
    assert list(fun(456)) == [O(2)]
    assert calls == 3

    # changing the default should trigger the default (i.e. kwargs) key function to invalidate the cache
    hh.x = 3
    assert list(fun(456)) == [O(3)]
    assert calls == 4

    # you don't have to pass the default parameter explicitly
    fun = cachew(tmp_path, depends_on=lambda a: a)(orig)
    assert list(fun(456)) == [O(3)]
    assert calls == 5

    # but watch out if you forget to handle it!
    hh.x = 4
    assert list(fun(456)) == [O(3)]
    assert calls == 5
class U(NamedTuple):
    # field mixes a primitive with a namedtuple -- exercises union serialization
    x: str | O
def test_union(tmp_path: Path) -> None:
    """Union-typed namedtuple fields should survive a cache round-trip."""

    @cachew(tmp_path)
    def fun() -> Iterator[U]:
        yield from [U('hi'), U(O(123))]

    list(fun())  # warm up the cache
    assert list(fun()) == [U('hi'), U(O(123))]
# NOTE: empty dataclass doesn't have __annotations__ ??? not sure if need to handle it...
@dataclass
class DD:
    # minimal one-field dataclass used in union round-trip tests
    x: int
def test_union_with_dataclass(tmp_path: Path) -> None:
    """A top-level union mixing a primitive with a dataclass should round-trip."""

    @cachew(tmp_path)
    def fun() -> Iterator[int | DD]:
        yield 123
        yield DD(456)

    expected = [123, DD(456)]
    assert list(fun()) == expected
# ugh. we need to pass backend here explicitly since it might not get picked up from the fixture
# that sets it in settings. due to multiprocess stuff
def _concurrent_helper(cache_path: Path, count: int, backend: Backend, sleep_s=0.1):
    # worker used by the concurrency tests below
    # (submitted to ProcessPoolExecutor, so it needs to be a picklable top-level function)
    @cachew(cache_path, backend=backend)
    def test(count: int) -> Iterator[int]:
        for i in range(count):
            print(f"{count}: GENERATING {i}")
            sleep(sleep_s)  # slow down generation so concurrent runs overlap
            yield i * i

    return list(test(count=count))
@pytest.fixture
def fuzz_cachew_impl():
    """
    Insert random sleeps in cachew_impl to increase likelihood of concurrency issues
    """
    from .. import cachew_wrapper

    # patchy applies this unified diff to cachew_wrapper's source at runtime,
    # injecting a random sleep right before the hash comparison
    # NOTE(review): diff indentation reconstructed -- confirm context lines match cachew_wrapper
    patch = '''\
@@ -189,6 +189,11 @@
     old_hash = backend.get_old_hash()
     logger.debug(f'old hash: {old_hash}')

+    from random import random
+    rs = random() * 2
+    print("sleeping for: ", rs)
+    from time import sleep; sleep(rs)
+
     if new_hash == old_hash:
         logger.debug('hash matched: loading from cache')
         yield from cached_items()
'''
    patchy.patch(cachew_wrapper, patch)
    yield
    # revert the patch so other tests see the unmodified implementation
    patchy.unpatch(cachew_wrapper, patch)
# TODO fuzz when they start so they enter transaction at different times?
# TODO how to run it enough times on CI and increase likelihood of failing?
# for now, stress testing manually:
# while PYTHONPATH=src pytest -s cachew -k concurrent_writes ; do sleep 0.5; done
@pytest.mark.xfail(condition=platform.system() == 'Darwin', reason='seems like file writes might not be atomic on osx?')
def test_concurrent_writes(tmp_path: Path, fuzz_cachew_impl) -> None:
    """Several processes writing through the same cache file shouldn't corrupt results."""
    cache_path = tmp_path / 'cache.sqlite'

    # warm up to create the database
    # FIXME ok, that will be fixed separately with atomic move I suppose
    _concurrent_helper(cache_path, 1, settings.DEFAULT_BACKEND)

    processes = 5
    with ProcessPoolExecutor() as pool:
        futures = [pool.submit(_concurrent_helper, cache_path, n, settings.DEFAULT_BACKEND) for n in range(processes)]
        for count, fut in enumerate(futures):
            assert fut.result() == [i * i for i in range(count)]
# TODO ugh. need to keep two processes around to test for yield holding transaction lock
def test_concurrent_reads(tmp_path: Path, fuzz_cachew_impl):
    """Concurrent readers of a warm cache should be served without recomputation."""
    cache_path = tmp_path / 'cache.sqlite'
    count = 10

    # warm up
    _concurrent_helper(cache_path, count, settings.DEFAULT_BACKEND, sleep_s=0)

    processes = 4
    start = time.time()
    with ProcessPoolExecutor() as pool:
        futures = [
            pool.submit(_concurrent_helper, cache_path, count, settings.DEFAULT_BACKEND, 1) for _ in range(processes)
        ]
        for fut in futures:
            print(fut.result())
    taken = time.time() - start

    # should be pretty instantaneous
    # if it takes more, most likely means that helper was called again
    assert taken < 5
def test_mcachew(tmp_path: Path):
    """Basic caching should still work through the defensive mcachew wrapper."""
    # TODO how to test for defensive behaviour?
    from cachew.extra import mcachew

    # TODO check throw on error
    @mcachew(cache_path=tmp_path / 'cache')
    def func() -> Iterator[str]:
        yield from ('one', 'two')

    expected = ['one', 'two']
    assert list(func()) == expected  # cold
    assert list(func()) == expected  # cached
def test_defensive(restore_settings) -> None:
    '''
    Make sure that cachew doesn't crash on misconfiguration
    '''

    def orig() -> Iterator[int]:
        yield 123

    def orig2():
        yield "x"
        yield 123

    # unknown kwarg: cachew should fall back to calling the function directly
    fun = cachew(bad_arg=123)(orig)  # type: ignore[call-overload]
    assert list(fun()) == [123]
    assert list(fun()) == [123]

    for throw in [True, False]:
        # with THROW_ON_ERROR misconfigurations should raise, otherwise they're defensive
        ctx = pytest.raises(Exception) if throw else nullcontext()
        settings.THROW_ON_ERROR = throw
        with ctx:
            # cache_path provider that raises when invoked
            fun = cachew(cache_path=lambda: 1 + 'bad_path_provider')(orig)  # type: ignore[arg-type,misc,operator]
            assert list(fun()) == [123]
            assert list(fun()) == [123]

            # cache_path provider receiving unexpected arguments
            fun = cachew(cache_path=lambda p: '/tmp/' + str(p))(orig)
            assert list(fun()) == [123]
            assert list(fun()) == [123]

            # decorating a function without a usable return type annotation
            fun = cachew(orig2)
            assert list(fun()) == ['x', 123]
            assert list(fun()) == ['x', 123]

            # unusable default cache directory
            settings.DEFAULT_CACHEW_DIR = '/dev/nonexistent'
            fun = cachew(orig)
            assert list(fun()) == [123]
            assert list(fun()) == [123]
@pytest.mark.parametrize('throw', [False, True])
def test_bad_annotation(*, tmp_path: Path, throw: bool) -> None:
    """
    this will work in runtime without cachew if from __future__ import annotations is used
    so should work with cachew decorator as well
    """
    src = tmp_path / 'src.py'
    # BadType is deliberately undefined; with postponed annotations it's never evaluated at runtime
    src.write_text(
        f'''
from __future__ import annotations
from cachew import settings, cachew

settings.THROW_ON_ERROR = {throw}

@cachew
def fun() -> BadType:
    print("called!")
    return 0

fun()
'''.lstrip()
    )
    # with THROW_ON_ERROR the unresolvable annotation should surface as an error in the subprocess
    ctx = pytest.raises(Exception) if throw else nullcontext()
    with ctx:
        assert check_output([sys.executable, src], text=True).strip() == "called!"
def test_recursive_simple(tmp_path: Path) -> None:
    # the cached function calls itself recursively;
    # d0/d1 record stack depths at the two deepest levels to check stack usage stays bounded
    d0 = 0
    d1 = 1000
    calls = 0

    @cachew(tmp_path)
    def factorials(n: int) -> Iterable[int]:
        nonlocal calls, d0, d1
        calls += 1
        if n == 0:
            d0 = len(inspect.stack(0))
        if n == 1:
            d1 = len(inspect.stack(0))
        if n == 0:
            yield 1
            return
        prev = factorials(n - 1)
        last = 1
        # TODO potentially quadratic? measure perf perhaps?
        for x in prev:
            yield x
            last = x
        yield last * n

    assert calls == 0
    assert list(factorials(3)) == [1, 1, 2, 6]
    # make sure the recursion isn't eating too much stack
    # ideally would have 1? not sure if possible without some insane hacking?
    # todo maybe check stack frame size as well?
    assert abs(d0 - d1) <= 2
    assert calls == 4
    assert list(factorials(3)) == [1, 1, 2, 6]
    assert calls == 4
    assert list(factorials(5)) == [1, 1, 2, 6, 24, 120]
    assert calls == 6
    assert list(factorials(3)) == [1, 1, 2, 6]
    assert calls == 10
def test_recursive_deep(tmp_path: Path) -> None:
    # deep recursion through the cachew wrapper shouldn't exhaust the stack
    @cachew(tmp_path)
    def numbers(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
            return
        yield from numbers(n - 1)
        yield n

    @cachew(cache_path=None)
    def numbers_cache_disabled(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
            return
        yield from numbers(n - 1)
        yield n

    rlimit = sys.getrecursionlimit()
    # NOTE in reality it has to do with the number of file descriptors (ulimit -Sn, e.g. 1024?)
    # but it seems that during the error unrolling, pytest or something else actually hits the recursion limit somehow
    # pytest ends up with an internal error in such case... which is good enough as long as tests are concerned I guess.
    sys.setrecursionlimit(2 * 800 + 100)
    try:
        # at the moment each recursive call takes two frames (one for the original call, one for cachew_wrapper)
        # + allow 100 calls for random constant overhead like pytest etc
        list(numbers(800))
        list(numbers(800))
        list(numbers_cache_disabled(800))
        list(numbers_cache_disabled(800))
    finally:
        sys.setrecursionlimit(rlimit)
def test_recursive_error(tmp_path: Path) -> None:
    """A RecursionError mid-iteration shouldn't leave the cache unusable."""

    @cachew(tmp_path)
    def rec(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
        else:
            yield from rec(n - 1)
            yield n

    rlimit = sys.getrecursionlimit()
    try:
        # force the recursion to blow up partway through
        sys.setrecursionlimit(50)
        list(rec(100))
        raise AssertionError('Expecting recursion error')
    except RecursionError:
        pass
    finally:
        sys.setrecursionlimit(rlimit)

    # todo not sure if cache file should exist??
    # either way, at least check that the db is not completely messed up
    results = list(rec(100))
    assert len(results) == 101
def test_exceptions(tmp_path: Path) -> None:
    """Exception values are cached via their args; the concrete type is not preserved."""

    class X(NamedTuple):
        a: int

    dt = datetime.strptime('20200102 03:04:05', '%Y%m%d %H:%M:%S')

    @cachew(tmp_path)
    def fun() -> Iterator[Exception]:
        yield RuntimeError('whatever', 123, dt, X(a=123))

    list(fun())  # warm up the cache
    [e] = fun()
    # not sure if there is anything that can be done to preserve type information?
    assert type(e) is Exception
    assert e.args == ('whatever', 123, '2020-01-02T03:04:05', 'X(a=123)')
# see https://beepb00p.xyz/mypy-error-handling.html#kiss
def test_result(tmp_path: Path) -> None:
    """Mixing Exception values with regular results ('result' pattern) should round-trip."""

    @cachew(tmp_path)
    def fun() -> Iterator[Exception | int]:
        yield 1
        yield RuntimeError("sad!")
        yield 123

    list(fun())  # warm up the cache
    [v1, ve, v123] = fun()
    assert (v1, v123) == (1, 123)
    assert isinstance(ve, Exception)
    assert ve.args == ('sad!',)
def test_version_change(tmp_path: Path) -> None:
    # bumping CACHEW_VERSION should invalidate previously written caches
    calls = 0

    @cachew(tmp_path, logger=logger)
    def fun() -> Iterator[str]:
        nonlocal calls
        calls += 1
        yield from ['a', 'b', 'c']

    list(fun())
    list(fun())
    assert calls == 1

    # todo ugh. not sure how to do this as a relative import??
    import cachew as cachew_module

    old_version = cachew_module.CACHEW_VERSION
    try:
        cachew_module.CACHEW_VERSION = old_version + '_whatever'
        # should invalidate cachew now
        list(fun())
        assert calls == 2
        list(fun())
        assert calls == 2
    finally:
        # restore even if asserts above fail, so other tests aren't affected
        cachew_module.CACHEW_VERSION = old_version

    # and now again, back to the old version
    list(fun())
    assert calls == 3
    list(fun())
    assert calls == 3
def dump_old_cache(tmp_path: Path) -> None:
    """Not a test: prints an SQL dump of a freshly written cache (for version upgrade tests)."""
    oc = tmp_path / 'old_cache.sqlite'

    @cachew(oc)
    def fun() -> Iterator[int]:
        yield 1
        yield 2
        yield 3

    list(fun())  # populate the database
    assert oc.exists(), oc

    dump = check_output(['sqlite3', oc, '.dump']).decode('utf8')
    print(dump, file=sys.stderr)
def test_old_cache_v0_6_3(tmp_path: Path) -> None:
    # regression check: a database created by cachew 0.6.3 shouldn't crash a modern version
    if settings.DEFAULT_BACKEND != 'sqlite':
        pytest.skip('this test only makes sense for sqlite backend')

    # dump obtained via dump_old_cache above, run against cachew 0.6.3
    sql = '''
PRAGMA foreign_keys=OFF;
BEGIN TRANSACTION;
CREATE TABLE hash (
value VARCHAR
);
INSERT INTO hash VALUES('cachew: 1, schema: {''_'': }, hash: ()');
CREATE TABLE IF NOT EXISTS "table" (
_cachew_primitive INTEGER
);
INSERT INTO "table" VALUES(1);
INSERT INTO "table" VALUES(2);
INSERT INTO "table" VALUES(3);
COMMIT;
'''
    db = tmp_path / 'cache.sqlite'
    check_call(['sqlite3', db, sql])

    @cachew(db)
    def fun() -> Iterator[int]:
        yield from [1, 2, 3]

    # this tests that it doesn't crash
    # for actual version upgrade test see test_version_change
    assert list(fun()) == [1, 2, 3]
def test_disabled(tmp_path: Path) -> None:
    """disabled_cachew() should bypass the cache entirely within its scope."""
    calls = 0

    @cachew(tmp_path)
    def fun() -> Iterator[int]:
        yield 1
        yield 2
        nonlocal calls
        calls += 1

    expected = [1, 2]
    assert list(fun()) == expected
    assert list(fun()) == expected
    assert calls == 1  # second call came from the cache

    from cachew.extra import disabled_cachew

    with disabled_cachew():
        # caching bypassed -> every call recomputes
        assert list(fun()) == expected
        assert calls == 2
        assert list(fun()) == expected
        assert calls == 3
def test_early_exit_simple(tmp_path: Path) -> None:
    """If the consumer stops early, the partially-consumed iterator must not be cached."""
    # cachew works on iterators and we'd prefer not to cache if the iterator hasn't been exhausted
    calls_f = 0

    @cachew(tmp_path)
    def f() -> Iterator[int]:
        yield from range(20)
        nonlocal calls_f
        calls_f += 1

    calls_g = 0

    @cachew(tmp_path)
    def g() -> Iterator[int]:
        yield from f()
        nonlocal calls_g
        calls_g += 1

    # only consume 10/20 items
    assert len(list(islice(g(), 0, 10))) == 10

    # precondition: neither generator ran to completion
    assert (calls_f, calls_g) == (0, 0)

    # todo not sure if need to check that db is empty?
    assert len(list(g())) == 20
    assert (calls_f, calls_g) == (1, 1)

    # should be cached now
    assert len(list(g())) == 20
    assert (calls_f, calls_g) == (1, 1)
# see https://github.com/sqlalchemy/sqlalchemy/issues/5522#issuecomment-705156746
def test_early_exit_shutdown(tmp_path: Path) -> None:
    # abandoning a generator mid-iteration and then shutting down the interpreter
    # shouldn't produce a traceback (sqlalchemy teardown issue, see link above)
    # don't ask... otherwise the exception doesn't appear :shrug:
    import_hack = '''
from sqlalchemy import Column

import re
re.hack = lambda: None
'''
    Path(tmp_path / 'import_hack.py').write_text(import_hack)

    # NOTE(review): indentation inside the generated program reconstructed -- confirm it matches upstream
    prog = f'''
import sys
sys.path.insert(0, '')

import import_hack

import cachew
cachew.settings.THROW_ON_ERROR = True # todo check with both?

@cachew.cachew('{tmp_path}', cls=int)
def fun():
    yield 0

g = fun()
e = next(g)
print("FINISHED")
'''
    # the subprocess must terminate cleanly: no traceback during interpreter shutdown
    r = run([sys.executable, '-c', prog], cwd=tmp_path, capture_output=True, check=True)
    assert r.stdout.strip() == b'FINISHED'
    assert b'Traceback' not in r.stderr
# tests both modes side by side to demonstrate the difference
# FIX: parametrize over real booleans. The previous values were the *strings*
# 'False'/'True', which are both truthy, so use_synthetic was effectively always
# True and the non-synthetic branch below never actually ran non-synthetically.
# (every other parametrize in this file uses [False, True].)
@pytest.mark.parametrize('use_synthetic', [False, True])
def test_synthetic_keyset(*, tmp_path: Path, use_synthetic: bool) -> None:
    """Tests synthetic_key mode: only keys absent from the cache should be recomputed."""
    # just to keep track of which data we had to compute from scratch
    _recomputed: list[str] = []

    # assume key i is responsible for numbers i and i-1
    # in reality this could be some slow function we'd like to avoid calling if its results is already cached
    # e.g. the key would typically be a filename (e.g. isoformat timestamp)
    # and the returned values could be the results of an export over the month prior to the timestamp, or something like that
    # see https://beepb00p.xyz/exports.html#synthetic for more on the motivation
    def compute(key: str) -> Iterator[str]:
        _recomputed.append(key)
        n = int(key)
        yield str(n - 1)
        yield str(n)

    # fmt: off
    # should result in 01 + 12 + 45 == 01245
    keys125 = ['1', '2', '5' ]
    # should result in 01 + 12 + 45 + 56 + 67 == 0124567
    keys12567 = ['1', '2', '5', '6', '7' ]
    # should result in 01 + 12 + 45 + 56 + 78 + 89 == 012456789
    keys125689 = ['1', '2', '5', '6', '8', '9']
    # should result in 45 + 56 + 78 + 89 == 456789
    keys5689 = [ '5', '6', '8', '9']
    # fmt: on

    def recomputed() -> list[str]:
        # returns (and resets) the keys recomputed since the last check
        r = list(_recomputed)
        _recomputed.clear()
        return r

    ## 'cachew_cached' will just be [] if synthetic key is not used, so no impact on data
    @cachew(tmp_path, synthetic_key=('keys' if use_synthetic else None))
    def fun_aux(keys: Sequence[str], *, cachew_cached: Iterable[str] = []) -> Iterator[str]:
        yield from unique_everseen(
            chain(
                cachew_cached,
                *(compute(key) for key in keys),
            )
        )

    def fun(keys: Sequence[str]) -> set[str]:
        return set(fun_aux(keys=keys))
    ##

    # preserve formatting of string arguments, it makes it easier to read the test
    # fmt: off
    assert fun(keys125) == set('01' '12' '45')
    assert recomputed() == keys125
    assert fun(keys125) == set('01' '12' '45')
    assert recomputed() == []  # should be cached
    assert fun(keys12567) == set('01' '12' '45' '56' '67')
    if use_synthetic:
        # 1, 2 and 5 should be already cached from the previous call
        assert recomputed() == ['6', '7']
    else:
        # but without synthetic key this would cause everything to recompute
        assert recomputed() == keys12567
    assert fun(keys12567) == set('01' '12' '45' '56' '67')
    assert recomputed() == []  # should be cached
    assert fun(keys125689) == set('01' '12' '45' '56' '78' '89')
    if use_synthetic:
        # similarly, 1 2 5 6 7 are cached from the previous call
        assert recomputed() == ['8', '9']
    else:
        # and we need to call against all keys otherwise
        assert recomputed() == keys125689
    assert fun(keys125689) == set('01' '12' '45' '56' '78' '89')
    assert recomputed() == []  # should be cached
    assert fun(keys5689) == set('45' '56' '78' '89')
    # now the prefix has changed, so if we returned cached items it might return too much
    # so have to recompute everything
    assert recomputed() == keys5689
    assert fun(keys5689) == set('45' '56' '78' '89')
    assert recomputed() == []  # should be cached
    # fmt: on

    # TODO maybe call combined function? so it could return total result and last cached?
    # TODO another option is:
    #  the function yields all cached stuff first
    #  then the user yields stuff from new
    #  and then external function does merging
    # TODO test with kwargs hash?...
    # TODO try without and with simultaneously?
    # TODO check what happens when errors happen?
    # FIXME check what happens if we switch between modes? (synthetic/non-synthetic)
    # FIXME make sure this thing works if len(keys) > chunk size?
    # TODO check what happens when we forget to set 'cachew_cached' argument
    # TODO check what happens when keys are not str but e.g. Path
def test_db_path_matches_fun_name(tmp_path: Path) -> None:
    """Cache files created under a directory should be named after the decorated function."""

    @cachew(tmp_path)
    def fun_single() -> int:
        return 123

    @cachew(tmp_path)
    def fun_multiple() -> Iterable[int]:
        return [123]

    # write to cache
    fun_single()
    list(fun_multiple())

    for fn in (fun_single, fun_multiple):
        assert (tmp_path / callable_name(fn)).exists()
def test_type_alias_type_1(tmp_path: Path) -> None:
type Int = int
@cachew(tmp_path)
def fun() -> Iterator[Int]:
yield 123
assert list(fun()) == [123]
assert list(fun()) == [123]
def test_type_alias_type_2(tmp_path: Path) -> None:
type IteratorInt = Iterator[int]
@cachew(tmp_path)
def fun() -> IteratorInt:
yield 123
assert list(fun()) == [123]
assert list(fun()) == [123]
def test_type_alias_generic(tmp_path: Path) -> None:
type Res[T] = T | Exception
type IntRes = Res[int]
@cachew(tmp_path)
def fun() -> Iterator[IntRes]:
yield 123
assert list(fun()) == [123]
assert list(fun()) == [123]
================================================
FILE: src/cachew/tests/test_future_annotations.py
================================================
from __future__ import annotations
import os
import sys
import textwrap
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path
from subprocess import check_output
from typing import Any
import pytest
from more_itertools import one
from .. import cachew
type _Str = str # deliberate, to test 3.12 'type ... = ...' type definitions
# fmt: off
@dataclass
class NewStyleTypes1:
    """Fields using new-style builtin generics (and a 'type' alias) under postponed annotations."""
    a_str : str
    a_dict : dict[str, Any]
    a_list : list[Any]
    a_tuple : tuple[float, _Str]
# fmt: on
def test_types1(tmp_path: Path) -> None:
    """New-style builtin generic annotations should round-trip through the cache."""
    obj = NewStyleTypes1(
        a_str='abac',
        a_dict={'a': True, 'x': {'whatever': 3.14}},
        a_list=['aba', 123, None],
        a_tuple=(1.23, '3.2.1'),
    )

    @cachew(tmp_path)
    def get() -> Iterator[NewStyleTypes1]:
        yield obj

    for _ in range(2):  # second call comes from cache
        assert one(get()) == obj
# fmt: off
@dataclass
class NewStyleTypes2:
    """Fields using new-style union syntax (and a 'type' alias) under postponed annotations."""
    an_opt : str | None
    a_union : _Str | int
# fmt: on
def test_types2(tmp_path: Path) -> None:
    """New-style union annotations should round-trip through the cache."""
    obj = NewStyleTypes2(
        an_opt='hello',
        a_union=999,
    )

    @cachew(tmp_path)
    def get() -> Iterator[NewStyleTypes2]:
        yield obj

    for _ in range(2):  # second call comes from cache
        assert one(get()) == obj
@pytest.mark.parametrize('use_future_annotations', [False, True])
@pytest.mark.parametrize('local', [False, True])
@pytest.mark.parametrize('throw', [False, True])
def test_future_annotations(
    *,
    use_future_annotations: bool,
    local: bool,
    throw: bool,
    tmp_path: Path,
) -> None:
    """
    Checks handling of postponed evaluation of annotations (from __future__ import annotations)
    """
    # NOTE: to avoid weird interactions with existing interpreter in which pytest is running
    # , we compose a program and running in python directly instead
    # (also not sure if it's even possible to tweak postponed annotations without doing that)
    if use_future_annotations and local and throw:
        # when annotation is local (like inner class), then they end up as strings
        # so we can't eval it as we don't have access to a class defined inside function
        # keeping this test just to keep track of whether this is fixed at some point
        # possibly relevant:
        # - https://peps.python.org/pep-0563/#keeping-the-ability-to-use-function-local-state-when-defining-annotations
        pytest.skip("local aliases/classses don't work with from __future__ import annotations")

    # common setup: a throwaway temp dir used as the cache path inside the generated program
    _PREAMBLE = f'''
from pathlib import Path
import tempfile

from cachew import cachew, settings
settings.THROW_ON_ERROR = {throw}

temp_dir = tempfile.TemporaryDirectory()
td = Path(temp_dir.name)
'''

    # the actual body under test; 'called' is printed on each uncached invocation
    # NOTE(review): indentation inside generated code reconstructed -- confirm against upstream
    _TEST = '''
type Identity[T] = T

I = int
type S = Identity[str]

@cachew(td)
def fun() -> list[I | S]:
    print("called")
    return [1, "2"]

assert list(fun()) == [1, "2"]
assert list(fun()) == [1, "2"]
'''
    if use_future_annotations:
        code = '''
from __future__ import annotations
'''
    else:
        code = ''

    code += _PREAMBLE

    if local:
        # wrap the test body in a function so annotations become function-local
        code += f'''
def test() -> None:
{textwrap.indent(_TEST, prefix=" ")}
test()
'''
    else:
        code += _TEST

    run_py = tmp_path / 'run.py'
    run_py.write_text(code)

    cache_dir = tmp_path / 'cache'
    cache_dir.mkdir()

    res = check_output(
        [sys.executable, run_py],
        env={'TMPDIR': str(cache_dir), **os.environ},
        text=True,
    )
    called = int(res.count('called'))
    if use_future_annotations and local and not throw:
        # cachew fails to set up, so no caching but at least it works otherwise
        assert called == 2
    else:
        assert called == 1
================================================
FILE: src/cachew/tests/test_resolve_type_parameters.py
================================================
from ..utils import resolve_type_parameters
def test_simple_generic_alias() -> None:
    """Plain types and ad-hoc generic aliases should resolve to themselves."""
    # if you define types ad-hoc, they resolve to GenericAlias, not TypeAliasType
    assert resolve_type_parameters(int) == int  # noqa: E721
    assert resolve_type_parameters(list[bool]) == list[bool]
    assert resolve_type_parameters(dict[str, list[float]]) == dict[str, list[float]]
def test_simple_type_keyword() -> None:
type Int = int
assert resolve_type_parameters(Int) == int # noqa: E721
assert resolve_type_parameters(list[Int]) == list[int]
assert resolve_type_parameters(dict[str, list[Int]]) == dict[str, list[int]]
def test_generic_collections() -> None:
    """Aliases over builtin generic collections should be expanded recursively."""
    type ListInt = list[int]
    assert resolve_type_parameters(ListInt) == list[int]
    assert resolve_type_parameters(dict[str, ListInt]) == dict[str, list[int]]

    type TupleInt = tuple[int, bool]
    assert resolve_type_parameters(TupleInt) == tuple[int, bool]

    type TupleIntStr = tuple[TupleInt, str]
    assert resolve_type_parameters(TupleIntStr) == tuple[tuple[int, bool], str]

    type SetStr = set[str]
    assert resolve_type_parameters(SetStr) == set[str]

    type DictAlias[K, V] = dict[K, V]
    assert resolve_type_parameters(DictAlias[str, int]) == dict[str, int]
    assert resolve_type_parameters(DictAlias[int, list[str]]) == dict[int, list[str]]

    type ComplexDict = dict[str, tuple[ListInt, SetStr]]
    assert resolve_type_parameters(ComplexDict) == dict[str, tuple[list[int], set[str]]]
def test_generic_type_keyword() -> None:
    """Generic 'type X[T] = ...' aliases should substitute their type parameters."""
    type Id[T] = T
    type IdInt = Id[int]
    assert resolve_type_parameters(IdInt) == int  # noqa: E721
    assert resolve_type_parameters(list[IdInt]) == list[int]

    # check multiple uses of type params
    type Pair[T] = tuple[T, T]
    type PairInt = Pair[int]
    assert resolve_type_parameters(PairInt) == tuple[int, int]
    assert resolve_type_parameters(Pair[str]) == tuple[str, str]
    assert resolve_type_parameters(list[Pair[int]]) == list[tuple[int, int]]

    # check if type params aren't used
    type NotUsing1[T, V] = int
    type NotUsing2[V, W] = NotUsing1[bool, float]
    type ListInt1 = list[NotUsing2[list, str]]
    assert resolve_type_parameters(ListInt1) == list[int]

    # Test generic alias with alias as parameter
    type Container[T] = list[T]
    type Int = int
    assert resolve_type_parameters(Container[Int]) == list[int]
def test_chaining() -> None:
    """Aliases pointing at other aliases should be followed transitively."""
    type Int = int
    type Int2 = Int
    type Int3 = Int2
    assert resolve_type_parameters(Int3) == int  # noqa: E721

    type ListInt3 = list[Int3]
    assert resolve_type_parameters(ListInt3) == list[int]

    type Box[T] = list[T]
    type DoubleBox[T] = Box[Box[T]]
    type DoubleBoxFloat = DoubleBox[float]
    assert resolve_type_parameters(DoubleBoxFloat) == list[list[float]]
def test_optional_and_union() -> None:
    """Union/Optional members should each be resolved individually."""
    type Int = int
    type MaybeInt = int | None
    assert resolve_type_parameters(MaybeInt) == (int | None)
    assert resolve_type_parameters(list[MaybeInt]) == list[int | None]

    type Str = str  # FIXME extract outside?
    type StrOrInt = Str | Int
    assert resolve_type_parameters(StrOrInt) == (str | int)

    type UnionWithAlias = int | Str
    assert resolve_type_parameters(UnionWithAlias) == (int | str)

    # Test union in generic contexts
    type OptionalList[T] = list[T] | None
    assert resolve_type_parameters(OptionalList[int]) == (list[int] | None)
    assert resolve_type_parameters(OptionalList[str]) == (list[str] | None)

    # Test nested unions with aliases
    # (NB: union equality ignores member order, so int|str == str|int here)
    type Bool = bool
    type StrOrIntOrBool = StrOrInt | Bool
    assert resolve_type_parameters(StrOrIntOrBool) == (int | str | bool)

    # Test union with complex aliased types
    type ListInt = list[int]
    type DictStrInt = dict[str, int]
    type ComplexUnion = ListInt | DictStrInt | None
    assert resolve_type_parameters(ComplexUnion) == (list[int] | dict[str, int] | None)
def test_old_aliases() -> None:
    """
    Old style typing.* aliases get 'normalised' by typing.get_origin call.
    This shouldn't really be a problem, so just highlighting it here.
    """
    from typing import Dict, List, Optional  # noqa: UP035

    type OptionalInt = Optional[int]  # noqa: UP045
    assert resolve_type_parameters(OptionalInt) == int | None

    type ListInt = List[int]  # noqa: UP006
    assert resolve_type_parameters(ListInt) == list[int]

    type DictIntStr = Dict[int, str]  # noqa: UP006
    assert resolve_type_parameters(DictIntStr) == dict[int, str]
def test_old_union() -> None:
from typing import Union
type IntUnion[T] = Union[int, T, bool] # noqa: UP007
assert resolve_type_parameters(IntUnion[str]) == (int | str | bool)
def test_typevar() -> None:
from typing import TypeVar
X = TypeVar('X')
ListX = list[X]
type ListInt = ListX[int]
assert resolve_type_parameters(ListInt) == list[int]
SetX = set[X]
SetFloat = SetX[float]
assert resolve_type_parameters(SetFloat) == set[float]
def test_misc() -> None:
    """
    Miscellaneous more complex tests, mostly around unions involving aliases.
    """
    # Test union inside list/dict
    type MaybeStr = str | None
    assert resolve_type_parameters(list[MaybeStr]) == list[str | None]
    assert resolve_type_parameters(dict[str, MaybeStr]) == dict[str, str | None]

    # Test union with nested generic aliases
    type Container[T] = list[T]
    type OptionalContainer[T] = Container[T] | None
    assert resolve_type_parameters(OptionalContainer[int]) == (list[int] | None)

    # Test union with multiple aliased generics
    type ListAlias[T] = list[T]
    type SetAlias[T] = set[T]
    type CollectionUnion[T] = ListAlias[T] | SetAlias[T]
    assert resolve_type_parameters(CollectionUnion[str]) == (list[str] | set[str])

    # Test union in tuple
    type IntOrStr = int | str
    assert resolve_type_parameters(tuple[IntOrStr, bool]) == tuple[int | str, bool]

    # Test deeply nested union with aliases
    type Middle = list[IntOrStr]
    type Outer = Middle | None
    assert resolve_type_parameters(Outer) == (list[int | str] | None)

    # Test union with chained aliases
    type Level1 = int
    type Level2 = Level1
    type Level3 = Level2
    type UnionChained = Level3 | str | None
    assert resolve_type_parameters(UnionChained) == (int | str | None)

    # Test union with generic that resolves to union
    type MaybeList[T] = list[T] | None
    type NestedMaybe = MaybeList[int | str]
    assert resolve_type_parameters(NestedMaybe) == (list[int | str] | None)

    # Test union with aliased union
    type NumberOrStr = int | float | str
    type ExtendedUnion = NumberOrStr | bool
    assert resolve_type_parameters(ExtendedUnion) == (int | float | str | bool)

    # Test union in dict values and keys
    type FlexibleKey = str | int
    type FlexibleValue = list[int] | dict[str, str] | None
    assert (
        resolve_type_parameters(dict[FlexibleKey, FlexibleValue]) == dict[str | int, list[int] | dict[str, str] | None]
    )

    # Test union with same type repeated (Python may or may not normalize this)
    type RepeatUnion = int | int | str  # noqa: PYI016
    # Python's union implementation may deduplicate, so we accept both
    assert resolve_type_parameters(RepeatUnion) == (int | str) or resolve_type_parameters(RepeatUnion) == (int | int | str)  # fmt: skip

    # Test union with TypeAliasType in multiple positions
    type AliasA = list[int]
    type AliasB = dict[str, int]
    type AliasC = set[str]
    type MultiAliasUnion = AliasA | AliasB | AliasC
    assert resolve_type_parameters(MultiAliasUnion) == (list[int] | dict[str, int] | set[str])

    # Test generic union with substitution
    type Result[T, E] = T | E
    assert resolve_type_parameters(Result[int, str]) == (int | str)
    assert resolve_type_parameters(Result[list[int], dict[str, str]]) == (list[int] | dict[str, str])

    # Test union with None (Optional pattern) in various positions
    type OptionalInt = int | None
    type ListOfOptional = list[OptionalInt]
    assert resolve_type_parameters(ListOfOptional) == list[int | None]

    # Test union with multiple levels of aliased unions
    type UnionA = int | str
    type UnionB = bool | float
    type CombinedUnion = UnionA | UnionB
    assert resolve_type_parameters(CombinedUnion) == (int | str | bool | float)

    # Test union as generic parameter with nested aliases
    type NestedAlias = list[int]
    type UnionParam[T] = dict[str, T | None]
    assert resolve_type_parameters(UnionParam[NestedAlias]) == dict[str, list[int] | None]

    # Test complex scenario: generic alias that returns a union, used in another union
    type ComplexUnion[T] = MaybeList[T] | dict[str, T]
    assert resolve_type_parameters(ComplexUnion[int]) == (list[int] | None | dict[str, int])

    # Test union in tuple with multiple aliased elements
    type AliasInt = int
    type AliasStr = str
    type TupleWithUnions = tuple[AliasInt | None, list[AliasStr | bool]]
    assert resolve_type_parameters(TupleWithUnions) == tuple[int | None, list[str | bool]]

    # Test three-way union with all aliased types
    type TypeA = list[int]
    type TypeB = dict[str, str]
    type TypeC = set[bool]
    type ThreeWayUnion = TypeA | TypeB | TypeC
    assert resolve_type_parameters(ThreeWayUnion) == (list[int] | dict[str, str] | set[bool])

    # Test union where members themselves contain unions
    type InnerUnion1 = int | str
    type InnerUnion2 = bool | float
    type OuterUnion = list[InnerUnion1] | dict[str, InnerUnion2]
    assert resolve_type_parameters(OuterUnion) == (list[int | str] | dict[str, bool | float])

    # Test generic union with nested type aliases in parameters
    type Box[T] = list[T]
    type OptionBox[T] = Box[T] | None
    assert resolve_type_parameters(OptionBox[int | str]) == (list[int | str] | None)

    # Test union with mix of generic and non-generic aliases
    type SimpleAlias = int
    type GenericAlias[T] = list[T]
    type MixedUnion[T] = SimpleAlias | GenericAlias[T]
    assert resolve_type_parameters(MixedUnion[str]) == (int | list[str])

    # Test generic alias that returns the parameter unchanged
    type Same[T] = T
    assert resolve_type_parameters(Same[int]) == int  # noqa: E721
    assert resolve_type_parameters(Same[list[str]]) == list[str]
    assert resolve_type_parameters(Same[Same[int]]) == int  # noqa: E721

    # Test deeply nested generics
    type Deep = dict[str, list[tuple[int, set[str]]]]
    assert resolve_type_parameters(Deep) == dict[str, list[tuple[int, set[str]]]]

    # Test union in complex nested structure
    type Data[T] = dict[str, list[T] | None]
    assert resolve_type_parameters(Data[int | str]) == dict[str, list[int | str] | None]

    # Test alias in tuple with mixed types
    type Mixed = tuple[int, list[str], dict[str, int]]
    assert resolve_type_parameters(Mixed) == tuple[int, list[str], dict[str, int]]
================================================
FILE: src/cachew/tests/utils.py
================================================
import gc
import os
import sys
from contextlib import contextmanager
from pathlib import Path
import pytest
# Directory (sibling of this file) where pyinstrument HTML reports are written.
PROFILES = Path(__file__).absolute().parent / 'profiles'
@contextmanager
def profile(name: str):
    """Context manager profiling the enclosed code with pyinstrument.

    Profiling only kicks in when the PYINSTRUMENT env variable is set;
    otherwise this is a no-op (pyinstrument noticeably slows down the
    profiled code). Results go to stderr and to an HTML report in PROFILES.
    """
    enabled = os.environ.get('PYINSTRUMENT') is not None
    if not enabled:
        yield
        return

    from pyinstrument import Profiler

    prof = Profiler()
    with prof:
        yield

    PROFILES.mkdir(exist_ok=True)
    report = PROFILES / f"{name}.html"
    print("results for " + name, file=sys.stderr)
    prof.print()
    report.write_text(prof.output_html())
def timer(name: str):
    """Return a codetiming.Timer that reports '<name>: X.XXs' on exit."""
    from codetiming import Timer

    fmt = name + ': ' + '{:.2f}s'
    return Timer(name=name, text=fmt)
@pytest.fixture
def gc_control(*, gc_on: bool):
    """Fixture controlling the garbage collector for a test.

    With gc_on=True nothing is changed (GC is enabled by default);
    otherwise GC is switched off for the duration of the test and
    reliably re-enabled afterwards.
    """
    if not gc_on:
        gc.disable()
        try:
            yield
        finally:
            gc.enable()
    else:
        # GC is already on by default -- nothing to tweak
        yield
# True when running under CI (most CI providers set the CI env variable)
running_on_ci = 'CI' in os.environ
================================================
FILE: src/cachew/utils.py
================================================
from collections.abc import Mapping
from types import UnionType
from typing import TypeAliasType, TypeVar, get_args, get_origin
# https://stackoverflow.com/a/2166841/706389
def is_namedtuple(t) -> bool:
    """Heuristically detect whether t is a namedtuple *class*.

    Checks the structural fingerprint that namedtuple classes share:
    tuple as the single base class and a _fields attribute that is a
    tuple of plain strings.
    """
    bases = getattr(t, '__bases__', None)
    if bases is None:
        return False
    if len(bases) != 1:
        return False
    if bases[0] is not tuple:
        return False
    fields = getattr(t, '_fields', None)
    if not isinstance(fields, tuple):
        return False
    # exact str check on purpose: namedtuple field names are plain strings
    return all(type(field) == str for field in fields)  # noqa: E721
def resolve_type_parameters(t) -> type:
    """Rewrite t with all type aliases and bound TypeVars substituted away.

    Public entry point; delegates to the recursive worker starting from an
    empty TypeVar binding environment.
    """
    empty_env: Mapping[TypeVar, type] = {}
    return _resolve_type_parameters_aux(t, typevar_to_type=empty_env)
def _resolve_type_parameters_aux(t, *, typevar_to_type: Mapping[TypeVar, type]) -> type:
if isinstance(t, TypeVar):
return typevar_to_type[t]
# This is the 'left hand side' case, i.e. in type ... =
if isinstance(t, TypeAliasType):
return _resolve_type_parameters_aux(t.__value__, typevar_to_type=typevar_to_type)
# note: args is never none
raw_args = get_args(t)
resolved_args = tuple(_resolve_type_parameters_aux(arg, typevar_to_type=typevar_to_type) for arg in raw_args)
# UnionType: resolve each member of the union
if isinstance(t, UnionType):
# Reconstruct the union with resolved args
result = resolved_args[0]
for arg in resolved_args[1:]:
result = result | arg # type: ignore[assignment]
return result
origin = get_origin(t)
# Must be a non-generic type
if origin is None:
return t
# This is the 'right hand side', e.g. '... = Id[int]' matches this
if isinstance(origin, TypeAliasType):
type_params = origin.__type_params__
new_typevar_to_type: Mapping[TypeVar, type] = {
**typevar_to_type,
**dict(zip(type_params, resolved_args, strict=True)), # type: ignore[arg-type]
}
return _resolve_type_parameters_aux(origin.__value__, typevar_to_type=new_typevar_to_type)
# Just a regular generic type
return origin[resolved_args]
================================================
FILE: tox.ini
================================================
[tox]
minversion = 4
# relies on the correct version of Python installed
# (we rely on CI for the test matrix)
envlist = ruff,tests,mypy,ty
# https://github.com/tox-dev/tox/issues/20#issuecomment-247788333
# hack to prevent .tox from crapping into the project directory
toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox
[testenv]
# TODO how to get package name from setuptools?
package_name = "cachew"
pass_env =
# useful for tests to know they are running under ci
CI
CI_*
# respect user's cache dirs to prevent tox from crapping into project dir
PYTHONPYCACHEPREFIX
MYPY_CACHE_DIR
RUFF_CACHE_DIR
set_env =
# do not add current working directory to pythonpath
# generally this is more robust and safer, prevents weird issues later on
PYTHONSAFEPATH=1
runner = uv-venv-lock-runner
uv_sync_locked = false
[testenv:ruff]
skip_install = true
dependency_groups = testing
commands =
{envpython} -m ruff check \
{posargs}
[testenv:tests]
dependency_groups = testing
commands =
# posargs allow test filtering, e.g. tox ... -- -k test_name
{envpython} -m pytest \
--pyargs {[testenv]package_name} \
{posargs}
[testenv:mypy]
dependency_groups = typecheck
commands =
{envpython} -m mypy --no-install-types \
-p {[testenv]package_name} \
--txt-report .coverage.mypy \
--html-report .coverage.mypy \
# this is for github actions to upload to codecov.io
# sadly xml coverage crashes on windows... so we need to disable it
{env:CI_MYPY_COVERAGE} \
{posargs}
[testenv:ty]
dependency_groups = typecheck
commands =
{envpython} -m ty \
check \
{posargs}
================================================
FILE: ty.toml
================================================
[src]
exclude = [
"doc/test_serialization.py",
]