[
  {
    "path": ".ci/release",
    "content": "#!/usr/bin/env python3\n'''\nDeploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]].\n\n- running manually\n\n  You'll need =UV_PUBLISH_TOKEN= env variable\n\n- running on Github Actions\n\n  Instead of env variable, relies on configuring github as Trusted publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi\n\n  It's running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]].\n  Packages are deployed on:\n  - every master commit, onto test pypi\n  - every new tag, onto production pypi\n'''\n\nUV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN'\n\nimport argparse\nimport os\nfrom pathlib import Path\nfrom subprocess import check_call\n\nis_ci = os.environ.get('CI') is not None\n\n\ndef main() -> None:\n    p = argparse.ArgumentParser()\n    p.add_argument('--use-test-pypi', action='store_true')\n    args = p.parse_args()\n\n    publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else []\n\n    root = Path(__file__).absolute().parent.parent\n    os.chdir(root)  # just in case\n\n    check_call(['uv', 'build', '--clear'])\n\n    if not is_ci:\n        # CI relies on trusted publishers so doesn't need env variable\n        assert UV_PUBLISH_TOKEN in os.environ, f'no {UV_PUBLISH_TOKEN} passed'\n\n    check_call(['uv', 'publish', *publish_url])\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": ".ci/run",
    "content": "#!/bin/bash\nset -eu\n\ncd \"$(dirname \"$0\")\"\ncd .. # git root\n\nif ! command -v sudo; then\n    # CI or Docker sometimes doesn't have it, so useful to have a dummy\n    function sudo {\n        \"$@\"\n    }\nfi\n\n# --parallel-live to show outputs while it's running\ntox_cmd='run-parallel --parallel-live'\nif [ -n \"${CI-}\" ]; then\n    # install OS specific stuff here\n    case \"$OSTYPE\" in\n    darwin*) \n        # macos\n        :\n        ;;\n    cygwin* | msys* | win*)\n        # windows\n        # ugh. parallel stuff seems super flaky under windows, some random failures, \"file used by other process\" and crap like that\n        tox_cmd='run'\n        ;;\n    *)\n        # must be linux?\n        :\n        ;;\n    esac\nfi\n\n# NOTE: expects uv installed\nuv tool run --with tox-uv tox $tox_cmd \"$@\"\n"
  },
  {
    "path": ".gitattributes",
    "content": "*.ipynb filter=nbstripout\n\n*.ipynb diff=ipynb\n"
  },
  {
    "path": ".github/workflows/main.yml",
    "content": "# see https://github.com/karlicoss/pymplate for up-to-date reference\n\nname: CI\non:\n  push:\n    branches: '*'\n    tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi\n    # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug:\n\n  # Needed to trigger on others' PRs.\n  # Note that people who fork it need to go to \"Actions\" tab on their fork and click \"I understand my workflows, go ahead and enable them\".\n  pull_request:\n\n  # Needed to trigger workflows manually.\n  workflow_dispatch:\n    inputs:\n      debug_enabled:\n        type: boolean\n        description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'\n        required: false\n        default: false\n\n  schedule:\n    - cron: '31 18 * * 5'  # run every Friday\n\n\njobs:\n  build:\n    strategy:\n      fail-fast: false\n      matrix:\n        platform: [ubuntu-latest, macos-latest]  # windows-latest\n        python-version: ['3.12', '3.13', '3.14']\n        # vvv just an example of excluding stuff from matrix\n        # exclude: [{platform: macos-latest, python-version: '3.6'}]\n\n    runs-on: ${{ matrix.platform }}\n\n    # useful for 'optional' pipelines\n    # continue-on-error: ${{ matrix.platform == 'windows-latest' }}\n\n    steps:\n    # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation\n    - run: echo \"$HOME/.local/bin\" >> $GITHUB_PATH\n\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n        fetch-depth: 0  # nicer to have all git history when debugging/for tests\n\n    - uses: actions/setup-python@v6\n      with:\n        python-version: ${{ matrix.python-version }}\n      \n    - uses: astral-sh/setup-uv@v7\n      with:\n        enable-cache: false  # we don't have lock files, so can't use them as cache key\n\n    - uses: mxschmitt/action-tmate@v3\n      if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}\n\n    # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd...\n    - run: bash .ci/run\n      env:\n        # only compute lxml coverage on ubuntu; it crashes on windows\n        CI_MYPY_COVERAGE: ${{ matrix.platform == 'ubuntu-latest' && '--cobertura-xml-report .coverage.mypy' || '' }}\n\n    - if: matrix.platform == 'ubuntu-latest'  # no need to compute coverage for other platforms\n      uses: codecov/codecov-action@v5\n      with:\n        fail_ci_if_error: true  # default false\n        token: ${{ secrets.CODECOV_TOKEN }}\n        flags: mypy-${{ matrix.python-version }}\n        files: .coverage.mypy/cobertura.xml\n\n\n  pypi:\n    # Do not run it for PRs/cron schedule etc.\n    # NOTE: release tags are guarded by on: push: tags on the top.\n    if: github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') || (github.event.ref == format('refs/heads/{0}', github.event.repository.master_branch)))\n    # Ugh, I tried using matrix or something to explicitly generate only test pypi or prod pypi pipelines.\n    # But github actions is so shit, it's impossible to do any logic at all, e.g. 
doesn't support conditional matrix, if/else statements for variables etc.\n\n    needs: [build] # add all other jobs here\n\n    runs-on: ubuntu-latest\n\n    permissions:\n      # necessary for Trusted Publishing\n      id-token: write\n\n    steps:\n    # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation\n    - run: echo \"$HOME/.local/bin\" >> $GITHUB_PATH\n\n    - uses: actions/checkout@v6\n      with:\n        submodules: recursive\n        fetch-depth: 0  # pull all commits to correctly infer vcs version\n\n    - uses: actions/setup-python@v6\n      with:\n        python-version: '3.12'\n\n    - uses: astral-sh/setup-uv@v7\n      with:\n        enable-cache: false  # we don't have lock files, so can't use them as cache key\n\n    - name: 'release to test pypi'\n      # always deploy merged master to test pypi\n      if: github.event.ref == format('refs/heads/{0}', github.event.repository.master_branch)\n      run: .ci/release --use-test-pypi\n\n    - name: 'release to prod pypi'\n      # always deploy tags to release pypi\n      if: startsWith(github.event.ref, 'refs/tags/')\n      run: .ci/release\n"
  },
  {
    "path": ".gitignore",
    "content": "\n# Created by https://www.gitignore.io/api/python,emacs\n# Edit at https://www.gitignore.io/?templates=python,emacs\n\n### Emacs ###\n# -*- mode: gitignore; -*-\n*~\n\\#*\\#\n/.emacs.desktop\n/.emacs.desktop.lock\n*.elc\nauto-save-list\ntramp\n.\\#*\n\n# Org-mode\n.org-id-locations\n*_archive\n\n# flymake-mode\n*_flymake.*\n\n# eshell files\n/eshell/history\n/eshell/lastdir\n\n# elpa packages\n/elpa/\n\n# reftex files\n*.rel\n\n# AUCTeX auto folder\n/auto/\n\n# cask packages\n.cask/\ndist/\n\n# Flycheck\nflycheck_*.el\n\n# server auth directory\n/server/\n\n# projectiles files\n.projectile\n\n# directory configuration\n.dir-locals.el\n\n# network security\n/network-security.data\n\n\n### Python ###\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# End of https://www.gitignore.io/api/python,emacs\n\nuntracked/\n"
  },
  {
    "path": ".idea/dictionaries/karlicos.xml",
    "content": "<component name=\"ProjectDictionaryState\">\n  <dictionary name=\"karlicos\">\n    <words>\n      <w>cachew</w>\n      <w>dataclassish</w>\n      <w>pylint</w>\n      <w>typecheck</w>\n    </words>\n  </dictionary>\n</component>"
  },
  {
    "path": "LICENSE.txt",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2019 Dima Gerasimov\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import ast\\n\",\n    \"from pathlib import Path\\n\",\n    \"\\n\",\n    \"import jedi  # ty: ignore[unresolved-import]\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def git_root() -> Path:\\n\",\n    \"    import subprocess\\n\",\n    \"\\n\",\n    \"    path_s = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], text=True).strip()\\n\",\n    \"    path = Path(path_s)\\n\",\n    \"    assert path.is_absolute(), path  # just in case\\n\",\n    \"    return path\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"src_dir = git_root() / 'src'\\n\",\n    \"assert src_dir.exists(), src_dir  # seems like jedi is pretty quiet about missing dirs..\\n\",\n    \"\\n\",\n    \"project = jedi.Project(src_dir)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def _find(name: str):\\n\",\n    \"    # ugh. sometimes it returns exact dupes for no apparent reason??\\n\",\n    \"    completions = set(project.search(name, all_scopes=True))\\n\",\n    \"    assert len(completions) == 1, f\\\"Expected one completion for {name}, got {completions}\\\"\\n\",\n    \"    [c] = completions\\n\",\n    \"    [c] = c.goto()  # todo what is this for?\\n\",\n    \"    return c\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def rlink(name: str) -> str:\\n\",\n    \"    c = _find(name)\\n\",\n    \"    if c.module_path is None:\\n\",\n    \"        # TODO ugh raise an issue on tracker or something??\\n\",\n    \"        # seems to only happen for namsepace packages..\\n\",\n    \"        assert c.description.startswith('namespace '), c\\n\",\n    \"        res = name.replace('.', '/')\\n\",\n    \"        assert (src_dir / res).exists(), res\\n\",\n    \"        return f'src/{res}'\\n\",\n    \"    else:\\n\",\n    \"        rpath = Path(c.module_path).relative_to(src_dir)\\n\",\n    \"        return f'src/{rpath}#L{c.line}'\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# TODO ugh.. 
annoying, seems like Jedi can't get the functions source?\\n\",\n    \"# maybe because it's doing partial parsing or something?\\n\",\n    \"# there is c._get_module_context().code_lines, but it returns all lines in a source file??\\n\",\n    \"def getsource(symbol: str) -> str:\\n\",\n    \"    c = _find(symbol)\\n\",\n    \"    p = Path(c.module_path)\\n\",\n    \"    # TODO check that it's a function?\\n\",\n    \"    function_name = symbol.split('.')[-1]\\n\",\n    \"    assert p.exists(), p\\n\",\n    \"    src = p.read_text()\\n\",\n    \"    src_lines = src.splitlines(keepends=True)\\n\",\n    \"    for x in ast.walk(ast.parse(src)):\\n\",\n    \"        if isinstance(x, ast.FunctionDef) and x.name == function_name:\\n\",\n    \"            break\\n\",\n    \"    else:\\n\",\n    \"        raise RuntimeError(f'Function not found: {symbol}')\\n\",\n    \"\\n\",\n    \"    # ugh lineno is 1-indexed, and seems like a closed interval?\\n\",\n    \"    return ''.join(src_lines[x.lineno - 1 : x.end_lineno])\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def getdoc(symbol: str) -> str:\\n\",\n    \"    c = _find(symbol)\\n\",\n    \"    doc = c.docstring()\\n\",\n    \"    assert doc is not None, symbol\\n\",\n    \"    return doc\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# TODO just get rid of this in favor of native markdown + rlink?\\n\",\n    \"def flink(title: str, name: str | None = None) -> str:\\n\",\n    \"    if name is None:\\n\",\n    \"        name = title.replace('`', '')  # meh\\n\",\n    \"    if name.startswith('tests'):\\n\",\n    \"        name = name.replace('tests', 'cachew.tests.test_cachew')\\n\",\n    \"        # FIXME just replace in code..\\n\",\n    \"\\n\",\n    \"    return f\\\"[{title}]({rlink(name)})\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from IPython.display import Markdown as md  # ty: ignore[unresolved-import]\\n\",\n    \"\\n\",\n    \"dmd = lambda x: display(md(x.strip()))  # ty: ignore[unresolved-reference]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"autoscroll\": false,\n    \"ein.hycell\": false,\n    \"ein.tags\": \"worksheet-0\",\n    \"slideshow\": {\n     \"slide_type\": \"-\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"dmd('''\\n\",\n    \"<!--\\n\",\n    \"THIS FILE IS AUTOGENERATED BY README.ipynb.\\n\",\n    \"Ideally you should edit README.ipynb and use 'generate-readme' to produce README.md.\\n\",\n    \"But it's okay to edit README.md too directly if you want to fix something -- I can run generate-readme myself later.\\n\",\n    \"-->\\n\",\n    \"''')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"ein.tags\": \"worksheet-0\",\n    \"slideshow\": {\n     \"slide_type\": \"-\"\n    }\n   },\n   \"source\": [\n    \"# What is Cachew?\\n\",\n    \"TLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). 
The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.\\n\",\n    \"Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.\\n\",\n    \"\\n\",\n    \"In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.\\n\",\n    \"\\n\",\n    \"A simple type is defined as:\\n\",\n    \"\\n\",\n    \"- primitive: `str`/`int`/`float`/`bool`\\n\",\n    \"- JSON-like types (`dict`/`list`/`tuple`)\\n\",\n    \"- `datetime`\\n\",\n    \"- `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )\\n\",\n    \"- [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)\\n\",\n    \"- [dataclasses](https://docs.python.org/3/library/dataclasses.html)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"That allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing.\\n\",\n    \"Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.\\n\",\n    \"\\n\",\n    \"## Motivation\\n\",\n    \"\\n\",\n    \"I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting few bits I'm interested at. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.\\n\",\n    \"\\n\",\n    \"Conventional way of dealing with it is serializing results along with some sort of hash (e.g. md5) of input files,\\n\",\n    \"comparing on the next run and returning cached data if nothing changed.\\n\",\n    \"\\n\",\n    \"Simple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"# Examples\\n\",\n    \"## Processing Wikipedia\\n\",\n    \"Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive.\\n\",\n    \"Parsing it (`extract_links` function) takes hours, however, as long as the archive is same you will always get same results. So it would be nice to be able to cache the results somehow.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"With this library your can achieve it through single `@cachew` decorator.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"autoscroll\": false,\n    \"ein.hycell\": false,\n    \"ein.tags\": \"worksheet-0\",\n    \"slideshow\": {\n     \"slide_type\": \"-\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# FIXME hmm seems like this doesn't work if there are type annotations on cachew_impl? odd\\n\",\n    \"# likely this? 
https://github.com/davidhalter/jedi/issues/2025\\n\",\n    \"doc = getdoc('cachew_impl').split('Usage example:')[-1].lstrip()\\n\",\n    \"dmd(f\\\"\\\"\\\"```python\\n\",\n    \"{doc}\\n\",\n    \"```\\\"\\\"\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.\\n\",\n    \"\\n\",\n    \"When you use newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Incremental data exports\\n\",\n    \"This is my most common usecase of cachew, which I'll illustrate with example.\\n\",\n    \"\\n\",\n    \"I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.\\n\",\n    \"Data is synchronized via bluetooth in the sqlite database, which is easy to access. However sensor has limited memory (e.g. 1000 latest measurements).\\n\",\n    \"That means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.:\\n\",\n    \"\\n\",\n    \"    ...\\n\",\n    \"    20190715100026.db\\n\",\n    \"    20190716100138.db\\n\",\n    \"    20190717101651.db\\n\",\n    \"    20190718100118.db\\n\",\n    \"    20190719100701.db\\n\",\n    \"    ...\\n\",\n    \"\\n\",\n    \"To access **all** of historic temperature data, I have two options:\\n\",\n    \"\\n\",\n    \"- Go through all the data chunks every time I wan to access them and 'merge' into a unified stream of measurements, e.g. something like:\\n\",\n    \"  \\n\",\n    \"      def measurements(chunks: List[Path]) -> Iterator[Measurement]:\\n\",\n    \"          for chunk in chunks:\\n\",\n    \"              # read measurements from 'chunk' and yield unseen ones\\n\",\n    \"\\n\",\n    \"  This is very **easy, but slow** and you waste CPU for no reason every time you need data.\\n\",\n    \"\\n\",\n    \"- Keep a 'master' database and write code to merge chunks in it.\\n\",\n    \"\\n\",\n    \"  This is very **efficient, but tedious**:\\n\",\n    \"  \\n\",\n    \"  - requires serializing/deserializing data -- boilerplate\\n\",\n    \"  - requires manually managing sqlite database -- error prone, hard to get right every time\\n\",\n    \"  - requires careful scheduling, ideally you want to access new data without having to refresh cache\\n\",\n    \"\\n\",\n    \"  \\n\",\n    \"Cachew gives the best of two worlds and makes it both **easy and efficient**. The only thing you have to do is to decorate your function:\\n\",\n    \"\\n\",\n    \"    @cachew      \\n\",\n    \"    def measurements(chunks: List[Path]) -> Iterator[Measurement]:\\n\",\n    \"        # ...\\n\",\n    \"        \\n\",\n    \"- as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast\\n\",\n    \"- you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. 
you got new data)\\n\",\n    \"\\n\",\n    \"  All the complexity of handling database is hidden in `cachew` implementation.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"autoscroll\": false,\n    \"ein.hycell\": false,\n    \"ein.tags\": \"worksheet-0\",\n    \"slideshow\": {\n     \"slide_type\": \"-\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"link = rlink('composite_hash')\\n\",\n    \"\\n\",\n    \"dmd(f'''\\n\",\n    \"# How it works\\n\",\n    \"\\n\",\n    \"- first your objects get {flink('converted', 'cachew.marshall.cachew.CachewMarshall')} into a simpler JSON-like representation\\n\",\n    \"- after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).\\n\",\n    \"\\n\",\n    \"When the function is called, cachew [computes the hash of your function's arguments ]({link})\\n\",\n    \"and compares it against the previously stored hash value.\\n\",\n    \"\\n\",\n    \"- If they match, it would deserialize and yield whatever is stored in the cache database\\n\",\n    \"- If the hash mismatches, the original function is called and new data is stored along with the new hash\\n\",\n    \"''')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"autoscroll\": false,\n    \"ein.hycell\": false,\n    \"ein.tags\": \"worksheet-0\",\n    \"slideshow\": {\n     \"slide_type\": \"-\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"dmd('# Features')\\n\",\n    \"types = [f'`{t}`' for t in ['str', 'int', 'float', 'bool', 'datetime', 'date', 'Exception']]\\n\",\n    \"dmd(f\\\"\\\"\\\"\\n\",\n    \"* automatic schema inference: {flink('1', 'tests.test_return_type_inference')}, {flink('2', 'tests.test_return_type_mismatch')}\\n\",\n    \"* supported types:\\n\",\n    \"\\n\",\n    \"    * primitive: {', '.join(types)}\\n\",\n    \"\\n\",\n    \"      See {flink('tests.test_types')}, {flink('tests.test_primitive')}, {flink('tests.test_dates')}, {flink('tests.test_exceptions')}\\n\",\n    \"    * {flink('@dataclass and NamedTuple', 'tests.test_dataclass')}\\n\",\n    \"    * {flink('Optional', 'tests.test_optional')} types\\n\",\n    \"    * {flink('Union', 'tests.test_union')} types\\n\",\n    \"    * {flink('nested datatypes', 'tests.test_nested')}\\n\",\n    \"\\n\",\n    \"* detects {flink('datatype schema changes', 'tests.test_schema_change')} and discards old data automatically\\n\",\n    \"\\\"\\\"\\\")\\n\",\n    \"# * custom hash function TODO example with mtime?\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Performance\\n\",\n    \"Updating cache takes certain overhead, but that would depend on how complicated your datatype in the first place, so I'd suggest measuring if you're not sure.\\n\",\n    \"\\n\",\n    \"During reading cache all that happens is reading blobls from sqlite/decoding as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.\\n\",\n    \"\\n\",\n    \"It would almost certainly make your program faster if your computations take more than several seconds.\\n\",\n    \"\\n\",\n    \"You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"autoscroll\": false,\n    \"ein.hycell\": false,\n   
 \"ein.tags\": \"worksheet-0\",\n    \"slideshow\": {\n     \"slide_type\": \"-\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"dmd(f\\\"\\\"\\\"\\n\",\n    \"# Using\\n\",\n    \"See {flink('docstring', 'cachew_impl')} for up-to-date documentation on parameters and return types.\\n\",\n    \"You can also use {flink('extensive unit tests', 'tests')} as a reference.\\n\",\n    \"\\n\",\n    \"Some useful (but optional) arguments of `@cachew` decorator:\\n\",\n    \"\\n\",\n    \"* `cache_path` can be a directory, or a callable that {flink('returns a path', 'tests.test_callable_cache_path')} and depends on function's arguments.\\n\",\n    \"\\n\",\n    \"   By default, `settings.DEFAULT_CACHEW_DIR` is used.\\n\",\n    \"\\n\",\n    \"* `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.\\n\",\n    \"\\n\",\n    \"   By default it just uses string representation of the arguments, you can also specify a custom callable.\\n\",\n    \"\\n\",\n    \"   For instance, it can be used to {flink('discard cache', 'tests.test_custom_hash')} if the input file was modified.\\n\",\n    \"\\n\",\n    \"* `cls` is the type that would be serialized.\\n\",\n    \"\\n\",\n    \"   By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.\\n\",\n    \"\\\"\\\"\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"ein.tags\": \"worksheet-0\",\n    \"slideshow\": {\n     \"slide_type\": \"-\"\n    }\n   },\n   \"source\": [\n    \"# Installing\\n\",\n    \"Package is available on [pypi](https://pypi.org/project/cachew/).\\n\",\n    \"\\n\",\n    \"    pip3 install --user cachew\\n\",\n    \"    \\n\",\n    \"## Developing\\n\",\n    \"I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"ein.tags\": \"worksheet-0\",\n    \"slideshow\": {\n     \"slide_type\": \"-\"\n    }\n   },\n   \"source\": [\n    \"# Implementation\\n\",\n    \"\\n\",\n    \"* why NamedTuples and dataclasses?\\n\",\n    \"  \\n\",\n    \"  `NamedTuple` and `dataclass` provide a very straightforward and self documenting way to represent data in Python.\\n\",\n    \"  Very compact syntax makes it extremely convenient even for one-off means of communicating between couple of functions.\\n\",\n    \"   \\n\",\n    \"  If you want to find out more why you should use more dataclasses in your code I suggest these links:\\n\",\n    \"  \\n\",\n    \"  - [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)\\n\",\n    \"  - [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)\\n\",\n    \"   \\n\",\n    \"* why not `pandas.DataFrame`?\\n\",\n    \"\\n\",\n    \"  DataFrames are great and can be serialised to csv or pickled.\\n\",\n    \"  They are good to have as one of the ways you can interface with your data, however hardly convenient to think about it abstractly due to their dynamic nature.\\n\",\n    \"  They also can't be nested.\\n\",\n    \"\\n\",\n    \"* why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?\\n\",\n    \"  \\n\",\n    \"  ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. 
It's also somewhat an overkill for such a specific purpose.\\n\",\n    \"\\n\",\n    \"  * E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires you using custom sqlalchemy specific types and inheriting a base class.\\n\",\n    \"    Also it doesn't support nested types.\\n\",\n    \"    \\n\",\n    \"* why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?\\n\",\n    \"\\n\",\n    \"  Pickling is kinda heavyweigh for plain data class, it's slower just using JSON. Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface.\\n\",\n    \"\\n\",\n    \"  Marshmallow is a common way to map data into db-friendly format, but it requires explicit schema which is an overhead when you have it already in the form of type annotations. I've looked at existing projects to utilize type annotations, but didn't find them covering all I wanted:\\n\",\n    \"  \\n\",\n    \"  * https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api\\n\",\n    \"  * https://pypi.org/project/marshmallow-dataclass\\n\",\n    \" \\n\",\n    \"  I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).\\n\",\n    \"  So far looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite.\\n\",\n    \"\\n\",\n    \"* why `sqlite` database for storage?\\n\",\n    \"\\n\",\n    \"  It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.\\n\",\n    \"\\n\",\n    \"  There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cache(backend='file')`, or via `settings.DEFAULT_BACKEND`.\\n\",\n    \"  It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.\\n\",\n    \"  \\n\",\n    \"  It would also be interesting to experiment with in-RAM storages.\\n\",\n    \"\\n\",\n    \"  I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Tips and tricks\\n\",\n    \"## Optional dependency\\n\",\n    \"You can benefit from `cachew` even if you don't want to bloat your app's dependencies. 
Just use the following snippet:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"dmd(f\\\"\\\"\\\"```python\\n\",\n    \"{getsource('cachew.extra.mcachew')}\\n\",\n    \"```\\\"\\\"\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.\\n\",\n    \"\\n\",\n    \"## Settings\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"dmd(f'''\\n\",\n    \"{flink('cachew.settings')} exposes some parameters that allow you to control `cachew` behaviour:\\n\",\n    \"- `ENABLE`: set to `False` if you want to disable caching for without removing the decorators (useful for testing and debugging).\\n\",\n    \"   You can also use {flink('cachew.extra.disabled_cachew')} context manager to do it temporarily.\\n\",\n    \"- `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the \\\"user cache directory\\\" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).\\n\",\n    \"- `THROW_ON_ERROR`: by default, cachew is defensive and simply attemps to cause the original function on caching issues.\\n\",\n    \"   Set to `True` to catch errors earlier.\\n\",\n    \"- `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).\\n\",\n    \"\\n\",\n    \"''')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Updating this readme\\n\",\n    \"This is a literate readme, implemented as a Jupiter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use [generate-readme](generate-readme) script.\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Tags\",\n  \"kernelspec\": {\n   \"display_name\": \"cachew\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.12.4\"\n  },\n  \"name\": \"README.ipynb\"\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "README.md",
    "content": "<!--\nTHIS FILE IS AUTOGENERATED BY README.ipynb.\nIdeally you should edit README.ipynb and use 'generate-readme' to produce README.md.\nBut it's okay to edit README.md too directly if you want to fix something -- I can run generate-readme myself later.\n-->\n\n\n# What is Cachew?\nTLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.\nCache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.\n\nIn order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.\n\nA simple type is defined as:\n\n- primitive: `str`/`int`/`float`/`bool`\n- JSON-like types (`dict`/`list`/`tuple`)\n- `datetime`\n- `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )\n- [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)\n- [dataclasses](https://docs.python.org/3/library/dataclasses.html)\n\n\nThat allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing.\nThanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.\n\n## Motivation\n\nI often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting few bits I'm interested at. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.\n\nConventional way of dealing with it is serializing results along with some sort of hash (e.g. md5) of input files,\ncomparing on the next run and returning cached data if nothing changed.\n\nSimple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task.\n\n\n# Examples\n## Processing Wikipedia\nImagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive.\nParsing it (`extract_links` function) takes hours, however, as long as the archive is same you will always get same results. So it would be nice to be able to cache the results somehow.\n\n\nWith this library your can achieve it through single `@cachew` decorator.\n\n\n```python\n>>> from typing import NamedTuple, Iterator\n>>> class Link(NamedTuple):\n...     url : str\n...     text: str\n...\n>>> @cachew\n... def extract_links(archive_path: str) -> Iterator[Link]:\n...     for i in range(5):\n...         # simulate slow IO\n...         # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours\n...         import time; time.sleep(1)\n...         
yield Link(url=f'http://link{i}.org', text=f'text {i}')\n...\n>>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run\n[Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')]\n\n>>> from timeit import Timer\n>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1)\n... # second run is cached, so should take less time\n>>> print(f\"call took {int(res)} seconds\")\ncall took 0 seconds\n\n>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1)\n... # now file has changed, so the cache will be discarded\n>>> print(f\"call took {int(res)} seconds\")\ncall took 5 seconds\n```\n\n\nWhen you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.\n\nWhen you use newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately.\n\n## Incremental data exports\nThis is my most common usecase of cachew, which I'll illustrate with example.\n\nI'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.\nData is synchronized via bluetooth in the sqlite database, which is easy to access. However sensor has limited memory (e.g. 1000 latest measurements).\nThat means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.:\n\n    ...\n    20190715100026.db\n    20190716100138.db\n    20190717101651.db\n    20190718100118.db\n    20190719100701.db\n    ...\n\nTo access **all** of historic temperature data, I have two options:\n\n- Go through all the data chunks every time I wan to access them and 'merge' into a unified stream of measurements, e.g. something like:\n  \n      def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n          for chunk in chunks:\n              # read measurements from 'chunk' and yield unseen ones\n\n  This is very **easy, but slow** and you waste CPU for no reason every time you need data.\n\n- Keep a 'master' database and write code to merge chunks in it.\n\n  This is very **efficient, but tedious**:\n  \n  - requires serializing/deserializing data -- boilerplate\n  - requires manually managing sqlite database -- error prone, hard to get right every time\n  - requires careful scheduling, ideally you want to access new data without having to refresh cache\n\n  \nCachew gives the best of two worlds and makes it both **easy and efficient**. The only thing you have to do is to decorate your function:\n\n    @cachew      \n    def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n        # ...\n        \n- as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast\n- you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. 
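\n\nFor illustration, here's a minimal self-contained sketch of the above (hypothetical: `Measurement` and `read_chunk` are stand-ins for your real data type and chunk-reading logic; the `@cachew` decorator is the only part that matters):\n\n```python\nfrom dataclasses import dataclass\nfrom datetime import datetime\nfrom pathlib import Path\nfrom typing import Iterator, List\n\nfrom cachew import cachew\n\n\n@dataclass\nclass Measurement:\n    dt: datetime\n    temp: float\n\n\ndef read_chunk(chunk: Path) -> Iterator[Measurement]:\n    # stand-in for reading the sensor's sqlite database\n    yield Measurement(dt=datetime.fromtimestamp(chunk.stat().st_mtime), temp=20.0)\n\n\n@cachew  # results are cached under settings.DEFAULT_CACHEW_DIR\ndef measurements(chunks: List[Path]) -> Iterator[Measurement]:\n    for chunk in chunks:\n        yield from read_chunk(chunk)\n```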
\n\n\n# How it works\n\n- first your objects get [converted](src/cachew/marshall/cachew.py#L29) into a simpler JSON-like representation\n- after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).\n\nWhen the function is called, cachew [computes the hash of your function's arguments](src/cachew/__init__.py#L580)\nand compares it against the previously stored hash value.\n\n- If they match, it deserializes and yields whatever is stored in the cache database\n- If the hash mismatches, the original function is called and new data is stored along with the new hash\n\n\n\n
# Features\n\n\n\n* automatic schema inference: [1](src/cachew/tests/test_cachew.py#L381), [2](src/cachew/tests/test_cachew.py#L395)\n* supported types:\n\n    * primitive: `str`, `int`, `float`, `bool`, `datetime`, `date`, `Exception`\n\n      See [tests.test_types](src/cachew/tests/test_cachew.py#L682), [tests.test_primitive](src/cachew/tests/test_cachew.py#L720), [tests.test_dates](src/cachew/tests/test_cachew.py#L632), [tests.test_exceptions](src/cachew/tests/test_cachew.py#L1124)\n    * [@dataclass and NamedTuple](src/cachew/tests/test_cachew.py#L597)\n    * [Optional](src/cachew/tests/test_cachew.py#L524) types\n    * [Union](src/cachew/tests/test_cachew.py#L827) types\n    * [nested datatypes](src/cachew/tests/test_cachew.py#L440)\n\n* detects [datatype schema changes](src/cachew/tests/test_cachew.py#L470) and discards old data automatically\n\n\n
# Performance\nUpdating the cache takes a certain overhead, but that depends on how complicated your datatype is in the first place, so I'd suggest measuring if you're not sure.\n\nDuring cache reads, all that happens is reading blobs from sqlite, decoding them as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.\n\nIt would almost certainly make your program faster if your computations take more than several seconds.\n\nYou can find some of my performance tests in the [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py).\n\n\n
# Using\nSee [docstring](src/cachew/__init__.py#L279) for up-to-date documentation on parameters and return types.\nYou can also use [extensive unit tests](src/cachew/tests/test_cachew.py#L1) as a reference.\n\nSome useful (but optional) arguments of the `@cachew` decorator:\n\n* `cache_path` can be a directory, or a callable that [returns a path](src/cachew/tests/test_cachew.py#L417) and depends on the function's arguments.\n\n   By default, `settings.DEFAULT_CACHEW_DIR` is used.\n\n* `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.\n\n   By default it just uses the string representation of the arguments; you can also specify a custom callable.\n\n   For instance, it can be used to [discard cache](src/cachew/tests/test_cachew.py#L115) if the input file was modified.\n\n* `cls` is the type that will be serialized.\n\n   By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.
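\n\nFor instance, a hypothetical sketch combining `cache_path` and `depends_on` (`/tmp/cachew-demo` and `parse_log` are made up for the example; here `depends_on` keys the cache on the file's path and mtime, so editing the file invalidates the cache):\n\n```python\nfrom pathlib import Path\nfrom typing import Iterator\n\nfrom cachew import cachew\n\n\n@cachew(\n    cache_path='/tmp/cachew-demo',  # a directory for the cache; omit to use DEFAULT_CACHEW_DIR\n    depends_on=lambda path: (path, path.stat().st_mtime),  # invalidate when the file is modified\n)\ndef parse_log(path: Path) -> Iterator[str]:\n    # stand-in for actual parsing\n    yield from path.read_text().splitlines()\n```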
\n\n\n# Installing\nPackage is available on [pypi](https://pypi.org/project/cachew/).\n\n    pip3 install --user cachew\n\n## Developing\nI'm using [tox](tox.ini) to run tests, and [GitHub Actions](.github/workflows/main.yml) for CI.\n\n
# Implementation\n\n* why NamedTuples and dataclasses?\n\n  `NamedTuple` and `dataclass` provide a very straightforward and self-documenting way to represent data in Python.\n  The very compact syntax makes them extremely convenient even as a one-off means of communicating between a couple of functions.\n\n  If you want to find out more about why you should use more dataclasses in your code, I suggest these links:\n\n  - [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)\n  - [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)\n\n* why not `pandas.DataFrame`?\n\n  DataFrames are great and can be serialised to csv or pickled.\n  They are good to have as one of the ways you can interface with your data, however they're hardly convenient to think about abstractly due to their dynamic nature.\n  They also can't be nested.\n\n* why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?\n\n  ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat overkill for such a specific purpose.\n\n  * E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires using custom sqlalchemy-specific types and inheriting from a base class.\n    Also it doesn't support nested types.\n\n* why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?\n\n  Pickling is kinda heavyweight for a plain data class; it's slower than just using JSON. Also, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface.\n\n  Marshmallow is a common way to map data into a db-friendly format, but it requires an explicit schema, which is an overhead when you already have it in the form of type annotations. I've looked at existing projects that utilize type annotations, but didn't find them covering all I wanted:\n\n  * https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api\n  * https://pypi.org/project/marshmallow-dataclass\n\n  I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).\n  So far it looks like only `cattrs` comes somewhere close to the feature set I need, but it's still not quite there.\n\n* why `sqlite` database for storage?\n\n  It's pretty efficient, and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.\n\n  There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cachew(backend='file')`, or via `settings.DEFAULT_BACKEND`.\n  It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.\n\n  It would also be interesting to experiment with in-RAM storages.\n\n  I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to the cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.\n\n\n
# Tips and tricks\n## Optional dependency\nYou can benefit from `cachew` even if you don't want to bloat your app's dependencies. Just use the following snippet:\n\n\n```python\ndef mcachew(*args, **kwargs):\n    \"\"\"\n    Stands for 'Maybe cachew'.\n    Defensive wrapper around @cachew to make it an optional dependency.\n    \"\"\"\n    try:\n        import cachew\n    except ModuleNotFoundError:\n        import warnings\n\n        warnings.warn(\n            'cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew',\n            stacklevel=2,\n        )\n        return lambda orig_func: orig_func\n    else:\n        return cachew.cachew(*args, **kwargs)\n\n```\n\n\nNow you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.\n\n
## Settings\n\n\n[cachew.settings](src/cachew/__init__.py#L55) exposes some parameters that allow you to control `cachew` behaviour:\n- `ENABLE`: set to `False` if you want to disable caching without removing the decorators (useful for testing and debugging).\n   You can also use the [cachew.extra.disabled_cachew](src/cachew/extra.py#L25) context manager to do it temporarily.\n- `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the \"user cache directory\" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).\n- `THROW_ON_ERROR`: by default, cachew is defensive and simply falls back to calling the original function on caching issues.\n   Set to `True` if you want to catch errors earlier.\n- `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).\n\n\n## Updating this readme\nThis is a literate readme, implemented as a Jupyter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use the [generate-readme](generate-readme) script.\n"
  },
  {
    "path": "benchmarks/20230912-comparison-with-legacy.org",
    "content": "Running on @karlicoss desktop PC, =python3.10=.\n\nThis is basically to justify switching to the new serialization method\n\n- old way, =legacy= used to 'flatten' the type into an sqlite row\n- new way, =cachew=, just dumps it as a dict, then to bytes via =orjson= and stores in a single sqlite column\n\nThe numbers between legacy and cachew can't be directly compared though.\nLegacy =serializing= step emits a tuple, which can be inserted directly into the database.\nSo to compare it with the new way, we need to compare with the sum of =serializing= + =json dump=.\nThat said this won't be exact comparison either, since legacy binder relied on sqlalchemy to dump custom types to sqlite types (e.g. =datetime= or =Exception=). So legacy will have a slight advantage this way, but it's fine.\n\nSo we can see that for:\n- =test_union_str_dataclass=\n  - new implementation: =0.53 + 0.45s= to serialize; =0.29 + 0.48= to deserialize\n  - old implementation: =2.38s= to serialize; =1.92= to deserialize\n- =test_nested_dataclass=\n  - new implementation: =1.05 + 0.26s= to serialize; =0.50 + 1.42= to deserialize\n  - old implementation: =1.92s= to serialize; =1.88= to deserialize\n\nFor both tests, serialization if quite a bit faster with the new implementation.\nOn the second test, they are on par for deserialization, but as I mention these numbers are in favor of the legacy implementation.\n\nIn addition, keeping everything in one column unlocks some othe optimizations which wouldn't be possible with multiple columns.\n\n\n#+begin_example\n$ pytest --pyargs cachew.tests.marshall -k 'gc_off and 1000000 and not cattrs' -s\n=========================================================== test session starts ============================================================\nplatform linux -- Python 3.10.12, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3\ncachedir: .pytest_cache\nrootdir: /code/cachew_jsonpickle\nconfigfile: pytest.ini\nplugins: anyio-3.6.2\ncollected 100 items / 95 deselected / 5 selected\n\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]\nbuilding      1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.34s\nserializing   1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.53s\njson dump     1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s\nsqlite dump   1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.08s\nsqlite load   1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s\njsonl dump    1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.18s\njsonl load    1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.13s\njson load     1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s\ndeserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.48s\nPASSED\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-legacy]\nbuilding      1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.35s\nserializing   1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 2.38s\njson dump     1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.22s\nsqlite dump   1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.06s\nsqlite load   1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s\njsonl dump    1000000 objects of type typing.Union[str, 
cachew.tests.marshall.Name]: 0.12s\njsonl load    1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s\njson load     1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.23s\ndeserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.92s\nPASSED\nsrc/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-cachew]\nbuilding      1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.58s\nserializing   1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.05s\njson dump     1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.26s\nsqlite dump   1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.03s\nsqlite load   1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.30s\njsonl dump    1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.14s\njsonl load    1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.14s\njson load     1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.50s\ndeserializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.42s\nPASSED\nsrc/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-legacy]\nbuilding      1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.56s\nserializing   1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.92s\njson dump     1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.21s\nsqlite dump   1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.99s\nsqlite load   1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.29s\njsonl dump    1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.12s\njsonl load    1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.12s\njson load     1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.24s\ndeserializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.88s\nPASSED\n#+end_example\n"
  },
  {
    "path": "benchmarks/20230912.org",
    "content": "Running on @karlicoss desktop PC, =python3.10=\n\n- serializing/deserializing here refers to converting object to json-ish python dictionary (not actual json string!)\n- json dump/json load refers to converting the dict above to a json string and back\n- sqlite dump/jsonl dump refers to saving/loading these strings to a persistent storage\n\n\n#+begin_example\n$ pytest --pyargs --ignore-glob '*test_cachew*' -k marshall -s\n=========================================================== test session starts ============================================================\nplatform linux -- Python 3.10.6, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3\ncachedir: .pytest_cache\nconfigfile: pytest.ini\nplugins: anyio-3.6.2\ncollected 37 items / 8 deselected / 29 selected\n\nsrc/cachew/marshall/cachew.py::test_serialize_and_deserialize PASSED\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]\nbuilding      1000000 objects of type str | cachew.tests.marshall.Name: 0.60s\nserializing   1000000 objects of type str | cachew.tests.marshall.Name: 0.85s\njson dump     1000000 objects of type str | cachew.tests.marshall.Name: 0.46s\nsqlite dump   1000000 objects of type str | cachew.tests.marshall.Name: 1.11s\nsqlite load   1000000 objects of type str | cachew.tests.marshall.Name: 0.31s\njsonl dump    1000000 objects of type str | cachew.tests.marshall.Name: 0.13s\njsonl load    1000000 objects of type str | cachew.tests.marshall.Name: 0.13s\njson load     1000000 objects of type str | cachew.tests.marshall.Name: 1.04s\ndeserializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.86s\nPASSED\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...)\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]\nbuilding      5000000 objects of type str | cachew.tests.marshall.Name: 3.00s\nserializing   5000000 objects of type str | cachew.tests.marshall.Name: 4.38s\njson dump     5000000 objects of type str | cachew.tests.marshall.Name: 2.14s\nsqlite dump   5000000 objects of type str | cachew.tests.marshall.Name: 5.43s\nsqlite load   5000000 objects of type str | cachew.tests.marshall.Name: 1.47s\njsonl dump    5000000 objects of type str | cachew.tests.marshall.Name: 0.62s\njsonl load    5000000 objects of type str | cachew.tests.marshall.Name: 0.64s\njson load     5000000 objects of type str | cachew.tests.marshall.Name: 4.74s\ndeserializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.06s\nPASSED\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...)\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...)\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]\nbuilding      5000000 objects of type str | cachew.tests.marshall.Name: 1.77s\nserializing   5000000 objects of type str | cachew.tests.marshall.Name: 2.59s\njson dump     5000000 objects of type str | cachew.tests.marshall.Name: 1.22s\nsqlite dump   5000000 objects of type str | cachew.tests.marshall.Name: 5.28s\nsqlite load   5000000 objects of type str | cachew.tests.marshall.Name: 1.58s\njsonl dump    5000000 objects of type str | cachew.tests.marshall.Name: 0.64s\njsonl load    5000000 objects of type str | cachew.tests.marshall.Name: 0.66s\njson load     5000000 objects of type str | 
cachew.tests.marshall.Name: 1.53s\ndeserializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.60s\nPASSED\nsrc/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...)\nsrc/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]\nbuilding      1000000 objects of type <class 'datetime.datetime'>: 1.05s\nserializing   1000000 objects of type <class 'datetime.datetime'>: 1.28s\njson dump     1000000 objects of type <class 'datetime.datetime'>: 0.22s\nsqlite dump   1000000 objects of type <class 'datetime.datetime'>: 1.14s\nsqlite load   1000000 objects of type <class 'datetime.datetime'>: 0.30s\njsonl dump    1000000 objects of type <class 'datetime.datetime'>: 0.14s\njsonl load    1000000 objects of type <class 'datetime.datetime'>: 0.14s\njson load     1000000 objects of type <class 'datetime.datetime'>: 0.70s\ndeserializing 1000000 objects of type <class 'datetime.datetime'>: 2.20s\nPASSED\nsrc/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)\nsrc/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]\nbuilding      5000000 objects of type <class 'datetime.datetime'>: 5.08s\nserializing   5000000 objects of type <class 'datetime.datetime'>: 6.35s\njson dump     5000000 objects of type <class 'datetime.datetime'>: 1.13s\nsqlite dump   5000000 objects of type <class 'datetime.datetime'>: 5.58s\nsqlite load   5000000 objects of type <class 'datetime.datetime'>: 1.47s\njsonl dump    5000000 objects of type <class 'datetime.datetime'>: 0.69s\njsonl load    5000000 objects of type <class 'datetime.datetime'>: 0.70s\njson load     5000000 objects of type <class 'datetime.datetime'>: 6.85s\ndeserializing 5000000 objects of type <class 'datetime.datetime'>: 11.10s\nPASSED\nsrc/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)\nsrc/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]\nbuilding      1000000 objects of type <class 'datetime.datetime'>: 1.37s\nserializing   1000000 objects of type <class 'datetime.datetime'>: 1.25s\njson dump     1000000 objects of type <class 'datetime.datetime'>: 0.24s\nsqlite dump   1000000 objects of type <class 'datetime.datetime'>: 1.12s\nsqlite load   1000000 objects of type <class 'datetime.datetime'>: 0.29s\njsonl dump    1000000 objects of type <class 'datetime.datetime'>: 0.14s\njsonl load    1000000 objects of type <class 'datetime.datetime'>: 0.14s\njson load     1000000 objects of type <class 'datetime.datetime'>: 0.24s\ndeserializing 1000000 objects of type <class 'datetime.datetime'>: 2.17s\nPASSED\nsrc/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)\nsrc/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]\nbuilding      5000000 objects of type <class 'datetime.datetime'>: 5.10s\nserializing   5000000 objects of type <class 'datetime.datetime'>: 6.22s\njson dump     5000000 objects of type <class 'datetime.datetime'>: 1.17s\nsqlite dump   5000000 objects of type <class 'datetime.datetime'>: 5.43s\nsqlite load   5000000 objects of type <class 'datetime.datetime'>: 1.54s\njsonl dump    5000000 objects of type <class 'datetime.datetime'>: 0.70s\njsonl load    5000000 objects of type <class 'datetime.datetime'>: 0.71s\njson load     5000000 objects of type <class 'datetime.datetime'>: 1.22s\ndeserializing 5000000 objects of type <class 
'datetime.datetime'>: 10.97s\nPASSED\nsrc/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)\nsrc/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]\nbuilding      1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.64s\nserializing   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.43s\njson dump     1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.30s\nsqlite dump   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.16s\nsqlite load   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.30s\njsonl dump    1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s\njsonl load    1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s\njson load     1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.02s\ndeserializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 2.78s\nPASSED\nsrc/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]\nbuilding      1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.88s\nserializing   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.80s\njson dump     1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.31s\nsqlite dump   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.39s\nsqlite load   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.31s\njsonl dump    1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s\njsonl load    1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s\njson load     1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.03s\ndeserializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 2.61s\nPASSED\nsrc/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]\nbuilding      1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.57s\nserializing   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.08s\njson dump     1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.29s\nsqlite dump   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.09s\nsqlite load   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.30s\njsonl dump    1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s\njsonl load    1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s\njson load     1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.50s\ndeserializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.43s\nPASSED\nsrc/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]\nbuilding      
1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.57s\nserializing   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.39s\njson dump     1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.29s\nsqlite dump   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.16s\nsqlite load   1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.32s\njsonl dump    1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.16s\njsonl load    1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s\njson load     1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.50s\ndeserializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.29s\nPASSED\n\n============================================================ slowest durations =============================================================\n44.87s call     src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]\n38.76s call     src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]\n28.65s call     src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]\n20.05s call     src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]\n9.82s call     src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]\n9.51s call     src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]\n8.37s call     src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]\n8.20s call     src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]\n6.45s call     src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]\n5.93s call     src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]\n5.78s call     src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]\n3.98s call     src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]\n0.01s call     src/cachew/marshall/cachew.py::test_serialize_and_deserialize\n\n(68 durations < 0.005s hidden.  
Use -vv to show these durations.)\n========================================================= short test summary info ==========================================================\nSKIPPED [6] src/cachew/tests/marshall.py:171: TODO need to adjust the handling of Union types..\nSKIPPED [4] src/cachew/tests/marshall.py:194: TODO support datetime with pytz for cattrs\nPASSED src/cachew/marshall/cachew.py::test_serialize_and_deserialize\nPASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]\nPASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]\nPASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]\n#+end_example\n"
  },
  {
    "path": "benchmarks/20230917.org",
    "content": "Running on @karlicoss desktop PC, =python3.10=\n\nJust a comparison of =sqlite= and =file= backends.\n\n#+begin_example\n$ pytest --pyargs -k 'test_many and gc_off and 3000000' -s\nsrc/cachew/tests/test_cachew.py::test_many[sqlite-gc_off-3000000] [INFO    2023-09-17 02:02:09,946 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: wrote   3000000 objects to   cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many)\ntest_many: initial write to cache took 13.6s\ntest_many: cache size is 229.220352Mb\n[INFO    2023-09-17 02:02:10,780 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: loading 3000000 objects from cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many)\ntest_many: reading from cache took 7.0s\nPASSED\nsrc/cachew/tests/test_cachew.py::test_many[file-gc_off-3000000] [INFO    2023-09-17 02:02:23,944 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: wrote   3000000 objects to   cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many)\ntest_many: initial write to cache took 6.1s\ntest_many: cache size is 202.555667Mb\n[INFO    2023-09-17 02:02:23,945 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: loading objects from cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many)\ntest_many: reading from cache took 5.4s\n#+end_example\n"
  },
  {
    "path": "doc/cachew_disable.md",
    "content": "Can put this in the README.md once its been tested a bit\n\n### Disable through Environment Variables\n\nTo disable a `cachew` function in some module, you can use the `CACHEW_DISABLE` environment variable. This is a colon-delimited (like a `$PATH`) list of modules to disable. It disables modules given some name recursively, and supports [unix-style globs](https://docs.python.org/3/library/fnmatch.html)\n\nFor example, say you were using [HPI](https://github.com/karlicoss/HPI) which internally uses a snippet like `mcachew` above. You may want to enable `cachew` for _most_ modules, but disable them for specific ones. For example take:\n\n```\nmy/browser\n├── active_browser.py\n├── all.py\n├── common.py\n└── export.py\nmy/reddit\n├── __init__.py\n├── all.py\n├── common.py\n├── pushshift.py\n└── rexport.py\n```\n\nTo disable `cachew` in all of these files: `export CACHEW_DISABLE=my.browser:my.reddit` (disables for all submodules)\n\nTo disable just for a particular module: `export CACHEW_DISABLE='my.browser.export'`\n\nSimilarly to `$PATH` manipulations, you can do this in your shell configuration incrementally:\n\n```\nCACHEW_DISABLE='my.reddit.rexport'\nif some condition...; then\n    CACHEW_DISABLE=\"my.browser.export:$CACHEW_DISABLE\"\nfi\nexport CACHEW_DISABLE\n```\n\nYou can also use globs, e.g. `CACHEW_DISABLE='my.*.gdpr`\n\nTo disable `cachew` everywhere, you could set `export CACHEW_DISABLE='*'`\n"
  },
  {
    "path": "doc/serialization.org",
    "content": "Cachew works kinda like =functools.lru_cache=, but it also works in-between program runs.\nFor that, it needs to somehow persist the objects on the disk (unlike =lru_cache= which just keeps references to the objects already in process memory).\n\nWhile persisting objects to the cache, essentially cachew needs to map them into simpler types, i.e. ones you can keep in a database like strings/ints/binary blobs.\n\nAt the moment (as of =v0.13.0=), we use sqlite as the cache store, with =sqlalchemy= as the interface to interact with it.\n\nThe way cachew works now is, to save the object in cache:\n\n- first it's \"flattened out\" to conform to the database row model, so individual fields (including recursive fields) become database columns\n- python types are mapped into sqlalchemy types, with extra =sqlalchemy.TypeDecorator= instances to support custom types like =datetime= or =Exception=\n\nYou can find a more detailed example [[https://github.com/karlicoss/cachew/blob/175afade0a417bfd533ced174365d246b8a7dabc/src/cachew/__init__.py#L319-L353][here]].\n\nA big problem is that in general it's not really possible to serialize, and especially to deserialize back an arbitrary object in Python, unless you resort to binary serialization like =pickle= (which is very slow and comes with its own hosts of issues).\n\nHowever in cachew we require the user to supply the *type signature* for the functions that are cached, so we can benefit from it for serializing and deserializing.\n\nFew years ago, when I implemented =cachew= at first, there weren't really many options for serialization driven by type signatures, so I implemented the custom code I mentioned above to support that. In 2023, however, more and more libraries are benefiting from type signatures, in particular for serializing stuff.\n\nSo I decided to give it another go, in hope of using some mature library, simplifying cachew's code, and possibly getting a perfromance boost.\nIt's possible that I missed some documentation so if you think the problems I am describing can actually be worked around, please don't hesitate to let me know.\n\n* Comparison\n\nIn cachew the very minimum we're aiming to support are:\n\n- all json-ish types, e.g. =int=/=str=/=dict=/=list= etc\n- =dataclass= and =NamedTuple=\n- =Optional= and =Union=\n- custom types, e.g. =datetime=, =Exception= (e.g. 
at least preserve exception message)\n\nSee [[file:test_serialization.py]] for more specific examples and supporting evidence for my summary here.\n\n** [[https://docs.python.org/3.10/library/pickle.html][pickle]]\nBuiltin pickle module can handle any objects, without even needing type annotations.\n\nHowever, it's [[https://www.benfrederickson.com/dont-pickle-your-data/][famously very slow]], so I even didn't consider using it.\n\nIt's also not secure in general, although in our case we control the objects we save/load from cache, so it's not a big issue.\n\n** [[https://github.com/jsonpickle/jsonpickle#readme][jsonpickle]]\nJsonpickle -- similar to pickle, can handle any types.\n\nI [[https://github.com/karlicoss/cachew/commit/048df33e65560205d63845f022b027a27719ff48][gave it a go]] just in case, and it's an order of magnitude slower than custom serialization code I already had, which is a no-go.\n\n** [[https://github.com/lidatong/dataclasses-json/#readme][dataclasses-json]]\n# TODO link to code\n- CON: requires annotating all dataclasses involved with =@dataclass_json=, recursively.\n  This is a blocker from using it in =cachew=.\n- CON: requires the type to be a =@dataclass= to annotate\n  So if you have something simpler you'll have to wrap it into a dummy dataclass or something.\n- PRO: supports =Union= correctly\n\n** [[https://github.com/marshmallow-code/marshmallow][marshmallow]]\n\nBy default marshmallow doesn't support dataclasses or unions, but there are some extra packages\n\n- for dataclasses https://github.com/lovasoa/marshmallow_dataclass\n  - PRO: doesn't require modifying the original class, handles recursion out of the box\n  - CON: doesn't handle =Union= correctly\n    This is a blocker for cachew.\n    In addition it has a custom implementation of Union handling (rather than e.g. relying on =python-marshmallow-union=).\n- https://github.com/adamboche/python-marshmallow-union\n  I didn't even get to try it since if dataclasses don't work marshmallow is a no-go for me.\n  Plus for some reason =marshmallow_dataclass= has a custom Union handling implementation which is different from this one, so it's going to be a huge mess.\n\n** [[https://github.com/pydantic/pydantic#readme][pydantic]]\n- PRO: if you use =TypeAdapter=, you can serialize/deserialize arbitrary types without decorating/inheriting from =BaseModel=\n- CON: doesn't handle =Union= correctly\n  Again, this is a bit blocker. 
I've created an issue on pydantic bug tracker here: https://github.com/pydantic/pydantic/issues/7391\n\n  Kind of sad, because otherwise pydantic seemed promising!\n\n** [[https://github.com/python-attrs/cattrs#features][cattrs]]\n- PRO: doesn't require modifying the classes you serialise\n- PRO: rich feature set, clearly aiming to comply with standard python's typing annotations\n- CON: there is an issue with handling =NamedTuple=\n\n  It isn't converted to a dictionary like =dataclass= does, [[https://github.com/python-attrs/cattrs/issues/425][likely a bug]]?\n- =Union= types are supported, but require some extra configuration\n\n  Unions work, but you have to 'register' them first.\n  A bit annoying that this is necessary even for simple unions like =int | str=, although [[https://github.com/python-attrs/cattrs/issues/423][possible]] to workaround.\n\n  The plus side is that cattr has a builtin utility for Union type discrimination.\n\n  I guess for my application I could traverse the type and register all necessary Unions with =catrrs=?\n  # TODO create an issue to support opting in everywhere by default?\n\n\nSince the above seems quite good, I did a quick cachew hack on [[https://github.com/karlicoss/cachew/tree/cattrs][cattrs branch]] to try and use it.\n\nThe pipeline is the following:\n- serialize type to a dictionary with primitive types via =cattrs=\n- serialize dictionary to a byte string via =orjson=\n- persist the byte string as an sqlite database row\n\n(for deserializing we just do the same in reverse)\n\nYou can find the results [[https://github.com/karlicoss/cachew/commit/82691b10cd1d4ced4862dff21cf038fb83f9525c][here]] -- cattrs proved to be quite a huge speedup over my custom serialization code!\n\nIt needs a bit more work and evaluation for use in =cachew=, however it's super promising!\n\n# TODO https://catt.rs/en/stable/preconf.html#orjson\n\nSome interesting reading about cattrs:\n- https://threeofwands.com/why-cattrs-is-so-fast/#v2-the-genconverter\n- https://threeofwands.com/why-i-use-attrs-instead-of-pydantic\n\n* Verdict\n\nThe biggest shared issues are that most of this libraries:\n- require modifying the original class definitions, either by inheriting or decorating\n- don't handle =Union= at all or don't handle it corectly (usually relying on the structural equivalence rather than actual types)\n\nSo for most of them, I even didn't get to trying to support custom types and measuing performance with =cachew=.\n\nOf all of them only =cattrs= stood out, it takes builtin python typing and performance very seriously, and very configurable.\nSo if you need no bullshit serialization in python, I can definitely recommend it.\nI might switch to it in [[https://github.com/karlicoss/promnesia][promnesia]] (where we have full control over the type we serialize in the database), and could potentially be used in HPI for [[https://github.com/karlicoss/HPI/blob/master/my/core/serialize.py][my.core.serialize]].\n"
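\n* Pipeline sketch\n\nA minimal sketch of the cattrs + orjson + sqlite pipeline described above. This is just an illustration (the =Name= dataclass and the table layout are made up for the example), not the actual code from the cattrs branch:\n\n#+begin_src python\nimport sqlite3\nfrom dataclasses import dataclass\n\nimport orjson\nfrom cattrs import Converter\n\n\n@dataclass\nclass Name:\n    first: str\n    last: str\n\n\nconverter = Converter()\n\n\ndef dump(obj: Name) -> bytes:\n    # object -> dictionary of primitive types -> json byte string\n    return orjson.dumps(converter.unstructure(obj))\n\n\ndef load(blob: bytes) -> Name:\n    # json byte string -> dictionary of primitive types -> object\n    return converter.structure(orjson.loads(blob), Name)\n\n\nwith sqlite3.connect(':memory:') as conn:\n    conn.execute('CREATE TABLE cache (data BLOB)')\n    conn.execute('INSERT INTO cache VALUES (?)', (dump(Name(first='John', last='Smith')),))\n    [(blob,)] = conn.execute('SELECT data FROM cache')\n    assert load(blob) == Name(first='John', last='Smith')\n#+end_src\n"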
  },
  {
    "path": "doc/test_serialization.py",
    "content": "#!/usr/bin/env python3\nfrom dataclasses import dataclass\nfrom typing import NamedTuple, Union\n\n\ndef test_dataclasses_json():\n    # pip install dataclasses-json\n    from dataclasses_json import dataclass_json\n\n    @dataclass\n    class Inner:\n        value: int\n\n    @dataclass\n    class Outer:\n        inner: Inner\n\n    ### issue 1: requires @dataclass_json annotation on all involved dataclasses\n    obj = Outer(inner=Inner(value=123))  # noqa: F841\n\n    # we don't control the types that are passed to us, so we can't use the @dataclass_json\n    # but we can just call the decorator directly\n\n    # HOWEVER: this modifies the original class, Outer!!\n    OuterJson = dataclass_json(Outer)  # noqa: F841\n    # it adds 'from_dict', 'from_json', 'schema', 'to_dict', 'to_json' attributes to it\n\n    # now if you try\n    # print(OuterJson.schema().dump(obj))\n    # you get a warning that it wants you to add annotations to Inner classes too.\n    # this isn't really an option for us.\n    ###\n\n    ### issue 2: can't dump anything unless the top level type is a dataclass?\n    ### could wrap into a dummy dataclass or something, but is wasteful in terms of performance\n    ###\n\n    ### nice thing: correctly serializes Union types, even if they share the same attributes\n    @dataclass_json\n    @dataclass\n    class City:\n        name: str\n\n    @dataclass_json\n    @dataclass\n    class Country:\n        name: str\n\n    @dataclass_json\n    @dataclass\n    class WithUnion:\n        union: Union[City, Country]  # noqa: UP007\n\n    objs = [\n        WithUnion(union=City(name='London')),\n        WithUnion(union=Country(name='UK')),\n    ]\n\n    schema = WithUnion.schema()\n    json = schema.dumps(objs, many=True)\n    objs2 = schema.loads(json, many=True)\n    print(\"objects  \", objs)\n    print(\"json     \", json)\n    # NOTE: it dumps [{\"union\": {\"name\": \"London\", \"__type\": \"City\"}}, {\"union\": {\"name\": \"UK\", \"__type\": \"Country\"}}]\n    # so types are correctly distinguished\n    print(\"restored \", objs2)\n    assert objs == objs2, (objs, objs2)\n    ###\n\n\ndef test_marshmallow_dataclass():\n    # pip3 install --user marshmallow-dataclass[union]\n    import marshmallow_dataclass\n\n    ### issue 1: the top level type has to be a dataclass?\n    ### although possible that we could use regular marshmallow for that instead\n    ###\n\n    ### issue 2: doesn't handle unions correctly\n    @dataclass\n    class City:\n        name: str\n\n    @dataclass\n    class Country:\n        name: str\n\n    @dataclass\n    class WithUnion:\n        union: Union[City, Country]  # noqa: UP007\n\n    objs = [\n        WithUnion(union=City(name=\"London\")),\n        WithUnion(union=Country(name=\"UK\")),\n    ]\n\n    # NOTE: good, doesn't require adding annotations on the original classes\n    schema = marshmallow_dataclass.class_schema(WithUnion)()\n\n    json = schema.dumps(objs, many=True)\n    objs2 = schema.loads(json, many=True)\n    print(\"objects  \", objs)\n    print(\"json     \", json)\n    # NOTE: it dumps [{\"union\": {\"value\": 123}}, {\"union\": {\"value\": 123}}]\n    # so it doesn't distingush based on types => won't deserialize correctly\n    print(\"restored \", objs2)\n    # assert objs == objs2, (objs, objs2)\n    # ^ this assert fails!\n    ###\n\n\ndef test_pydantic():\n    from pydantic import TypeAdapter\n\n    ### issue: doesn't handle Unions correctly\n    @dataclass\n    class City:\n        name: str\n\n    
@dataclass\n    class Country:\n        name: str\n\n    @dataclass\n    class WithUnion:\n        union: Union[City, Country]  # noqa: UP007\n\n    objs = [\n        WithUnion(union=City(name=\"London\")),\n        WithUnion(union=Country(name=\"UK\")),\n    ]\n\n    # NOTE: nice, doesn't require annotating the original classes with anything\n    Schema = TypeAdapter(list[WithUnion])\n\n    json = Schema.dump_python(\n        objs,\n        # round_rtip: Whether to output the serialized data in a way that is compatible with deserialization\n        # not sure, doesn't seem to impact anything..\n        round_trip=True,\n    )\n    objs2 = Schema.validate_python(json)\n\n    print(\"objects  \", objs)\n    print(\"json     \", json)\n    print(\"restored \", objs2)\n\n    # assert objs == objs2, (objs, objs2)\n    # ^ this assert fails!\n    # created an issue https://github.com/pydantic/pydantic/issues/7391\n    ###\n\n\ndef test_cattrs():\n    from cattrs import Converter\n    from cattrs.strategies import configure_tagged_union\n\n    converter = Converter()\n\n    ### issue: NamedTuples aren't unstructured? asked here https://github.com/python-attrs/cattrs/issues/425\n    class X(NamedTuple):\n        value: int\n\n    d = converter.unstructure(X(value=123), X)  # noqa: F841\n    # NOTE: this assert doesn't pass!\n    # assert isinstance(d, dict)\n    ###\n\n    ### good: handles Union correctly (although some extra configuring required)\n    @dataclass\n    class City:\n        name: str\n\n    @dataclass\n    class Country:\n        name: str\n\n    @dataclass\n    class WithUnion:\n        union: Union[City, Country]  # noqa: UP007\n\n    objs = [\n        WithUnion(union=City(name=\"London\")),\n        WithUnion(union=Country(name=\"UK\")),\n    ]\n\n    configure_tagged_union(\n        union=City | Country,\n        converter=converter,\n    )\n    # NOTE: nice -- doesn't require decorating original classes\n    json = converter.unstructure(objs, list[WithUnion])\n    assert isinstance(json, list)\n    objs2 = converter.structure(json, list[WithUnion])\n\n    print(\"objects  \", objs)\n    # NOTE: dumps it as [{'union': {'name': 'London', '_type': 'City'}}, {'union': {'name': 'UK', '_type': 'Country'}}]\n    print(\"json     \", json)\n    print(\"restored \", objs2)\n\n    assert objs == objs2, (objs, objs2)\n    ###\n\n    ### issue: unions of simple types aren't supported?\n    # see https://github.com/python-attrs/cattrs/issues/423\n    mixed: list[int | str] = [\n        123,\n        'Jakarta',\n    ]\n    json = converter.unstructure(mixed, list[int | str])\n    # NOTE: this fails\n    # mixed2 = converter.structure(json , list[int | str])\n    ###\n\n\ntest_dataclasses_json()\ntest_marshmallow_dataclass()\ntest_pydantic()\ntest_cattrs()\n"
  },
  {
    "path": "generate-readme",
    "content": "#!/bin/bash\nset -eu\n\ncd \"$(dirname \"$0\")\"\n\n# --no-input seems to work well\n# but if need more targeted approach, pparently can mark certain cells with tag and use '--TagRemovePreprocessor.remove_cell_tags={\"noexport\"}' ?\nexec uvx --with jupyter --from jupyter-core jupyter nbconvert --execute --to markdown --no-input README.ipynb\n\n# TODO run it on CI to make sure it renders and up to date?\n"
  },
  {
    "path": "github-issues.org",
    "content": "#+todo: OPEN | CLOSED\n* Issues of cachew\n:PROPERTIES:\n:since:    \n:url:      https://api.github.com/repos/karlicoss/cachew\n:END:\n** OPEN keep hash along each cached entity instead of separate table?\n:PROPERTIES:\n:tags:     (\"prio-B\")\n:id:       15\n:date-modification: 2020-01-08T22:26:04+0000\n:date-creation: 2020-01-08T22:26:04+0000\n:author:   \"karlicoss\"\n:END:\n: At the moment there are two separate tables: one for latest hash value, another for cached entities.\n: It might be simpler and safer to keep a single table, with hash along with each cached entity.\n: \n** OPEN support multiple cached values?\n:PROPERTIES:\n:tags:     (\"prio-B\")\n:id:       14\n:date-modification: 2020-01-08T22:26:03+0000\n:date-creation: 2020-01-08T22:26:02+0000\n:author:   \"karlicoss\"\n:END:\n: At the moment it's LRU(1) cache, it some usecases it makes sense to cache more values though\n: \n** OPEN support pathlib.Path\n:PROPERTIES:\n:tags:     (\"prio-C\")\n:id:       13\n:date-modification: 2020-01-08T22:26:02+0000\n:date-creation: 2020-01-08T22:26:01+0000\n:author:   \"karlicoss\"\n:END:\n: Path is a trivial wrapper around str. I guess generally think of a good way to allow adhoc mapping of simple types.\n: Perhaps current Exception makes sense.\n: \n** OPEN support defensive behaviour\n:PROPERTIES:\n:tags:     (\"prio-C\")\n:id:       12\n:date-modification: 2020-01-08T22:26:01+0000\n:date-creation: 2020-01-08T22:26:00+0000\n:author:   \"karlicoss\"\n:END:\n: E.g. if we can't serialize for some reason, bail the database but at least yield values anyway\n: \n** OPEN Add Redis support\n:PROPERTIES:\n:id:       9\n:date-modification: 2020-01-06T00:48:59+0000\n:date-creation: 2020-01-06T00:48:59+0000\n:author:   \"softinio\"\n:END:\n: Add Redis support as an alternative to sqlite\n: \n: This would be a great feature as it will make this solution easier to use in an enterprise production environment as getting a redis instance shared amonst multiple instances of your app is very easy and cost effective to use.\n: \n** OPEN better pytz support?\n:PROPERTIES:\n:tags:     (\"prio-C\")\n:id:       6\n:date-modification: 2020-01-05T13:34:51+0000\n:date-creation: 2020-01-05T13:33:25+0000\n:author:   \"karlicoss\"\n:END:\n** CLOSED Optional feature: Exception support\n:PROPERTIES:\n:id:       11\n:date-modification: 2020-01-08T21:56:56+0000\n:date-creation: 2020-01-08T21:34:03+0000\n:author:   \"karlicoss\"\n:END:\n** CLOSED Add doc on defensive/optional usage\n:PROPERTIES:\n:id:       10\n:date-modification: 2020-01-06T23:48:54+0000\n:date-creation: 2020-01-06T23:47:39+0000\n:author:   \"karlicoss\"\n:END:\n** CLOSED Safer concurrent writes handling\n:PROPERTIES:\n:id:       8\n:date-modification: 2020-01-05T22:32:13+0000\n:date-creation: 2020-01-05T22:08:24+0000\n:author:   \"karlicoss\"\n:END:\n** CLOSED Update readme\n:PROPERTIES:\n:id:       7\n:date-modification: 2020-01-05T15:29:37+0000\n:date-creation: 2020-01-05T15:24:38+0000\n:author:   \"karlicoss\"\n:END:\n** CLOSED support for dataclasses\n:PROPERTIES:\n:id:       1\n:date-modification: 2020-01-05T13:34:50+0000\n:date-creation: 2019-07-30T21:45:30+0100\n:author:   \"karlicoss\"\n:END:\n** CLOSED Fix Json support for python3.6\n:PROPERTIES:\n:id:       2\n:date-modification: 2020-01-05T13:33:28+0000\n:date-creation: 2019-12-08T12:21:58+0000\n:author:   \"karlicoss\"\n:END:\n** CLOSED Fix bug when default argument is explicitly specified\n:PROPERTIES:\n:id:       3\n:date-modification: 
2020-01-05T13:33:27+0000\n:date-creation: 2019-12-08T17:56:51+0000\n:author:   \"karlicoss\"\n:END:\n** CLOSED Union types\n:PROPERTIES:\n:id:       4\n:date-modification: 2020-01-05T13:33:27+0000\n:date-creation: 2019-12-19T23:32:55+0000\n:author:   \"karlicoss\"\n:END:\n** CLOSED support top level primitive types\n:PROPERTIES:\n:id:       5\n:date-modification: 2020-01-05T13:33:26+0000\n:date-creation: 2019-12-20T00:09:00+0000\n:author:   \"karlicoss\"\n:END:\n"
  },
  {
    "path": "misc/profile.py",
    "content": "#!/usr/bin/env python3\nimport sqlite3\nfrom collections.abc import Iterator\nfrom pathlib import Path\n\nimport sqlalchemy\nfrom codetiming import Timer\nfrom more_itertools import ilen\n\nfrom cachew import cachew\n\n# todo not sure it really helps much?\nimport gc  # isort: skip\n\ngc.disable()\n\n\ndef timer(name: str) -> Timer:\n    return Timer(name=name, text=name + ': ' + '{:.2f}s')\n\n\ndef test_ints() -> None:\n    N = 5_000_000\n\n    base = Path('/tmp/cachew_profiling/')\n    # shutil.rmtree(base)\n    base.mkdir(exist_ok=True, parents=True)\n\n    cache_path = base / 'ints'\n\n    def fun_nocachew(n) -> Iterator[int]:\n        yield from range(n)\n\n    @cachew(cache_path=cache_path, force_file=True)\n    def fun(n) -> Iterator[int]:\n        yield from range(n)\n\n    # with timer('no caching'):\n    #     ilen(fun_nocachew(N))\n\n    # with timer('initial call'):\n    #     ilen(fun(N))\n\n    assert cache_path.exists()  # just in case\n    with timer('reading directly via sqlite'):\n        total = 0\n        with sqlite3.connect(cache_path) as conn:\n            for (_x,) in conn.execute('SELECT * FROM cache'):\n                total += 1\n        assert total == N  # just in case\n\n    with timer('reading directly via sqlalchemy'):\n        total = 0\n        engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}')\n\n        from sqlalchemy import Column, MetaData, Table\n\n        meta = MetaData()\n        table_cache = Table('cache', meta, Column('_cachew_primitive', sqlalchemy.Integer))\n        with engine.connect() as conn:\n            with timer('sqlalchemy querying'):\n                rows = conn.execute(table_cache.select())\n                for (_x,) in rows:\n                    total += 1\n        engine.dispose()\n        assert total == N  # just in case\n\n    cache_size_mb = cache_path.stat().st_size / 10**6\n    print(f'cache size: {cache_size_mb:.1f} Mb')\n\n    with timer('subsequent call'):\n        ilen(fun(N))\n\n\ntest_ints()\n"
  },
  {
    "path": "misc/test_redis/docker-compose.yml",
    "content": "services:\n  redis:\n    image: \"redis:alpine\"\n    # restart: always\n    command:\n      - \"sh\"\n      - \"-euc\"\n      - |\n        exec redis-server\n      # - |\n      #   echo \"requirepass '$$REDIS_PASSWORD'\" > /etc/redis.conf\n      #   exec redis-server /etc/redis.conf\n    # environment:\n    #   REDIS_PASSWORD: \"password\"\n    ports:\n      - 6379:6379\n    volumes:\n      - \"redis-cachew:/data:rw\"\n\nvolumes:\n  redis-cachew:\n"
  },
  {
    "path": "misc/test_redis/test.py",
    "content": "#!/usr/bin/env python3\nfrom time import time\n\nimport redis  # ty: ignore[unresolved-import]\nfrom loguru import logger  # ty: ignore[unresolved-import]\nfrom more_itertools import ilen\n\nr = redis.Redis(host='localhost', port=6379, db=0)\n\n\nN = 1_000_000\n\n\ndef items():\n    yield from map(str, range(N))\n\n\nTAG = 'keys'\n\n\ndef reset():\n    r.delete(TAG)\n\n\ndef write():\n    for i, obj in enumerate(items()):\n        key = f'obj:{i}'\n        r.hset(key, 'data', obj)\n        r.lpush(TAG, key)\n\n\ndef read():\n    keys = r.lrange(TAG, 0, -1)\n    result = (r.hget(key, 'data') for key in keys)\n    print('total', ilen(result))\n\n\n# TODO could use lmove for atomic operations?\ndef write2():\n    for obj in items():\n        r.lpush(TAG, obj)\n\n\ndef read2():\n    result = r.lrange(TAG, 0, -1)\n    print('total', ilen(result))\n\n\nreset()\n\na = time()\nwrite2()\nb = time()\nlogger.info(f'writing took {b - a:.1f}s')\n\na = time()\nread2()\nb = time()\nlogger.info(f'reading took {b - a:.1f}s')\n\n\n# with read()/write()\n# 100000 strings:\n# 2023-09-09 01:50:23.498 | INFO     | __main__:<module>:37 - writing took 13.1s\n# 2023-09-09 01:50:30.052 | INFO     | __main__:<module>:42 - reading took 6.6s\n# hmm kinda slow..\n\n\n# with read2/write2, writing about 7secs, and reading is instantaneous??\n# for 1M objects, writing took 60 secs, and reading 0.2s?\n# lol could be promising...\n# I guess it's not iterative, but could retrieve items in batches?\n"
  },
  {
    "path": "mypy.ini",
    "content": "[mypy]\npretty = True\nshow_error_context = True\nshow_column_numbers = True\nshow_error_end = True\n\ncheck_untyped_defs = True\n\n# see https://mypy.readthedocs.io/en/stable/error_code_list2.html\nwarn_redundant_casts = True\nstrict_equality = True\nwarn_unused_ignores = True\nenable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable\n\n\n# an example of suppressing\n# [mypy-my.config.repos.pdfannots.pdfannots]\n# ignore_errors = True\n"
  },
  {
    "path": "pyproject.toml",
    "content": "# see https://github.com/karlicoss/pymplate for up-to-date reference\n[project]\ndynamic = [\"version\"]  # version is managed by build backend\nname = \"cachew\"\ndependencies = [\n    \"platformdirs\",     # default cache dir\n    \"sqlalchemy>=1.0\",  # cache DB interaction\n    \"orjson\",           # fast json serialization\n    \"typing-extensions\",# for depreceated decorator\n]\nrequires-python = \">=3.12\"\n\n## these need to be set if you're planning to upload to pypi\n# description = \"TODO\"\nlicense = {file = \"LICENSE.txt\"}\nauthors = [\n    {name = \"Dima Gerasimov (@karlicoss)\", email = \"karlicoss@gmail.com\"},\n]\nmaintainers = [\n    {name = \"Dima Gerasimov (@karlicoss)\", email = \"karlicoss@gmail.com\"},\n]\n# keywords = []\n# # see: http://pypi.python.org/pypi?%3Aaction=list_classifiers\n# classifiers = [\n# ]\n\n\n[project.urls]\nHomepage = \"https://github.com/karlicoss/cachew\"\n##\n\n\n[project.optional-dependencies]\noptional = [\n    \"colorlog\",\n]\n\n[dependency-groups]\n# TODO: not sure, on the one hand could just use 'standard' dev dependency group\n# On the other hand, it's a bit annoying that it's always included by default? \n# To make sure it's not included, need to use `uv run --exact --no-default-groups ...`\ntesting = [\n    \"pytest>=9\",  # need version 9 for proper namespace package support\n    \"ruff\",\n\n    \"pytz\",\n\n    \"more-itertools\",\n    \"patchy\",  # for injecting sleeps and testing concurrent behaviour\n    \"enlighten\",  # used in logging helper, but not really required\n    \"cattrs\",  # benchmarking alternative marshalling implementation\n    \"pyinstrument\",  # for profiling from within tests\n    \"codetiming\", # Timer context manager\n]\ntypecheck = [\n    { include-group = \"testing\" },\n    \"mypy\",\n    \"lxml\", # for mypy html coverage\n    \"ty>=0.0.3\",\n\n    \"types-pytz\",  # optional runtime only dependency\n\n    \"cachew[optional]\",\n]\n\n\n[build-system]\nrequires = [\"hatchling\", \"hatch-vcs\"]\nbuild-backend = \"hatchling.build\"\n\n# unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894\n[tool.hatch.build.targets.wheel]\npackages = [\"src/cachew\"]\n\n[tool.hatch.version]\nsource = \"vcs\"\n\n[tool.hatch.version.raw-options]\nversion_scheme = \"python-simplified-semver\"\nlocal_scheme = \"dirty-tag\"\n"
  },
  {
    "path": "pytest.ini",
    "content": "[pytest]\n# discover files that don't follow test_ naming. Useful to keep tests along with the source code\npython_files = *.py\n\n# this is necessary for --pyargs to discover implicit namespace packages correctly\nconsider_namespace_packages = true\n\n# see https://docs.pytest.org/en/stable/reference/reference.html#confval-strict\n# disable for now -- some macos tests ('file backend') are flaky\n# strict = true\n\naddopts =\n  # prevent pytest cache from being created... it craps into project dir and I never use it anyway\n  -p no:cacheprovider\n\n  # -rap to print tests summary even when they are successful\n  -rap\n  --verbose\n\n  # otherwise it won't discover doctests\n  --doctest-modules\n\n  # show all test durations (unless they are too short)\n  --durations=0\n"
  },
  {
    "path": "ruff.toml",
    "content": "line-length = 120  # impacts import sorting\n\nlint.extend-select = [\n    \"ALL\",\n]\n\nlint.ignore = [\n    \"D\",     # annoying nags about docstrings\n    \"N\",     # pep naming\n    \"TCH\",   # type checking rules, mostly just suggests moving imports under TYPE_CHECKING\n    \"S\",     # bandit (security checks) -- tends to be not very useful, lots of nitpicks\n    \"DTZ\",   # datetimes checks -- complaining about missing tz and mostly false positives\n    \"FIX\",   # complains about fixmes/todos -- annoying\n    \"TD\",    # complains about todo formatting -- too annoying\n    \"ANN\",   # missing type annotations? seems way to strict though\n    \"EM\" ,   # suggests assigning all exception messages into a variable first... pretty annoying\n\n### too opinionated style checks\n    \"E501\",  # too long lines\n    \"E731\",  # assigning lambda instead of using def\n    \"E741\",  # Ambiguous variable name: `l`\n    \"E742\",  # Ambiguous class name: `O\n    \"E401\",  # Multiple imports on one line\n    \"F403\",  # import *` used; unable to detect undefined names\n###\n\n###\n    \"E722\",  # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing..\n    \"F811\",  # Redefinition of unused  # this gets in the way of pytest fixtures (e.g. in cachew)\n\n## might be nice .. but later and I don't wanna make it strict\n    \"E402\",  # Module level import not at top of file\n\n### these are just nitpicky, we usually know better\n    \"PLR0911\",  # too many return statements\n    \"PLR0912\",  # too many branches\n    \"PLR0913\",  # too many function arguments\n    \"PLR0915\",  # too many statements\n    \"PLR1714\",  # consider merging multiple comparisons\n    \"PLR2044\",  # line with empty comment\n    \"PLR5501\",  # use elif instead of else if\n    \"PLR2004\",  # magic value in comparison -- super annoying in tests\n###\n    \"PLR0402\",  # import X.Y as Y -- TODO maybe consider enabling it, but double check\n\n    \"B009\",  # calling gettattr with constant attribute -- this is useful to convince mypy\n    \"B010\",  # same as above, but setattr\n    \"B017\",  # pytest.raises(Exception)\n    \"B023\",  # seems to result in false positives?\n\n    # complains about useless pass, but has sort of a false positive if the function has a docstring?\n    # this is common for click entrypoints (e.g. in __main__), so disable\n    \"PIE790\",\n\n    # a bit too annoying, offers to convert for loops to list comprehension\n    # , which may heart readability\n    \"PERF401\",\n\n    # suggests no using exception in for loops\n    # we do use this technique a lot, plus in 3.11 happy path exception handling is \"zero-cost\"\n    \"PERF203\",\n\n    \"RET504\", # unnecessary assignment before returning -- that can be useful for readability\n    \"RET505\", # unnecessary else after return -- can hurt readability\n\n    \"PLW0603\",  # global variable update.. 
we usually know why we are doing this\n    \"PLW2901\",  # for loop variable overwritten, usually this is intentional\n\n    \"PT011\",  # pytest raises is too broad\n\n    \"COM812\",  # trailing comma missing -- mostly just being annoying with long multiline strings\n\n    \"TRY003\",  # suggests defining exception messages in exception class -- kinda annoying\n    \"TRY201\",  # raise without specifying exception name -- sometimes hurts readability\n    \"TRY400\",  # a bit dumb, and results in false positives (see https://github.com/astral-sh/ruff/issues/18070)\n    \"TRY401\",  # redundant exception in logging.exception call? TODO double check, might result in excessive logging\n\n    \"TID252\",  # Prefer absolute imports over relative imports from parent modules\n\n    ## too annoying\n    \"T20\",     # just complains about prints and pprints (TODO maybe consider later?)\n    \"Q\",       # flake quotes, too annoying\n    \"C90\",     # some complexity checking\n    \"G004\",    # logging statement uses f string\n    \"ERA001\",  # commented out code\n    \"SLF001\",  # private member accessed\n    \"BLE001\",  # do not catch 'blind' Exception\n    \"INP001\",  # complains about implicit namespace packages\n    \"SIM102\",  # if statements collapsing, often hurts readability\n    \"SIM103\",  # multiple conditions collapsing, often hurts readability\n    \"SIM105\",  # suggests using contextlib.suppress instad of try/except -- this wouldn't be mypy friendly\n    \"SIM108\",  # suggests using ternary operation instead of if -- hurts readability\n    \"SIM110\",  # suggests using any(...) instead of for look/return -- hurts readability\n    \"SIM117\",  # suggests using single with statement instead of nested -- doesn't work in tests\n    \"RSE102\",  # complains about missing parens in exceptions\n    ##\n\n    \"PLC0415\", # \"imports should be at the top level\" -- not realistic\n\n    \"ISC001\",  # implicit string concatenation -- we do use it in tests\n]\n\n\nextend-exclude = [\n    \"src/cachew/legacy.py\",  # TODO dunno, remove it for good?\n]\n"
  },
  {
    "path": "src/cachew/__init__.py",
    "content": "import fnmatch\nimport functools\nimport importlib.metadata\nimport inspect\nimport json\nimport logging\nimport os\nimport stat\nimport warnings\nfrom collections.abc import Callable, Iterable\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Literal,\n    cast,\n    get_args,\n    get_origin,\n    get_type_hints,\n    overload,\n)\n\ntry:\n    # orjson might not be available on some architectures, so let's make it defensive just in case\n    from orjson import dumps as orjson_dumps\n    from orjson import loads as orjson_loads\nexcept:\n    warnings.warn(\"orjson couldn't be imported. It's _highly_ recommended for better caching performance\", stacklevel=2)\n\n    def orjson_dumps(*args, **kwargs):  # type: ignore[misc]\n        # sqlite needs a blob\n        return json.dumps(*args, **kwargs).encode('utf8')\n\n    orjson_loads = json.loads  # ty: ignore[invalid-assignment]\n\nimport platformdirs\n\nfrom .backend.common import AbstractBackend\nfrom .backend.file import FileBackend\nfrom .backend.sqlite import SqliteBackend\nfrom .common import CachewException, SourceHash, TypeNotSupported\nfrom .logging_helper import make_logger\nfrom .marshall.cachew import CachewMarshall, build_schema\nfrom .utils import resolve_type_parameters\n\n# in case of changes in the way cachew stores data, this should be changed to discard old caches\nCACHEW_VERSION: str = importlib.metadata.version(__name__)\n\ntype PathIsh = Path | str\n\nBackend = Literal['sqlite', 'file']\n\n\nclass settings:\n    '''\n    Global settings, you can override them after importing cachew\n    '''\n\n    '''\n    Toggle to disable caching\n    '''\n    ENABLE: bool = True\n\n    DEFAULT_CACHEW_DIR: PathIsh = Path(platformdirs.user_cache_dir('cachew'))\n\n    '''\n    Set to true if you want to fail early. Otherwise falls back to non-cached version\n    '''\n    THROW_ON_ERROR: bool = False\n\n    DEFAULT_BACKEND: Backend = 'sqlite'\n\n\ndef get_logger() -> logging.Logger:\n    return make_logger(__name__)\n\n\nBACKENDS: dict[Backend, type[AbstractBackend]] = {\n    'file': FileBackend,\n    'sqlite': SqliteBackend,\n}\n\n\ntype PathProvider[**P] = PathIsh | Callable[P, PathIsh]\ntype HashFunction[**P] = Callable[P, SourceHash]\n\n\ndef default_hash(*args, **kwargs) -> SourceHash:\n    # TODO eh, demand hash? it's not safe either... ugh\n    # can lead to werid consequences otherwise..\n    return str(args + tuple(sorted(kwargs.items())))  # good enough??\n\n\n# TODO give it as an example in docs\ndef mtime_hash(path: Path, *args, **kwargs) -> SourceHash:\n    mt = path.stat().st_mtime\n    return default_hash(f'{path}.{mt}', *args, **kwargs)\n\n\nFailure = str  # deliberately not a type =, used in type checks\ntype Kind = Literal['single', 'multiple']\ntype Inferred = tuple[Kind, type[Any]]\n\n\ndef infer_return_type(func) -> Failure | Inferred:\n    \"\"\"\n    >>> def const() -> int:\n    ...     return 123\n    >>> infer_return_type(const)\n    ('single', <class 'int'>)\n\n    >>> from typing import Optional\n    >>> def first_character(s: str) -> Optional[str]:\n    ...     return None if len(s) == 0 else s[0]\n    >>> kind, opt = infer_return_type(first_character)\n    >>> # in 3.8, Optional[str] is printed as Union[str, None], so need to hack around this\n    >>> (kind, opt == Optional[str])\n    ('single', True)\n\n    # tuple is an iterable.. 
but presumably should be treated as a single value\n    >>> from typing import Tuple\n    >>> def a_tuple() -> Tuple[int, str]:\n    ...     return (123, 'hi')\n    >>> infer_return_type(a_tuple)\n    ('single', tuple[int, str])\n\n    >>> from typing import Collection, NamedTuple\n    >>> class Person(NamedTuple):\n    ...     name: str\n    ...     age: int\n    >>> def person_provider() -> Collection[Person]:\n    ...     return []\n    >>> infer_return_type(person_provider)\n    ('multiple', <class 'cachew.Person'>)\n\n    >>> def single_str() -> str:\n    ...     return 'hello'\n    >>> infer_return_type(single_str)\n    ('single', <class 'str'>)\n\n    >>> def single_person() -> Person:\n    ...     return Person(name=\"what\", age=-1)\n    >>> infer_return_type(single_person)\n    ('single', <class 'cachew.Person'>)\n\n    >>> from typing import Sequence\n    >>> def int_provider() -> Sequence[int]:\n    ...     return (1, 2, 3)\n    >>> infer_return_type(int_provider)\n    ('multiple', <class 'int'>)\n\n    >>> from typing import Iterator\n    >>> def union_provider() -> Iterator[str | int]:\n    ...     yield 1\n    ...     yield 'aaa'\n    >>> infer_return_type(union_provider)\n    ('multiple', str | int)\n\n    >>> from typing import Iterator\n    >>> type Str = str\n    >>> type Int = int\n    >>> type IteratorStrInt = Iterator[Str | Int]\n    >>> def iterator_str_int() -> IteratorStrInt:\n    ...     yield 1\n    ...     yield 'aaa'\n    >>> infer_return_type(iterator_str_int)\n    ('multiple', str | int)\n\n    # a bit of an edge case\n    >>> from typing import Tuple\n    >>> def empty_tuple() -> Iterator[Tuple[()]]:\n    ...     yield ()\n    >>> infer_return_type(empty_tuple)\n    ('multiple', tuple[()])\n\n    ... # doctest: +ELLIPSIS\n\n    >>> def untyped():\n    ...     return 123\n    >>> infer_return_type(untyped)\n    'no return type annotation...'\n\n    >>> from typing import List\n    >>> class Custom:\n    ...     pass\n    >>> def unsupported() -> Custom:\n    ...     return Custom()\n    >>> infer_return_type(unsupported)\n    \"can't infer type from <class 'cachew.Custom'>: can't cache <class 'cachew.Custom'>\"\n\n    >>> def unsupported_list() -> List[Custom]:\n    ...     return [Custom()]\n    >>> infer_return_type(unsupported_list)\n    \"can't infer type from list[cachew.Custom]: can't cache <class 'cachew.Custom'>\"\n    \"\"\"\n    try:\n        hints = get_type_hints(func)\n    except Exception as ne:\n        # get_type_hints might fail if types are forward defined or missing\n        # see test_future_annotation for an example\n        return str(ne)\n    rtype = hints.get('return', None)\n    if rtype is None:\n        return f\"no return type annotation on {func}\"\n\n    rtype = resolve_type_parameters(rtype)\n\n    def bail(reason: str) -> str:\n        return f\"can't infer type from {rtype}: \" + reason\n\n    # first we wanna check if the top level type is some sort of iterable that makes sense ot cache\n    # e.g. 
List/Sequence/Iterator etc\n    return_multiple = _returns_multiple(rtype)\n\n    if return_multiple:\n        # then the actual type to cache will be the argument of the top level one\n        args = get_args(rtype)\n        if args is None:\n            return bail(\"has no __args__\")\n\n        if len(args) != 1:\n            return bail(f\"wrong number of __args__: {args}\")\n\n        (cached_type,) = args\n    else:\n        cached_type = rtype\n\n    try:\n        build_schema(Type=cached_type)\n    except TypeNotSupported as ex:\n        return bail(f\"can't cache {ex.type_}\")\n\n    return ('multiple' if return_multiple else 'single', cached_type)\n\n\ndef _returns_multiple(rtype) -> bool:\n    origin = get_origin(rtype)\n    if origin is None:\n        return False\n    if origin is tuple:\n        # usually tuples are more like single values rather than a sequence? (+ this works for namedtuple)\n        return False\n    try:\n        return issubclass(origin, Iterable)\n    except TypeError:\n        # that would happen if origin is not a 'proper' type, e.g. is a Union or something\n        # seems like exception is the easiest way to check\n        return False\n\n\n# https://stackoverflow.com/questions/653368/how-to-create-a-python-decorator-that-can-be-used-either-with-or-without-paramet\ndef doublewrap(f):\n    @functools.wraps(f)\n    def new_dec(*args, **kwargs):\n        if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):\n            # actual decorated function\n            return f(args[0])\n        else:\n            # decorator arguments\n            return lambda realf: f(realf, *args, **kwargs)\n\n    return new_dec\n\n\ndef cachew_error(e: Exception, *, logger: logging.Logger) -> None:\n    if settings.THROW_ON_ERROR:\n        # TODO would be nice to throw from the original code line -- maybe mess with the stack here?\n        raise e\n    logger.error(\"error while setting up cache, falling back to non-cached version\")\n    logger.exception(e)\n\n\nuse_default_path = cast(Path, object())\n\n\n# using cachew_impl here just to use different signatures during type checking (see below)\n@doublewrap\ndef cachew_impl[**P](\n    func=None,  # TODO should probably type it after switch to python 3.10/proper paramspec\n    cache_path: PathProvider[P] | None = use_default_path,\n    *,\n    force_file: bool = False,\n    cls: type | tuple[Kind, type] | None = None,\n    depends_on: HashFunction[P] = default_hash,\n    logger: logging.Logger | None = None,\n    chunk_by: int = 100,\n    # NOTE: allowed values for chunk_by depend on the system.\n    # some systems (to be more specific, sqlite builds), it might be too large and cause issues\n    # ideally this would be more defensive/autodetected, maybe with a warning?\n    # you can use 'test_many' to experiment\n    # - too small values (e.g. 10)  are slower than 100 (presumably, too many sql statements)\n    # - too large values (e.g. 10K) are slightly slower as well (not sure why?)\n    synthetic_key: str | None = None,\n    backend: Backend | None = None,\n    **kwargs,\n):\n    r\"\"\"\n    Database-backed cache decorator. TODO more description?\n    # TODO use this doc in readme?\n\n    :param cache_path: if not set, `cachew.settings.DEFAULT_CACHEW_DIR` will be used.\n    :param force_file: if set to True, assume `cache_path` is a regular file (instead of a directory)\n    :param cls: if not set, cachew will attempt to infer it from return type annotation. 
See :func:`infer_return_type` and :func:`cachew.tests.test_cachew.test_return_type_inference`.\n    :param depends_on: hash function to determine whether the underlying . Can potentially benefit from the use of side effects (e.g. file modification time). TODO link to test?\n    :param logger: custom logger, if not specified will use logger named `cachew`. See :func:`get_logger`.\n    :return: iterator over original or cached items\n\n    Usage example:\n    >>> from typing import NamedTuple, Iterator\n    >>> class Link(NamedTuple):\n    ...     url : str\n    ...     text: str\n    ...\n    >>> @cachew\n    ... def extract_links(archive_path: str) -> Iterator[Link]:\n    ...     for i in range(5):\n    ...         # simulate slow IO\n    ...         # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours\n    ...         import time; time.sleep(1)\n    ...         yield Link(url=f'http://link{i}.org', text=f'text {i}')\n    ...\n    >>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run\n    [Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')]\n\n    >>> from timeit import Timer\n    >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1)\n    ... # second run is cached, so should take less time\n    >>> print(f\"call took {int(res)} seconds\")\n    call took 0 seconds\n\n    >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1)\n    ... # now file has changed, so the cache will be discarded\n    >>> print(f\"call took {int(res)} seconds\")\n    call took 5 seconds\n    \"\"\"\n    if logger is None:\n        module_name = getattr(func, '__module__', None)\n        if module_name is not None and module_name in logging.Logger.manager.loggerDict:\n            # if logger for the function's module already exists, reuse it\n            logger = logging.getLogger(module_name)\n        else:\n            # rely on default cachew logger\n            logger = get_logger()\n\n    class AddFuncName(logging.LoggerAdapter):\n        def process(self, msg, kwargs):\n            extra = self.extra\n            assert extra is not None\n            func_name = extra['func_name']\n            return f'[{func_name}] {msg}', kwargs\n\n    assert func is not None\n    func_name = callable_name(func)\n    adapter = AddFuncName(logger, {'func_name': func_name})\n    logger = cast(logging.Logger, adapter)\n\n    hashf = kwargs.get('hashf')\n    if hashf is not None:\n        warnings.warn(\"'hashf' is deprecated. Please use 'depends_on' instead\", stacklevel=2)\n        depends_on = hashf\n\n    # todo not very nice that ENABLE check is scattered across two places\n    if not settings.ENABLE or cache_path is None:\n        logger.debug('cache explicitly disabled (settings.ENABLE is False or cache_path is None)')\n        return func\n\n    if cache_path is use_default_path:\n        cache_path = settings.DEFAULT_CACHEW_DIR\n        logger.debug(f'no cache_path specified, using the default {cache_path}')\n\n    use_kind: Kind | None = None\n    use_cls: type | None = None\n    if cls is not None:\n        # defensive here since typing. 
objects passed as cls might fail on isinstance\n        try:\n            is_tuple = isinstance(cls, tuple)\n        except Exception:\n            is_tuple = False\n        if is_tuple:\n            use_kind, use_cls = cls  # type: ignore[misc]\n        else:\n            use_kind = 'multiple'\n            use_cls = cls  # type: ignore[assignment]\n\n    # TODO fuzz infer_return_type, should never crash?\n    inference_res = infer_return_type(func)\n    if isinstance(inference_res, Failure):\n        msg = f\"failed to infer cache type: {inference_res}. See https://github.com/karlicoss/cachew#features for the list of supported types.\"\n        if use_cls is None:\n            ex = CachewException(msg)\n            cachew_error(ex, logger=logger)\n            return func\n        else:\n            # it's ok, assuming user knows better\n            logger.debug(msg)\n            assert use_kind is not None\n    else:\n        (inferred_kind, inferred_cls) = inference_res\n        if use_cls is None:\n            logger.debug(f'using inferred type {inferred_kind} {inferred_cls}')\n            (use_kind, use_cls) = (inferred_kind, inferred_cls)\n        else:\n            assert use_kind is not None\n            if (use_kind, use_cls) != inference_res:\n                logger.warning(\n                    f\"inferred type {inference_res} mismatches explicitly specified type {(use_kind, use_cls)}\"\n                )\n                # TODO not sure if it should be a more serious error...\n\n    if use_kind == 'single':\n        # pretend it's an iterable, this is just simpler for cachew_wrapper\n        @functools.wraps(func)\n        def _func(*args, **kwargs):\n            return [func(*args, **kwargs)]\n\n    else:\n        _func = func\n\n    assert use_cls is not None\n\n    ctx = Context(\n        func         =_func,\n        cache_path   =cache_path,\n        force_file   =force_file,\n        cls_         =use_cls,\n        depends_on   =depends_on,\n        logger       =logger,\n        chunk_by     =chunk_by,\n        synthetic_key=synthetic_key,\n        backend      =backend,\n    )  # fmt: skip\n\n    # hack to avoid extra stack frame (see test_recursive*)\n    @functools.wraps(func)\n    def binder(*args, **kwargs):\n        kwargs['_cachew_context'] = ctx\n        res = cachew_wrapper(*args, **kwargs)\n\n        if use_kind == 'single':\n            lres = list(res)\n            assert len(lres) == 1, lres  # shouldn't happen\n            return lres[0]\n        return res\n\n    return binder\n\n\nif TYPE_CHECKING:\n    # we need two versions due to @doublewrap\n    # this is when we just annotate as @cachew without any args\n    @overload\n    def cachew[F: Callable](fun: F) -> F: ...\n\n    # NOTE: we won't really be able to make sure the args of cache_path are the same as args of the wrapped function\n    # because when cachew() is called, we don't know anything about the wrapped function yet\n    # but at least it works for checking that cache_path and depends_on have the same args :shrug:\n    @overload\n    def cachew[F, **P](\n        cache_path: PathProvider[P] | None = ...,\n        *,\n        force_file: bool = ...,\n        cls: type | tuple[Kind, type] | None = ...,\n        depends_on: HashFunction[P] = ...,\n        logger: logging.Logger | None = ...,\n        chunk_by: int = ...,\n        synthetic_key: str | None = ...,\n        backend: Backend | None = ...,\n    ) -> Callable[[F], F]: ...\n\n    def cachew(*args, **kwargs):  # make ty happy\n        raise 
NotImplementedError\nelse:\n    cachew = cachew_impl\n\n\ndef callable_name(func: Callable) -> str:\n    # some functions don't have __module__\n    mod = getattr(func, '__module__', None) or ''\n    return f'{mod}:{getattr(func, \"__qualname__\")}'\n\n\ndef callable_module_name(func: Callable) -> str | None:\n    return getattr(func, '__module__', None)\n\n\n# could cache this, but it might be worth not to, so the user can change it on the fly?\ndef _parse_disabled_modules(logger: logging.Logger | None = None) -> list[str]:\n    # e.g. CACHEW_DISABLE=my.browser:my.reddit\n    if 'CACHEW_DISABLE' not in os.environ:\n        return []\n    disabled = os.environ['CACHEW_DISABLE']\n    if disabled.strip() == '':\n        return []\n    if ',' in disabled and logger:\n        logger.warning(\n            'CACHEW_DISABLE contains a comma, but this expects a $PATH-like, colon-separated list; '\n            f'try something like CACHEW_DISABLE={disabled.replace(\",\", \":\")}'\n        )\n    # remove any empty strings in case the user did something like CACHEW_DISABLE=my.module:$CACHEW_DISABLE\n    return [p for p in disabled.split(':') if p.strip() != '']\n\n\ndef _matches_disabled_module(module_name: str, pattern: str) -> bool:\n    '''\n    >>> _matches_disabled_module('my.browser', 'my.browser')\n    True\n    >>> _matches_disabled_module('my.browser', 'my.*')\n    True\n    >>> _matches_disabled_module('my.browser', 'my')\n    True\n    >>> _matches_disabled_module('my.browser', 'my.browse*')\n    True\n    >>> _matches_disabled_module('my.browser.export', 'my.browser')\n    True\n    >>> _matches_disabled_module('mysomething.else', '*')  # CACHEW_DISABLE='*' disables everything\n    True\n    >>> _matches_disabled_module('my.browser', 'my.br?????')  # fnmatch supports unix-like patterns\n    True\n    >>> _matches_disabled_module('my.browser', 'my.browse')\n    False\n    >>> _matches_disabled_module('mysomething.else', 'my')  # since not at '.' boundary, doesn't match\n    False\n    >>> _matches_disabled_module('mysomething.else', '')\n    False\n    >>> _matches_disabled_module('my.browser', 'my.browser.export')\n    False\n    '''\n\n    if module_name == pattern:\n        return True\n\n    module_parts = module_name.split('.')\n    pattern_parts = pattern.split('.')\n\n    # e.g. 
if pattern is 'module.submod.inner_module' and module is just 'module.submod'\n    # there's no possible way for it to match\n    if len(module_parts) < len(pattern_parts):\n        return False\n\n    for mp, pp in zip(module_parts, pattern_parts, strict=False):\n        if fnmatch.fnmatch(mp, pp):\n            continue\n        return False\n    return True\n\n\ndef _module_is_disabled(module_name: str, logger: logging.Logger) -> bool:\n    disabled_modules = _parse_disabled_modules(logger)\n    for pat in disabled_modules:\n        if _matches_disabled_module(module_name, pat):\n            logger.debug(\n                f\"caching disabled for {module_name} (matched '{pat}' from 'CACHEW_DISABLE={os.environ['CACHEW_DISABLE']}')\"\n            )\n            return True\n    return False\n\n\n# fmt: off\n_CACHEW_CACHED       = 'cachew_cached'  # TODO add to docs\n_SYNTHETIC_KEY       = 'synthetic_key'\n_SYNTHETIC_KEY_VALUE = 'synthetic_key_value'\n_DEPENDENCIES        = 'dependencies'\n# fmt: on\n\n\n@dataclass\nclass Context[**P]:\n    # fmt: off\n    func         : Callable\n    cache_path   : PathProvider[P]\n    force_file   : bool\n    cls_         : type\n    depends_on   : HashFunction[P]\n    logger       : logging.Logger\n    chunk_by     : int\n    synthetic_key: str | None\n    backend      : Backend | None\n\n    def composite_hash(self, *args, **kwargs) -> dict[str, Any]:\n        fsig = inspect.signature(self.func)\n        # defaults wouldn't be passed in kwargs, but they can be an implicit dependency (especially in between program runs)\n        defaults = {\n            k: v.default\n            for k, v in fsig.parameters.items()\n            if v.default is not inspect.Parameter.empty\n        }\n        # but only pass default if the user wants it in the hash function?\n        hsig = inspect.signature(self.depends_on)\n        defaults = {\n            k: v\n            for k, v in defaults.items()\n            if k in hsig.parameters or 'kwargs' in hsig.parameters\n        }\n        kwargs = {**defaults, **kwargs}\n        schema = str(self.cls_)\n        hash_parts = {\n            'cachew'      : CACHEW_VERSION,\n            'schema'      : schema,\n            _DEPENDENCIES : str(self.depends_on(*args, **kwargs)),\n        }\n        synthetic_key = self.synthetic_key\n        if synthetic_key is not None:\n            hash_parts[_SYNTHETIC_KEY      ] = synthetic_key\n            hash_parts[_SYNTHETIC_KEY_VALUE] = kwargs[synthetic_key]\n            # FIXME assert it's in kwargs in the first place?\n            # FIXME support positional args too? maybe extract the name from signature somehow? 
dunno\n            # need to test it\n        return hash_parts\n    # fmt: on\n\n\ndef cachew_wrapper[**P](\n    *args,\n    _cachew_context: Context[P],\n    **kwargs,\n):\n    C = _cachew_context\n    # fmt: off\n    func          = C.func\n    cache_path    = C.cache_path\n    force_file    = C.force_file\n    cls           = C.cls_\n    logger        = C.logger\n    chunk_by      = C.chunk_by\n    synthetic_key = C.synthetic_key\n    backend_name  = C.backend\n    # fmt: on\n\n    used_backend = backend_name or settings.DEFAULT_BACKEND\n\n    func_name = callable_name(func)\n    if not settings.ENABLE:\n        logger.debug('cache explicitly disabled (settings.ENABLE is False)')\n        yield from func(*args, **kwargs)\n        return\n\n    mod_name = callable_module_name(func)\n    if mod_name is not None and _module_is_disabled(mod_name, logger):\n        yield from func(*args, **kwargs)\n        return\n\n    def get_db_path() -> Path | None:\n        db_path: Path\n        if callable(cache_path):\n            pp = cache_path(*args, **kwargs)\n            if pp is None:\n                logger.debug('cache explicitly disabled (cache_path is None)')\n                # early return, in this case we just yield the original items from the function\n                return None\n            else:\n                db_path = Path(pp)\n        else:\n            db_path = Path(cache_path)\n\n        db_path.parent.mkdir(parents=True, exist_ok=True)\n\n        # need to be atomic here, hence calling stat() once and then just using the results\n        try:\n            # note: stat follows symlinks (which is what we want)\n            st = db_path.stat()\n        except FileNotFoundError:\n            # doesn't exist. then it's controlled by force_file\n            if force_file:\n                # just use db_path as is\n                pass\n            else:\n                db_path.mkdir(parents=True, exist_ok=True)\n                db_path = db_path / func_name\n        else:\n            # already exists, so just use callable name if it's a dir\n            if stat.S_ISDIR(st.st_mode):\n                db_path = db_path / func_name\n\n        logger.debug(f'using {used_backend}:{db_path} for cache')\n        return db_path\n\n    def try_use_synthetic_key() -> None:\n        if synthetic_key is None:\n            return\n        # attempt to use existing cache if possible, as a 'prefix'\n\n        old_hash_d: dict[str, Any] = {}\n        if old_hash is not None:\n            try:\n                old_hash_d = json.loads(old_hash)\n            except json.JSONDecodeError:\n                # possible if we used old cachew version (<=0.8.1), hash wasn't json\n                pass\n\n        hash_diffs = {\n            k: new_hash_d.get(k) == old_hash_d.get(k)\n            for k in (*new_hash_d.keys(), *old_hash_d.keys())\n            # the only 'allowed' differences for hash, otherwise need to recompute (e.g. if schema changed)\n            if k not in {_SYNTHETIC_KEY_VALUE, _DEPENDENCIES}\n        }\n        cache_compatible = all(hash_diffs.values())\n        if not cache_compatible:\n            return\n\n        def missing_keys(cached: list[str], wanted: list[str]) -> list[str] | None:\n            # FIXME assert both cached and wanted are sorted? since we rely on it\n            # if not, then the user could use some custom key for caching (e.g. 
normalise filenames etc)\n            # although in this case passing it into the function wouldn't make sense?\n\n            if len(cached) == 0:\n                # no point trying to reuse anything, cache should be empty?\n                return None\n            if len(wanted) == 0:\n                # similar, no way to reuse cache\n                return None\n            if cached[0] != wanted[0]:\n                # there is no common prefix, so no way to reuse cache really\n                return None\n            last_cached = cached[-1]\n            # ok, now actually figure out which items are missing\n            for i, k in enumerate(wanted):\n                if k > last_cached:\n                    # ok, rest of items are missing\n                    return wanted[i:]\n            # otherwise too many things are cached, and we seem to want less\n            return None\n\n        new_values: list[str] = new_hash_d[_SYNTHETIC_KEY_VALUE]\n        old_values: list[str] = old_hash_d[_SYNTHETIC_KEY_VALUE]\n        missing = missing_keys(cached=old_values, wanted=new_values)\n        if missing is not None:\n            # can reuse cache\n            kwargs[_CACHEW_CACHED] = cached_items()\n            kwargs[synthetic_key] = missing\n\n    early_exit = False\n\n    def written_to_cache():\n        nonlocal early_exit\n\n        datas = func(*args, **kwargs)\n\n        if isinstance(backend, FileBackend):\n            # FIXME uhhh.. this is a bit crap\n            # but in sqlite mode we don't want to publish new hash before we write new items\n            # maybe should use tmp table for hashes as well?\n            backend.write_new_hash(new_hash)\n        else:\n            # happens later for sqlite\n            pass\n\n        flush_blobs = backend.flush_blobs\n\n        chunk: list[Any] = []\n\n        def flush() -> None:\n            nonlocal chunk\n            if len(chunk) > 0:\n                flush_blobs(chunk=chunk)\n                chunk = []\n\n        total_objects = 0\n        for obj in datas:\n            try:\n                total_objects += 1\n                yield obj\n            except GeneratorExit:\n                early_exit = True\n                return\n\n            dct = marshall.dump(obj)\n            blob = orjson_dumps(dct)\n            chunk.append(blob)\n            if len(chunk) >= chunk_by:\n                flush()\n        flush()\n\n        backend.finalize(new_hash)\n        logger.info(f'wrote   {total_objects} objects to   cachew ({used_backend}:{db_path})')\n\n    def cached_items():\n        total_cached = backend.cached_blobs_total()\n        total_cached_s = '' if total_cached is None else f'{total_cached} '\n        logger.info(f'loading {total_cached_s}objects from cachew ({used_backend}:{db_path})')\n\n        for blob in backend.cached_blobs():\n            j = orjson_loads(blob)\n            obj = marshall.load(j)\n            yield obj\n\n    # NOTE: annoyingly huge try/catch ahead...\n    # but it lets us save a function call, hence a stack frame\n    # see test_recursive*\n    try:\n        db_path = get_db_path()\n        if db_path is None:\n            yield from func(*args, **kwargs)\n            return\n\n        BackendCls = BACKENDS[used_backend]\n\n        new_hash_d = C.composite_hash(*args, **kwargs)\n        new_hash: SourceHash = json.dumps(new_hash_d)\n        logger.debug(f'new hash: {new_hash}')\n\n        marshall: CachewMarshall[Any] = CachewMarshall(Type_=cls)\n\n        with BackendCls(cache_path=db_path, 
logger=logger) as backend:\n            old_hash = backend.get_old_hash()\n            logger.debug(f'old hash: {old_hash}')\n\n            if new_hash == old_hash:\n                logger.debug('hash matched: loading from cache')\n                yield from cached_items()\n                return\n\n            logger.debug('hash mismatch: computing data and writing to db')\n\n            try_use_synthetic_key()\n\n            got_write = backend.get_exclusive_write()\n            if not got_write:\n                # NOTE: this is the bit we really have to watch out for and not put in a helper function\n                # otherwise it's causing an extra stack frame on every call\n                # the rest (reading from cachew or writing to cachew) happens once per function call? so not a huge deal\n                yield from func(*args, **kwargs)\n                return\n\n            # at this point we're guaranteed to have an exclusive write transaction\n            yield from written_to_cache()\n    except Exception as e:\n        # sigh... see test_early_exit_shutdown...\n        if early_exit and 'Cannot operate on a closed database' in str(e):\n            return\n\n        # todo hmm, kinda annoying that it tries calling the function twice?\n        # but gonna require some sophisticated cooperation with the cached wrapper otherwise\n        cachew_error(e, logger=logger)\n        yield from func(*args, **kwargs)\n\n\n__all__ = [\n    'CachewException',\n    'HashFunction',\n    'SourceHash',\n    'cachew',\n    'get_logger',\n]\n"
  },
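  {
    "path": "examples/depends_on_mtime.py",
    "content": "'''\nIllustrative usage sketch -- not part of the library.\n\nShows the `depends_on` parameter with a side effect (file modification time),\nas hinted at in the decorator docstring. The function and the log file path\nare made up for the example.\n'''\n\nfrom collections.abc import Iterator\nfrom pathlib import Path\n\nfrom cachew import cachew\n\n\n# cache is discarded whenever the file's mtime changes, even if the path itself doesn't\n@cachew(depends_on=lambda path: path.stat().st_mtime)\ndef parse_log(path: Path) -> Iterator[str]:\n    # pretend this is expensive to compute\n    with path.open() as fo:\n        for line in fo:\n            yield line.strip()\n\n\nif __name__ == '__main__':\n    print(list(parse_log(Path('access.log'))))\n"
  },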
  {
    "path": "src/cachew/backend/common.py",
    "content": "import logging\nfrom abc import abstractmethod\nfrom collections.abc import Iterator, Sequence\nfrom pathlib import Path\n\nfrom ..common import SourceHash\n\n\nclass AbstractBackend:\n    @abstractmethod\n    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:\n        raise NotImplementedError\n\n    @abstractmethod\n    def __enter__(self):\n        raise NotImplementedError\n\n    def __exit__(self, *args) -> None:\n        raise NotImplementedError\n\n    def get_old_hash(self) -> SourceHash | None:\n        raise NotImplementedError\n\n    def cached_blobs_total(self) -> int | None:\n        raise NotImplementedError\n\n    def cached_blobs(self) -> Iterator[bytes]:\n        raise NotImplementedError\n\n    def get_exclusive_write(self) -> bool:\n        '''\n        Returns whether it actually managed to get it\n        '''\n        raise NotImplementedError\n\n    def write_new_hash(self, new_hash: SourceHash) -> None:\n        raise NotImplementedError\n\n    def flush_blobs(self, chunk: Sequence[bytes]) -> None:\n        raise NotImplementedError\n\n    def finalize(self, new_hash: SourceHash) -> None:\n        raise NotImplementedError\n"
  },
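  {
    "path": "examples/inmemory_backend_sketch.py",
    "content": "'''\nIllustrative sketch -- not part of the library.\n\nA minimal in-memory implementation of AbstractBackend, just to spell out the\nbackend protocol (hash check, exclusive write, blob flushing, finalize).\ncachew itself only instantiates the backends it knows about, so this class is\nfor reading purposes only.\n'''\n\nimport logging\nfrom collections.abc import Iterator, Sequence\nfrom pathlib import Path\n\nfrom cachew.backend.common import AbstractBackend\nfrom cachew.common import SourceHash\n\n# module level store so the 'cache' survives across backend instances\n_STORE: dict[Path, tuple[SourceHash, list[bytes]]] = {}\n\n\nclass InMemoryBackend(AbstractBackend):\n    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:\n        self.cache_path = cache_path\n        self.logger = logger\n        self._pending: list[bytes] = []\n\n    def __enter__(self) -> 'InMemoryBackend':\n        return self\n\n    def __exit__(self, *args) -> None:\n        pass\n\n    def get_old_hash(self) -> SourceHash | None:\n        entry = _STORE.get(self.cache_path)\n        return None if entry is None else entry[0]\n\n    def cached_blobs_total(self) -> int | None:\n        return len(_STORE[self.cache_path][1])\n\n    def cached_blobs(self) -> Iterator[bytes]:\n        yield from _STORE[self.cache_path][1]\n\n    def get_exclusive_write(self) -> bool:\n        # a real backend needs actual locking here (cf. the exclusive 'xb' open in FileBackend)\n        return True\n\n    def write_new_hash(self, new_hash: SourceHash) -> None:\n        # nothing to do: for this toy backend it's fine to publish everything in finalize\n        pass\n\n    def flush_blobs(self, chunk: Sequence[bytes]) -> None:\n        self._pending.extend(chunk)\n\n    def finalize(self, new_hash: SourceHash) -> None:\n        # publish hash and blobs together, mirroring the tmp file/tmp table rename dance\n        _STORE[self.cache_path] = (new_hash, self._pending)\n        self._pending = []\n"
  },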
  {
    "path": "src/cachew/backend/file.py",
    "content": "import logging\nfrom collections.abc import Iterator, Sequence\nfrom pathlib import Path\nfrom typing import (\n    BinaryIO,\n)\n\nfrom ..common import SourceHash\nfrom .common import AbstractBackend\n\n\nclass FileBackend(AbstractBackend):\n    jsonl: Path\n    jsonl_tmp: Path\n    jsonl_fr: BinaryIO | None\n    jsonl_tmp_fw: BinaryIO | None\n\n    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:\n        self.logger = logger\n        self.jsonl = cache_path\n        self.jsonl_tmp = Path(str(self.jsonl) + '.tmp')\n\n        self.jsonl_fr = None\n        self.jsonl_tmp_fw = None\n\n    def __enter__(self) -> 'FileBackend':\n        try:\n            self.jsonl_fr = self.jsonl.open('rb')\n        except FileNotFoundError:\n            self.jsonl_fr = None\n        return self\n\n    def __exit__(self, *args) -> None:\n        if self.jsonl_tmp_fw is not None:\n            # might still exist in case of early exit\n            self.jsonl_tmp.unlink(missing_ok=True)\n\n            # NOTE: need to unlink first\n            # otherwise possible that someone else might open the file before we unlink it\n            self.jsonl_tmp_fw.close()\n\n        if self.jsonl_fr is not None:\n            self.jsonl_fr.close()\n\n    def get_old_hash(self) -> SourceHash | None:\n        if self.jsonl_fr is None:\n            return None\n        hash_line = self.jsonl_fr.readline().rstrip(b'\\n')\n        return hash_line.decode('utf8')\n\n    def cached_blobs_total(self) -> int | None:\n        # not really sure how to support that for a plaintext file?\n        # could wc -l but it might be costly..\n        return None\n\n    def cached_blobs(self) -> Iterator[bytes]:\n        assert self.jsonl_fr is not None  # should be guaranteed by get_old_hash\n        yield from self.jsonl_fr  # yields line by line\n\n    def get_exclusive_write(self) -> bool:\n        # NOTE: opening in x (exclusive write) mode just in case, so it throws if file exists\n        try:\n            self.jsonl_tmp_fw = self.jsonl_tmp.open('xb')\n        except FileExistsError:\n            self.jsonl_tmp_fw = None\n            return False\n        else:\n            return True\n\n    def write_new_hash(self, new_hash: SourceHash) -> None:\n        assert self.jsonl_tmp_fw is not None\n        self.jsonl_tmp_fw.write(new_hash.encode('utf8') + b'\\n')\n\n    def flush_blobs(self, chunk: Sequence[bytes]) -> None:\n        fw = self.jsonl_tmp_fw\n        assert fw is not None\n        for blob in chunk:\n            fw.write(blob)\n            fw.write(b'\\n')\n\n    def finalize(self, new_hash: SourceHash) -> None:  # noqa: ARG002\n        # TODO defensive??\n        self.jsonl_tmp.rename(self.jsonl)\n"
  },
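  {
    "path": "examples/inspect_file_cache.py",
    "content": "'''\nIllustrative sketch -- not part of the library.\n\nPeeks inside a cache file produced by FileBackend: per file.py, the first line\nof the file is the source hash (a JSON document) and every subsequent line is\none serialized cached object. The cache path below is hypothetical.\n'''\n\nimport json\nfrom pathlib import Path\n\n\ndef inspect(cache_file: Path) -> None:\n    with cache_file.open('rb') as fo:\n        # first line: the hash of the inputs the cache was computed for\n        hash_line = fo.readline().rstrip(b'\\n')\n        print('source hash   :', json.loads(hash_line.decode('utf8')))\n        # remaining lines: one blob per cached object\n        print('cached objects:', sum(1 for _ in fo))\n\n\nif __name__ == '__main__':\n    inspect(Path('/tmp/cachew/mymodule:myfunc'))\n"
  },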
  {
    "path": "src/cachew/backend/sqlite.py",
    "content": "import logging\nimport sqlite3\nimport time\nimport warnings\nfrom collections.abc import Iterator, Sequence\nfrom pathlib import Path\n\nimport sqlalchemy\nimport sqlalchemy.exc\nfrom sqlalchemy import Column, Table, event, text\nfrom sqlalchemy.dialects import sqlite\n\nfrom ..common import SourceHash\nfrom .common import AbstractBackend\n\n\nclass SqliteBackend(AbstractBackend):\n    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:\n        self.logger = logger\n        self.engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}', connect_args={'timeout': 0})\n        # NOTE: timeout is necessary so we don't lose time waiting during recursive calls\n        # by default, it's several seconds? you'd see 'test_recursive' test performance degrade\n\n        @event.listens_for(self.engine, 'connect')\n        def set_sqlite_pragma(dbapi_connection, connection_record):  # noqa: ARG001\n            # without wal, concurrent reading/writing is not gonna work\n\n            # ugh. that's odd, how are we supposed to set WAL if the very fact of setting wal might lock the db?\n            while True:\n                try:\n                    dbapi_connection.execute('PRAGMA journal_mode=WAL')\n                    break\n                except sqlite3.OperationalError as oe:\n                    if 'database is locked' not in str(oe):\n                        # ugh, pretty annoying that exception doesn't include database path for some reason\n                        raise RuntimeError(f'Error while setting WAL on {cache_path}') from oe\n                time.sleep(0.1)\n\n        self.connection = self.engine.connect()\n\n        \"\"\"\n        Erm... this is pretty confusing.\n        https://docs.sqlalchemy.org/en/13/dialects/sqlite.html#transaction-isolation-level\n\n        Somehow without this thing sqlalchemy logs BEGIN (implicit) instead of BEGIN TRANSACTION which actually works in sqlite...\n\n        Judging by sqlalchemy/dialects/sqlite/base.py, looks like some sort of python sqlite driver problem??\n\n        test_transaction should check this behaviour\n        \"\"\"\n\n        @event.listens_for(self.connection, 'begin')\n        def do_begin(conn):\n            # NOTE there is also BEGIN CONCURRENT in newer versions of sqlite. could use it later?\n            conn.execute(text('BEGIN DEFERRED'))\n\n        self.meta = sqlalchemy.MetaData()\n        self.table_hash = Table('hash', self.meta, Column('value', sqlalchemy.String))\n\n        # fmt: off\n        # actual cache\n        self.table_cache     = Table('cache'    , self.meta, Column('data', sqlalchemy.BLOB))\n        # temporary table, we use it to insert and then (atomically?) rename to the above table at the very end\n        self.table_cache_tmp = Table('cache_tmp', self.meta, Column('data', sqlalchemy.BLOB))\n        # fmt: on\n\n    def __enter__(self) -> 'SqliteBackend':\n        # NOTE: deferred transaction\n        self.transaction = self.connection.begin()\n        # FIXME this is a bit crap.. 
is there a nicer way to use another ctx manager here?\n        self.transaction.__enter__()\n        return self\n\n    def __exit__(self, *args) -> None:\n        self.transaction.__exit__(*args)\n        self.connection.close()\n        self.engine.dispose()\n\n    def get_old_hash(self) -> SourceHash | None:\n        # first, try to do as much as possible read-only, benefiting from deferred transaction\n        old_hashes: Sequence\n        try:\n            # not sure if there is a better way...\n            cursor = self.connection.execute(self.table_hash.select())\n        except sqlalchemy.exc.OperationalError as e:\n            # meh. not sure if this is a good way to handle this..\n            if 'no such table: hash' in str(e):\n                old_hashes = []\n            else:\n                raise e\n        else:\n            old_hashes = cursor.fetchall()\n\n        assert len(old_hashes) <= 1, old_hashes  # shouldn't happen\n\n        old_hash: SourceHash | None\n        if len(old_hashes) == 0:\n            old_hash = None\n        else:\n            old_hash = old_hashes[0][0]  # returns a tuple...\n        return old_hash\n\n    def cached_blobs_total(self) -> int | None:\n        [(total,)] = self.connection.execute(sqlalchemy.select(sqlalchemy.func.count()).select_from(self.table_cache))\n        return total\n\n    def cached_blobs(self) -> Iterator[bytes]:\n        rows = self.connection.execute(self.table_cache.select())\n        # by default, sqlalchemy wraps all results into a Row object\n        # this can cause quite a lot of overhead if you're reading many rows\n        # it seems that in principle, sqlalchemy supports just returning the bare underlying tuple from the dbapi\n        # but from browsing the code it doesn't seem like this functionality is exposed\n        # if you're looking for cues, see\n        # - ._source_supports_scalars\n        # - ._generate_rows\n        # - ._row_getter\n        # by using this raw iterator we speed up reading the cache quite a bit\n        # asked here https://github.com/sqlalchemy/sqlalchemy/discussions/10350\n        raw_row_iterator = getattr(rows, '_raw_row_iterator', None)\n        if raw_row_iterator is None:\n            warnings.warn(\n                \"CursorResult._raw_row_iterator method isn't found. 
This could lead to degraded cache reading performance.\",\n                stacklevel=2,\n            )\n            row_iterator = rows\n        else:\n            row_iterator = raw_row_iterator()\n\n        for (blob,) in row_iterator:\n            yield blob\n\n    def get_exclusive_write(self) -> bool:\n        # NOTE on recursive calls\n        # somewhat magically, they should work as expected with no extra database inserts?\n        # the top level call 'wins' the write transaction and once it's gathered all data, will write it\n        # the 'intermediate' level calls fail to get it and will pass data through\n        # the cached 'bottom' level is read only and will be yielded without a write transaction\n        try:\n            # first 'write' statement will upgrade transaction to write transaction which might fail due to concurrency\n            # see https://www.sqlite.org/lang_transaction.html\n            # NOTE: because of 'checkfirst=True', only the last .create will guarantee the transaction upgrade to write transaction\n            self.table_hash.create(self.connection, checkfirst=True)\n\n            # 'table' used to be the old 'cache' table name, so we just delete it regardless\n            # otherwise it might overinflate the cache db with stale values\n            self.connection.execute(text('DROP TABLE IF EXISTS `table`'))\n\n            # NOTE: we have to use .drop and then .create (e.g. instead of some sort of replace)\n            # since it's possible to have schema changes in between calls\n            # checkfirst=True because it might be the first time we're using cache\n            self.table_cache_tmp.drop(self.connection, checkfirst=True)\n            self.table_cache_tmp.create(self.connection)\n        except sqlalchemy.exc.OperationalError as e:\n            if e.code == 'e3q8' and 'database is locked' in str(e):\n                # someone else must have won the write lock\n                # not much we can do here\n                # NOTE: important to close early, otherwise we might hold onto too many file descriptors during yielding\n                # see test_recursive_deep\n                # (normally connection is closed in SqliteBackend.__exit__)\n                self.connection.close()\n                # in this case all the callee can do is just to call the actual function\n                return False\n            else:\n                raise e\n        return True\n\n    def flush_blobs(self, chunk: Sequence[bytes]) -> None:\n        # uhh. this gives a huge speedup for inserting\n        # since we don't have to create intermediate dictionaries\n        # TODO move this to __init__?\n        insert_into_table_cache_tmp_raw = str(\n            self.table_cache_tmp.insert().compile(dialect=sqlite.dialect(paramstyle='qmark'))\n        )\n        # I also tried setting paramstyle='qmark' in create_engine, but it seems to be ignored :(\n        # idk what benefit sqlalchemy gives at this point, seems to just complicate things\n        self.connection.exec_driver_sql(insert_into_table_cache_tmp_raw, [(c,) for c in chunk])\n\n    def finalize(self, new_hash: SourceHash) -> None:\n        # delete hash first, so if we are interrupted somewhere, it mismatches next time and everything is recomputed\n        self.connection.execute(self.table_hash.delete())\n\n        # checkfirst is necessary since it might not have existed in the first place\n        # e.g. 
first time we use cache\n        self.table_cache.drop(self.connection, checkfirst=True)\n\n        # meh https://docs.sqlalchemy.org/en/14/faq/metadata_schema.html#does-sqlalchemy-support-alter-table-create-view-create-trigger-schema-upgrade-functionality\n        # also seems like sqlalchemy doesn't have any primitives to escape table names.. sigh\n        self.connection.execute(text(f\"ALTER TABLE `{self.table_cache_tmp.name}` RENAME TO `{self.table_cache.name}`\"))\n\n        self.connection.execute(self.table_hash.insert().values([{'value': new_hash}]))\n"
  },
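  {
    "path": "examples/backend_selection.py",
    "content": "'''\nIllustrative sketch -- not part of the library.\n\nChoosing between the sqlite and plain file backends: judging by cachew_wrapper,\na per-decorator `backend=` argument wins over settings.DEFAULT_BACKEND.\nThe backend names used below are an assumption based on the two backend modules\nshipped with the package.\n'''\n\nfrom collections.abc import Iterator\n\nfrom cachew import cachew, settings\n\n\n@cachew(backend='file')  # jsonl file cache for this function only\ndef numbers() -> Iterator[int]:\n    yield from range(100)\n\n\nif __name__ == '__main__':\n    # or flip the default for every cached function at once\n    settings.DEFAULT_BACKEND = 'sqlite'\n    print(len(list(numbers())))\n"
  },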
  {
    "path": "src/cachew/common.py",
    "content": "from dataclasses import dataclass\n\n# TODO better name to represent what it means?\ntype SourceHash = str\n\n\nclass CachewException(RuntimeError):\n    pass\n\n\n@dataclass\nclass TypeNotSupported(CachewException):\n    type_: type\n    reason: str\n\n    def __str__(self) -> str:\n        return f\"{self.type_} isn't supported by cachew: {self.reason}. See https://github.com/karlicoss/cachew#features for the list of supported types.\"\n"
  },
  {
    "path": "src/cachew/compat.py",
    "content": "import sys\n\nif sys.version_info[:2] >= (3, 13):\n    from warnings import deprecated\nelse:\n    from typing_extensions import deprecated\n\n\n__all__ = [\"deprecated\"]\n"
  },
  {
    "path": "src/cachew/experimental.py",
    "content": "from typing import TYPE_CHECKING\n\nif not TYPE_CHECKING:\n    from .compat import deprecated\n\n    @deprecated(\"Exceptions are not an experimental feature anymore and enabled by default.\")\n    def enable_exceptions() -> None:\n        pass\n\n    @deprecated(\"Exceptions are not an experimental feature anymore and enabled by default.\")\n    def disable_exceptions() -> None:\n        pass\n"
  },
  {
    "path": "src/cachew/extra.py",
    "content": "# todo Ideally, needs doublewraps as well? also typing helpers\ndef mcachew(*args, **kwargs):\n    \"\"\"\n    Stands for 'Maybe cachew'.\n    Defensive wrapper around @cachew to make it an optional dependency.\n    \"\"\"\n    try:\n        import cachew\n    except ModuleNotFoundError:\n        import warnings\n\n        warnings.warn(\n            'cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew',\n            stacklevel=2,\n        )\n        return lambda orig_func: orig_func\n    else:\n        return cachew.cachew(*args, **kwargs)\n\n\nfrom contextlib import contextmanager\n\n\n@contextmanager\ndef disabled_cachew():\n    from . import settings\n\n    orig = settings.ENABLE\n    try:\n        settings.ENABLE = False\n        yield\n    finally:\n        settings.ENABLE = orig\n"
  },
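  {
    "path": "examples/maybe_cachew_demo.py",
    "content": "'''\nIllustrative sketch -- not part of the library.\n\nDemonstrates the helpers from cachew.extra: mcachew degrades into a no-op\ndecorator when cachew isn't installed, so libraries can depend on it\noptionally, and disabled_cachew temporarily switches caching off.\nThe function and cache path are made up.\n'''\n\nfrom collections.abc import Iterator\n\nfrom cachew.extra import disabled_cachew, mcachew\n\n\n@mcachew(cache_path='/tmp/demo-cache')\ndef items() -> Iterator[int]:\n    yield from range(10)\n\n\nif __name__ == '__main__':\n    print(sum(items()))  # cached if cachew is installed\n    with disabled_cachew():\n        print(sum(items()))  # runs the original function, bypassing the cache\n"
  },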
  {
    "path": "src/cachew/legacy.py",
    "content": "import typing\nimport warnings\nfrom collections.abc import Iterable, Iterator, Sequence\nfrom dataclasses import dataclass\nfrom datetime import date, datetime\nfrom itertools import chain, islice\nfrom pathlib import Path\nfrom typing import (\n    Any,\n    Generic,\n    NamedTuple,\n    Optional,\n    TypeVar,\n    Union,\n)\n\nimport sqlalchemy\nfrom sqlalchemy import Column\n\nfrom .pytest import parametrize\nfrom .common import CachewException\n\n\ndef get_union_args(cls) -> Optional[tuple[type]]:\n    if getattr(cls, '__origin__', None) != Union:\n        return None\n\n    args = cls.__args__\n    args = tuple(e for e in args if e is not type(None))\n    assert len(args) > 0\n    return args  # ty: ignore[invalid-return-type]\n\n\ndef is_union(cls) -> bool:\n    return get_union_args(cls) is not None\n\n\nTypes = Union[\n    type[str],\n    type[int],\n    type[float],\n    type[bool],\n    type[datetime],\n    type[date],\n    type[dict],\n    type[list],\n    type[Exception],\n    type[NamedTuple],\n]\n\nValues = Union[\n    str,\n    int,\n    float,\n    bool,\n    datetime,\n    date,\n    dict,\n    list,\n    Exception,\n    NamedTuple,\n]\n\nPRIMITIVE_TYPES = {\n    str,\n    int,\n    float,\n    bool,\n    datetime,\n    date,\n    dict,\n    list,\n    Exception,\n}\n\n\ndef is_primitive(cls: type) -> bool:\n    \"\"\"\n    >>> from typing import Dict, Any\n    >>> is_primitive(int)\n    True\n    >>> is_primitive(set)\n    False\n    >>> is_primitive(dict)\n    True\n    \"\"\"\n    return cls in PRIMITIVE_TYPES\n\n\nclass IsoDateTime(sqlalchemy.TypeDecorator):\n    # in theory could use something more effecient? e.g. blob for encoded datetime and tz?\n    # but practically, the difference seems to be pretty small, so perhaps fine for now\n    impl = sqlalchemy.String\n\n    cache_ok = True\n\n    @property\n    def python_type(self):\n        return datetime\n\n    def process_literal_param(self, value, dialect):\n        raise NotImplementedError()  # make pylint happy\n\n    def process_bind_param(self, value: Optional[datetime], dialect) -> Optional[str]:  # noqa: ARG002\n        if value is None:\n            return None\n        # ok, it's a bit hacky... 
attempt to preserve pytz information\n        iso = value.isoformat()\n        tz = getattr(value, 'tzinfo', None)\n        if tz is None:\n            return iso\n        try:\n            import pytz\n        except ImportError:\n            self.warn_pytz()\n            return iso\n        else:\n            if isinstance(tz, pytz.BaseTzInfo):\n                zone = tz.zone\n                # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6\n                assert zone is not None, tz\n                return iso + ' ' + zone\n            else:\n                return iso\n\n    def process_result_value(self, value: Optional[str], dialect) -> Optional[datetime]:  # noqa: ARG002\n        if value is None:\n            return None\n        spl = value.split(' ')\n        dt = datetime.fromisoformat(spl[0])\n        if len(spl) <= 1:\n            return dt\n        zone = spl[1]\n        # else attempt to decipher pytz tzinfo\n        try:\n            import pytz\n        except ImportError:\n            self.warn_pytz()\n            return dt\n        else:\n            tz = pytz.timezone(zone)\n            return dt.astimezone(tz)\n\n    def warn_pytz(self) -> None:\n        warnings.warn('install pytz for better timezone support while serializing with cachew', stacklevel=2)\n\n\n# a bit hacky, but works...\nclass IsoDate(IsoDateTime):\n    impl = sqlalchemy.String\n\n    cache_ok = True\n\n    @property\n    def python_type(self):\n        return date\n\n    def process_literal_param(self, value, dialect):\n        raise NotImplementedError()  # make pylint happy\n\n    def process_result_value(self, value: Optional[str], dialect) -> Optional[date]:  # type: ignore[override]\n        res = super().process_result_value(value, dialect)\n        if res is None:\n            return None\n        return res.date()\n\n\njtypes = (int, float, bool, type(None))\n\n\nclass ExceptionAdapter(sqlalchemy.TypeDecorator):\n    '''\n    Enables support for caching Exceptions. Exception is treated as JSON and serialized.\n\n    It's useful for defensive error handling; in the case of cachew in particular, for preserving error state.\n\n    I elaborate on it here: [mypy-driven error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss).\n    '''\n\n    impl = sqlalchemy.JSON\n\n    cache_ok = True\n\n    @property\n    def python_type(self):\n        return Exception\n\n    def process_literal_param(self, value, dialect):\n        raise NotImplementedError()  # make pylint happy\n\n    def process_bind_param(self, value: Optional[Exception], dialect) -> Optional[list[Any]]:  # noqa: ARG002\n        if value is None:\n            return None\n        sargs: list[Any] = []\n        for a in value.args:\n            if any(isinstance(a, t) for t in jtypes):\n                sargs.append(a)\n            elif isinstance(a, date):\n                sargs.append(a.isoformat())\n            else:\n                sargs.append(str(a))\n        return sargs\n\n    def process_result_value(self, value: Optional[str], dialect) -> Optional[Exception]:  # noqa: ARG002\n        if value is None:\n            return None\n        # sadly, can't do much to convert back from the strings? 
Unless I serialize the type info as well?\n        return Exception(*value)\n\n\n# fmt: off\nPRIMITIVES = {\n    str      : sqlalchemy.String,\n    int      : sqlalchemy.Integer,\n    float    : sqlalchemy.Float,\n    bool     : sqlalchemy.Boolean,\n    datetime : IsoDateTime,\n    date     : IsoDate,\n    dict     : sqlalchemy.JSON,\n    list     : sqlalchemy.JSON,\n    Exception: ExceptionAdapter,\n}\n# fmt: on\nassert set(PRIMITIVES.keys()) == PRIMITIVE_TYPES\n\n\ndef strip_optional(cls) -> tuple[type, bool]:\n    \"\"\"\n    >>> from typing import Optional, NamedTuple\n    >>> strip_optional(Optional[int])\n    (<class 'int'>, True)\n    >>> class X(NamedTuple):\n    ...     x: int\n    >>> strip_optional(X)\n    (<class 'cachew.legacy.X'>, False)\n    \"\"\"\n    is_opt: bool = False\n\n    args = get_union_args(cls)\n    if args is not None and len(args) == 1:\n        cls = args[0]  # meh\n        is_opt = True\n\n    return (cls, is_opt)\n\n\ndef strip_generic(tp):\n    \"\"\"\n    >>> from typing import List\n    >>> strip_generic(List[int])\n    <class 'list'>\n    >>> strip_generic(str)\n    <class 'str'>\n    \"\"\"\n    GA = getattr(typing, '_GenericAlias')  # ugh, can't make both mypy and pylint happy here?\n    if isinstance(tp, GA):\n        return tp.__origin__\n    return tp\n\n\nNT = TypeVar('NT')\n# sadly, bound=NamedTuple is not working yet in mypy\n# https://github.com/python/mypy/issues/685\n# also needs to support dataclasses?\n\n\n@dataclass\nclass NTBinder(Generic[NT]):\n    \"\"\"\n    >>> class Job(NamedTuple):\n    ...    company: str\n    ...    title: Optional[str]\n    >>> class Person(NamedTuple):\n    ...     name: str\n    ...     age: int\n    ...     job: Optional[Job]\n\n    NTBinder is a helper class for interacting with the sqlite database.\n    Hierarchy is flattened:\n    >>> binder = NTBinder.make(Person)\n    >>> [(c.name, type(c.type)) for c in binder.columns]\n    ... # doctest: +NORMALIZE_WHITESPACE\n    [('name',         <class 'sqlalchemy.sql.sqltypes.String'>),\n     ('age',          <class 'sqlalchemy.sql.sqltypes.Integer'>),\n     ('_job_is_null', <class 'sqlalchemy.sql.sqltypes.Boolean'>),\n     ('job_company',  <class 'sqlalchemy.sql.sqltypes.String'>),\n     ('job_title',    <class 'sqlalchemy.sql.sqltypes.String'>)]\n\n\n    >>> person = Person(name='alan', age=40, job=None)\n\n    to_row converts object to a sql-friendly tuple. 
job=None, so we end up with True in _job_is_null field\n    >>> tuple(binder.to_row(person))\n    ('alan', 40, True, None, None)\n\n    from_row does reverse conversion\n    >>> binder.from_row(('alan', 40, True, None, None))\n    Person(name='alan', age=40, job=None)\n\n    >>> binder.from_row(('ann', 25, True, None, None, 'extra'))\n    Traceback (most recent call last):\n    ...\n    cachew.common.CachewException: unconsumed items in iterator ['extra']\n    \"\"\"\n\n    name: Optional[str]  # None means toplevel\n    type_: Types\n    span: int  # not sure if span should include optional col?\n    primitive: bool\n    optional: bool\n    union: Optional[type]  # helper, which isn't None if type is Union\n    fields: Sequence[Any]  # mypy can't handle cyclic definition at this point :(\n\n    @staticmethod\n    def make(tp: type[NT], name: Optional[str] = None) -> 'NTBinder[NT]':\n        tp, optional = strip_optional(tp)  # ty: ignore[invalid-assignment]\n        union: Optional[type]\n        fields: tuple[Any, ...]\n        primitive: bool\n\n        union_args = get_union_args(tp)\n        if union_args is not None:\n            CachewUnion = NamedTuple('_CachewUnionRepr', [(x.__name__, Optional[x]) for x in union_args])  # type: ignore[misc]\n            union = CachewUnion\n            primitive = False\n            fields = (NTBinder.make(tp=CachewUnion, name='_cachew_union_repr'),)\n            span = 1\n        else:\n            union = None\n            tp = strip_generic(tp)\n            primitive = is_primitive(tp)\n\n            if primitive:\n                if name is None:\n                    name = '_cachew_primitive'  # meh. presumably, top level\n            if primitive:\n                fields = ()\n                span = 1\n            else:\n                annotations = typing.get_type_hints(tp)\n                if annotations == {}:\n                    raise CachewException(\n                        f\"{tp} (field '{name}'): doesn't look like a supported type to cache. See https://github.com/karlicoss/cachew#features for the list of supported types.\"\n                    )\n                fields = tuple(NTBinder.make(tp=ann, name=fname) for fname, ann in annotations.items())\n                span = sum(f.span for f in fields) + (1 if optional else 0)\n        return NTBinder(\n            name=name,\n            type_=tp,  # type: ignore[arg-type]\n            span=span,\n            primitive=primitive,\n            optional=optional,\n            union=union,\n            fields=fields,\n        )\n\n    @property\n    def columns(self) -> list[Column]:\n        return list(self.iter_columns())\n\n    # TODO not necessarily namedtuple? 
could be primitive type\n    def to_row(self, obj: NT) -> tuple[Optional[Values], ...]:\n        return tuple(self._to_row(obj))\n\n    def from_row(self, row: Iterable[Any]) -> NT:\n        riter = iter(row)\n        res = self._from_row(riter)\n        remaining = list(islice(riter, 0, 1))\n        if len(remaining) != 0:\n            raise CachewException(f'unconsumed items in iterator {remaining}')\n        assert res is not None  # nosec # help mypy; top level will not be None\n        return res\n\n    def _to_row(self, obj) -> Iterator[Optional[Values]]:\n        if self.primitive:\n            yield obj\n        elif self.union is not None:\n            CachewUnion = self.union\n            (uf,) = self.fields\n            # TODO assert only one of them matches??\n            union = CachewUnion(**{f.name: obj if isinstance(obj, f.type_) else None for f in uf.fields})\n            yield from uf._to_row(union)\n        else:\n            if self.optional:\n                is_none = obj is None\n                yield is_none\n            else:\n                is_none = False\n                assert obj is not None  # TODO hmm, that last assert is not very symmetric...\n\n            if is_none:\n                for _ in range(self.span - 1):\n                    yield None\n            else:\n                yield from chain.from_iterable(f._to_row(getattr(obj, f.name)) for f in self.fields)\n\n    def _from_row(self, row_iter):\n        if self.primitive:\n            return next(row_iter)\n        elif self.union is not None:\n            CachewUnion = self.union  # noqa: F841\n            (uf,) = self.fields\n            # TODO assert only one of them is not None?\n            union_params = [r for r in uf._from_row(row_iter) if r is not None]\n            assert len(union_params) == 1, union_params\n            return union_params[0]\n        else:\n            if self.optional:\n                is_none = next(row_iter)\n            else:\n                is_none = False\n\n            if is_none:\n                for _ in range(self.span - 1):\n                    x = next(row_iter)\n                    assert x is None, x  # huh. assert is kinda opposite of producing value\n                return None\n            else:\n                return self.type_(*(f._from_row(row_iter) for f in self.fields))\n\n    # TODO not sure if we want to allow optionals on top level?\n    def iter_columns(self) -> Iterator[Column]:\n        used_names: set[str] = set()\n\n        def col(name: str, tp) -> Column:\n            while name in used_names:\n                name = '_' + name\n            used_names.add(name)\n            return Column(name, tp)\n\n        if self.primitive:\n            if self.name is None:\n                raise AssertionError\n            yield col(self.name, PRIMITIVES[self.type_])\n        else:\n            prefix = '' if self.name is None else self.name + '_'\n            if self.optional:\n                yield col(f'_{prefix}is_null', sqlalchemy.Boolean)\n            for f in self.fields:\n                for c in f.iter_columns():\n                    yield col(f'{prefix}{c.name}', c.type)\n\n    def __str__(self):\n        lines = ['  ' * level + str(x.name) + ('?' 
if x.optional else '') + f' <span {x.span}>' for level, x in self.flatten()]\n        return '\\n'.join(lines)\n\n    def __repr__(self):\n        return str(self)\n\n    def flatten(self, level=0):\n        yield (level, self)\n        for f in self.fields:\n            yield from f.flatten(level=level + 1)\n\n\ndef test_mypy_annotations() -> None:\n    # mypy won't handle, so this has to be dynamic\n    vs = []\n    for t in Types.__args__:  # type: ignore[attr-defined]\n        (arg,) = t.__args__\n        vs.append(arg)\n\n    def types(ts):\n        return sorted(ts, key=lambda t: str(t))\n\n    assert types(vs) == types(Values.__args__)  # type: ignore[attr-defined]\n\n    for p in PRIMITIVE_TYPES:\n        assert p in Values.__args__  # type: ignore[attr-defined]\n\n\n@parametrize(\n    ('tp', 'val'),\n    [\n        (int, 22),\n        (bool, False),\n        (Optional[str], 'abacaba'),\n        (Union[str, int], 1),\n    ],\n)\ndef test_ntbinder_primitive(tp, val) -> None:\n    b = NTBinder.make(tp, name='x')\n    row = b.to_row(val)\n    vv = b.from_row(list(row))\n    assert vv == val\n\n\ndef test_unique_columns(tmp_path: Path) -> None:  # noqa: ARG001\n    class Job(NamedTuple):\n        company: str\n        title: Optional[str]\n\n    class Breaky(NamedTuple):\n        job_title: int\n        job: Optional[Job]\n\n    assert [c.name for c in NTBinder.make(Breaky).columns] == [\n        'job_title',\n        '_job_is_null',\n        'job_company',\n        '_job_title',\n    ]\n"
  },
  {
    "path": "src/cachew/logging_helper.py",
    "content": "from __future__ import annotations\n\nimport logging\nimport os\nimport warnings\nfrom functools import lru_cache\nfrom typing import TYPE_CHECKING\n\n\ndef test() -> None:\n    import sys\n    from collections.abc import Callable\n\n    M: Callable[[str], None] = lambda s: print(s, file=sys.stderr)\n\n    ## prepare exception for later\n    try:\n        None.whatever  # type: ignore[attr-defined]  # noqa: B018\n    except Exception as e:\n        ex = e\n    ##\n\n    M(\"   Logging module's defaults are not great:\")\n    l = logging.getLogger('default_logger')\n    l.error(\n        \"For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level\"\n    )\n\n    M(\"\\n   The reason is that you need to remember to call basicConfig() first. Let's do it now:\")\n    logging.basicConfig()\n    l.error(\n        \"OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number\"\n    )\n\n    M(\n        \"\\n   Also exception logging is kinda lame, doesn't print traceback by default unless you remember to pass exc_info:\"\n    )\n    l.exception(ex)  # type: ignore[possibly-undefined]\n\n    M(\n        \"\\n\\n    With make_logger you get a reasonable logging format, colours (via colorlog library) and other neat things:\"\n    )\n\n    ll = make_logger('test')  # No need for basicConfig!\n    ll.info(\"default level is INFO\")\n    ll.debug(\"... so this shouldn't be displayed\")\n    ll.warning(\"warnings are easy to spot!\")\n\n    M(\"\\n    Exceptions print traceback by default now:\")\n    ll.exception(ex)\n\n    M(\n        \"\\n    You can (and should) use it via regular logging.getLogger after that, e.g. let's set logging level to DEBUG now\"\n    )\n    logging.getLogger('test').setLevel(logging.DEBUG)\n    ll.debug(\"... now debug messages are also displayed\")\n\n\nDEFAULT_LEVEL = 'INFO'\nFORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)-4d]{end} %(message)s'\nFORMAT_NOCOLOR = FORMAT.format(start='', end='')\n\n\nLevel = int\nLevelIsh = Level | str | None\n\n\ndef mklevel(level: LevelIsh) -> Level:\n    if level is None:\n        return logging.NOTSET\n    if isinstance(level, int):\n        return level\n    return getattr(logging, level.upper())\n\n\ndef get_collapse_level() -> Level | None:\n    # TODO not sure if should be specific to logger name?\n    cl = os.environ.get('LOGGING_COLLAPSE', None)\n    if cl is not None:\n        return mklevel(cl)\n    # legacy name, maybe deprecate?\n    cl = os.environ.get('COLLAPSE_DEBUG_LOGS', None)\n    if cl is not None:\n        return logging.DEBUG\n    return None\n\n\ndef get_env_level(name: str) -> Level | None:\n    PREFIX = 'LOGGING_LEVEL_'  # e.g. 
LOGGING_LEVEL_my_hypothesis=debug\n    # shell doesn't allow using dots in var names without escaping, so also support underscore syntax\n    lvl = os.environ.get(PREFIX + name, None) or os.environ.get(PREFIX + name.replace('.', '_'), None)\n    if lvl is not None:\n        return mklevel(lvl)\n    return None\n\n\ndef setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> None:\n    \"\"\"\n    Wrapper to simplify logging setup.\n    \"\"\"\n    if isinstance(logger, str):\n        logger = logging.getLogger(logger)\n\n    if level is None:\n        level = DEFAULT_LEVEL\n\n    # env level always takes precedence\n    env_level = get_env_level(logger.name)\n    if env_level is not None:\n        lvl = env_level\n    else:\n        lvl = mklevel(level)\n\n    if logger.level == logging.NOTSET:\n        # only set the level if it hasn't been set already; if it has, the user requested a different logging level, let's respect that\n        logger.setLevel(lvl)\n\n    _setup_handlers_and_formatters(name=logger.name)\n\n\n# cached since this should only be done once per logger instance\n@lru_cache(None)\ndef _setup_handlers_and_formatters(name: str) -> None:\n    logger = logging.getLogger(name)\n\n    logger.addFilter(AddExceptionTraceback())\n\n    collapse_level = get_collapse_level()\n    ch = logging.StreamHandler() if collapse_level is None else CollapseLogsHandler(maxlevel=collapse_level)\n\n    # default level for handler is NOTSET, which will make it process all messages\n    # we rely on the logger to actually accept/reject log msgs\n    logger.addHandler(ch)\n\n    # this attribute is set to True by default, which causes log entries to be passed to root logger (e.g. if you call basicConfig beforehand)\n    # even if log entry is handled by this logger ... 
not sure what's the point of this behaviour??\n    logger.propagate = False\n\n    try:\n        # try colorlog first, so user gets nice colored logs\n        import colorlog\n    except ModuleNotFoundError:\n        warnings.warn(\"You might want to 'pip install colorlog' for nice colored logs\", stacklevel=2)\n        formatter = logging.Formatter(FORMAT_NOCOLOR)\n    else:\n        # log_color/reset are specific to colorlog\n        FORMAT_COLOR = FORMAT.format(start='%(log_color)s', end='%(reset)s')\n        # colorlog should detect tty in principle, but doesn't handle everything for some reason\n        # see https://github.com/borntyping/python-colorlog/issues/71\n        if ch.stream.isatty():\n            formatter = colorlog.ColoredFormatter(FORMAT_COLOR)\n        else:\n            formatter = logging.Formatter(FORMAT_NOCOLOR)\n\n    ch.setFormatter(formatter)\n\n\n# by default, logging.exception doesn't log the traceback unless called inside an exception handler\n# which is a bit annoying since we have to pass exc_info explicitly\n# also see https://stackoverflow.com/questions/75121925/why-doesnt-python-logging-exception-method-log-traceback-by-default\n# todo also amend my post about defensive error handling?\nclass AddExceptionTraceback(logging.Filter):\n    def filter(self, record: logging.LogRecord) -> bool:\n        if record.levelname == 'ERROR':\n            exc = record.msg\n            if isinstance(exc, BaseException):\n                if record.exc_info is None or record.exc_info == (None, None, None):\n                    exc_info = (type(exc), exc, exc.__traceback__)\n                    record.exc_info = exc_info\n        return True\n\n\n# todo also save full log in a file?\nclass CollapseLogsHandler(logging.StreamHandler):\n    '''\n    Collapses subsequent debug log lines and redraws on the same line.\n    Hopefully this gives both a sense of progress and doesn't clutter the terminal as much?\n    '''\n\n    last: bool = False\n\n    maxlevel: Level = logging.DEBUG  # everything with less or equal level will be collapsed\n\n    def __init__(self, *args, maxlevel: Level, **kwargs) -> None:\n        super().__init__(*args, **kwargs)\n        self.maxlevel = maxlevel\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            msg = self.format(record)\n            cur = record.levelno <= self.maxlevel and '\\n' not in msg\n            if cur:\n                if self.last:\n                    self.stream.write('\\033[K' + '\\r')  # clear line + carriage return\n            else:\n                if self.last:\n                    self.stream.write('\\n')  # clean up after the last line\n            self.last = cur\n            columns, _ = os.get_terminal_size(0)\n            # ugh. the columns thing is meh. dunno I guess ultimately need curses for that\n            # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keywords (INFO/DEBUG/etc)\n            self.stream.write(msg + ' ' * max(0, columns - len(msg)) + ('' if cur else '\\n'))\n            self.flush()\n        except Exception:\n            self.handleError(record)\n\n\ndef make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger:\n    logger = logging.getLogger(name)\n    setup_logger(logger, level=level)\n    return logger\n\n\n# ughh. hacky way to have a single enlighten instance per interpreter, so it can be shared between modules\n# not sure about this. 
I guess this should definitely be behind some flag\n# OK, when stdout is not a tty, enlighten doesn't log anything, good\ndef get_enlighten():\n    # TODO could add env variable to disable enlighten for a module?\n    from unittest.mock import (\n        Mock,  # Mock to return stub so clients don't have to think about it\n    )\n\n    # for now hidden behind the flag since it's a little experimental\n    if os.environ.get('ENLIGHTEN_ENABLE', None) is None:\n        return Mock()\n\n    try:\n        import enlighten  # type: ignore[import-untyped]\n    except ModuleNotFoundError:\n        warnings.warn(\"You might want to 'pip install enlighten' for a nice progress bar\", stacklevel=2)\n\n        return Mock()\n\n    # dirty, but otherwise a bit unclear how to share enlighten manager between packages that call each other\n    instance = getattr(enlighten, 'INSTANCE', None)\n    if instance is not None:\n        return instance\n    instance = enlighten.get_manager()\n    setattr(enlighten, 'INSTANCE', instance)\n    return instance\n\n\nif __name__ == '__main__':\n    test()\n\n\n## legacy/deprecated methods for backwards compatibility\nif not TYPE_CHECKING:\n    LazyLogger = make_logger\n    logger = make_logger\n##\n"
  },
  {
    "path": "src/cachew/marshall/cachew.py",
    "content": "from __future__ import annotations\n\nimport types\nfrom abc import abstractmethod\nfrom collections import abc\nfrom collections.abc import Sequence\nfrom dataclasses import dataclass, is_dataclass\nfrom datetime import UTC, date, datetime\nfrom numbers import Real\nfrom typing import (  # noqa: UP035\n    Any,\n    Dict,\n    List,\n    NamedTuple,\n    Optional,\n    Tuple,\n    Union,\n    get_args,\n    get_origin,\n    get_type_hints,\n)\nfrom zoneinfo import ZoneInfo\n\nfrom ..common import TypeNotSupported\nfrom ..utils import is_namedtuple, resolve_type_parameters\nfrom .common import AbstractMarshall, Json\n\n\nclass CachewMarshall[T](AbstractMarshall[T]):\n    def __init__(self, Type_: type[T]) -> None:\n        self.schema = build_schema(Type_)\n\n    def dump(self, obj: T) -> Json:\n        return self.schema.dump(obj)\n\n    def load(self, dct: Json) -> T:\n        return self.schema.load(dct)\n\n\n# NOTE: using slots gives a small speedup (maybe 5%?)\n# I suppose faster access to fields or something..\n\n\n@dataclass(slots=True)\nclass Schema:\n    type: Any\n\n    @abstractmethod\n    def dump(self, obj):\n        raise NotImplementedError\n\n    @abstractmethod\n    def load(self, dct):\n        raise NotImplementedError\n\n\n@dataclass(slots=True)\nclass SPrimitive(Schema):\n    def dump(self, obj):\n        # NOTE: returning here directly (instead of calling identity lambda) gives about 20% speedup\n        # I think custom types should have their own Schema subclass\n        return obj\n        # prim = primitives_to.get(self.type)\n        # assert prim is not None\n        # return prim(o)\n\n    def load(self, dct):\n        return dct\n        # prim = primitives_from.get(self.type)\n        # assert prim is not None\n        # return prim(d)\n\n\n@dataclass(slots=True)\nclass SDataclass(Schema):\n    # using list of tuples instead of dict gives about 5% speedup\n    fields: tuple[tuple[str, Schema], ...]\n\n    def dump(self, obj):\n        # TODO would be nice if we didn't create a dictionary here\n        # considering it is going to be serialized to json anyway\n        # maybe we need to yield json bits actually?\n        return {\n            # would be kinda nice if we didn't have to use getattr here\n            # but I think for dataclass this is actually the fastest way\n            # TODO for NamedTuples could just use them as tuples.. 
think about separating\n            k: ks.dump(getattr(obj, k))\n            for k, ks in self.fields\n        }\n\n    def load(self, dct):\n        # dict comprehension is meh, but not sure if there is a faster way?\n        return self.type(**{\n            k: ks.load(dct[k])\n            for k, ks in self.fields\n        })  # fmt: skip\n\n\n@dataclass(slots=True)\nclass SUnion(Schema):\n    # it's a bit faster to cache indices here, gives about 15% speedup\n    args: tuple[tuple[int, Schema], ...]\n\n    def dump(self, obj):\n        if obj is None:\n            # if it's a None, then doesn't really matter how to serialize and deserialize it\n            return (0, None)\n\n        # TODO could do a bit of magic here and remember the last index that worked?\n        # that way if some objects dominate the Union, the first isinstance would always work\n        for tidx, a in self.args:\n            if isinstance(obj, a.type):  # this takes quite a lot of time (sort of expected?)\n                # using lists instead of dicts gives a bit of a speedup (about 15%)\n                # so probably worth it even though a bit cryptic\n                # also could add a tag or something?\n                # NOTE: using tuple instead of list gives a tiiny speedup\n                jj = a.dump(obj)\n                return (tidx, jj)\n                # {\n                #     '__union_index__': tidx,\n                #     '__value__': jj,\n                # }\n        raise RuntimeError(f\"shouldn't happen: {self.args} {obj}\")\n\n    def load(self, dct):\n        # tidx = d['__union_index__']\n        # s = self.args[tidx]\n        # return s.load(d['__value__'])\n        tidx, val = dct\n        if val is None:\n            # counterpart for None handling in .dump method\n            return None\n\n        _, s = self.args[tidx]\n        return s.load(val)\n\n\n@dataclass(slots=True)\nclass SList(Schema):\n    arg: Schema\n\n    def dump(self, obj):\n        return tuple(self.arg.dump(i) for i in obj)\n\n    def load(self, dct):\n        return [self.arg.load(i) for i in dct]\n\n\n@dataclass(slots=True)\nclass STuple(Schema):\n    args: tuple[Schema, ...]\n\n    def dump(self, obj):\n        return tuple(a.dump(i) for a, i in zip(self.args, obj, strict=True))\n\n    def load(self, dct):\n        return tuple(a.load(i) for a, i in zip(self.args, dct, strict=True))\n\n\n@dataclass(slots=True)\nclass SSequence(Schema):\n    arg: Schema\n\n    def dump(self, obj):\n        return tuple(self.arg.dump(i) for i in obj)\n\n    def load(self, dct):\n        return tuple(self.arg.load(i) for i in dct)\n\n\n@dataclass(slots=True)\nclass SDict(Schema):\n    ft: SPrimitive\n    tt: Schema\n\n    def dump(self, obj):\n        return {\n            k: self.tt.dump(v)\n            for k, v in obj.items()\n        }  # fmt: skip\n\n    def load(self, dct):\n        return {\n            k: self.tt.load(v)\n            for k, v in dct.items()\n        }  # fmt: skip\n\n\n# TODO unify with primitives?\nJTypes = {int, str, type(None), float, bool}\n\n\ndef _exc_helper(args):\n    for a in args:\n        at = type(a)\n        if at in JTypes:\n            yield a\n        elif issubclass(at, date):\n            # TODO would be nice to restore datetime from cache too\n            # maybe generally save exception as a union? 
or intact and let orjson save it?\n            yield a.isoformat()\n        else:\n            yield str(a)  # not much we can do..\n\n\n@dataclass(slots=True)\nclass SException(Schema):\n    def dump(self, obj: Exception) -> Json:\n        return tuple(_exc_helper(obj.args))\n\n    def load(self, dct: Json):\n        return self.type(*dct)\n\n\ntry:\n    # defensive, so we don't have a hard dependency on pytz now that python >= 3.9 has zoneinfo\n    import pytz\nexcept ModuleNotFoundError:\n    # dummy, this is only needed for isinstance check below\n    class pytz_BaseTzInfo:\n        zone: str\n\n    def make_tz_pytz(zone: str):\n        raise RuntimeError(f\"Install pytz to deserialize {zone}\")\n\nelse:\n    pytz_BaseTzInfo = pytz.BaseTzInfo  # type: ignore[misc,assignment]\n\n    make_tz_pytz = pytz.timezone\n\n\n# just ints to avoid inflating db size\n# for now, we try to preserve the actual timezone object just in case, since the two have somewhat incompatible apis\n_TZTAG_ZONEINFO = 1\n_TZTAG_PYTZ = 2\n\n\n@dataclass(slots=True)\nclass SDatetime(Schema):\n    def dump(self, obj: datetime) -> Json:\n        iso = obj.isoformat()\n        tz = obj.tzinfo\n        if tz is None:\n            return (iso, None, None)\n\n        if isinstance(tz, ZoneInfo):\n            return (iso, tz.key, _TZTAG_ZONEINFO)\n        elif isinstance(tz, pytz_BaseTzInfo):\n            zone = tz.zone\n            # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6\n            assert zone is not None, (obj, tz)\n            return (iso, zone, _TZTAG_PYTZ)\n        else:\n            return (iso, None, None)\n\n    def load(self, dct: tuple):\n        iso, zone, zone_tag = dct\n        dt = datetime.fromisoformat(iso)\n        if zone is None:\n            return dt\n\n        make_tz = ZoneInfo if zone_tag == _TZTAG_ZONEINFO else make_tz_pytz\n        tz = make_tz(zone)\n        return dt.astimezone(tz)\n\n\n@dataclass(slots=True)\nclass SDate(Schema):\n    def dump(self, obj: date) -> Json:\n        return obj.isoformat()\n\n    def load(self, dct: str):\n        return date.fromisoformat(dct)\n\n\nPRIMITIVES = {\n    # int and float are handled a bit differently to allow implicit casts\n    # isinstance(.., Real) works both for int and for float\n    # Real can't be serialized back, but if you look in SPrimitive, it leaves the values intact anyway\n    # since the actual serialization of primitives is handled by orjson\n    int: Real,\n    float: Real,\n    str: str,\n    type(None): type(None),\n    bool: bool,\n    # if type is Any, there isn't much we can do to dump it -- just dump into json and hope for the best\n    # so in this sense it works exactly like primitives\n    Any: Any,\n}\n\n\ndef build_schema(Type) -> Schema:\n    # just to avoid confusion in case of weirdness with stringish type annotations\n    assert not isinstance(Type, str), Type\n\n    Type = resolve_type_parameters(Type)\n\n    ptype = PRIMITIVES.get(Type)\n    if ptype is not None:\n        return SPrimitive(type=ptype)\n\n    origin = get_origin(Type)\n    # origin is the 'unsubscripted/erased' version of the type\n    # if origin is NOT None, it's some sort of generic type\n\n    if origin is None:\n        if issubclass(Type, Exception):\n            return SException(type=Type)\n\n        if issubclass(Type, datetime):\n            return SDatetime(type=Type)\n\n        if issubclass(Type, date):\n            return SDate(type=Type)\n\n        if not (is_dataclass(Type) or 
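# anything non-generic that isn't a dataclass or namedtuple is unsupported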
                is_namedtuple(Type)):\n            raise TypeNotSupported(type_=Type, reason='unknown type')\n        try:\n            hints = get_type_hints(Type)\n        except TypeError as te:\n            # this can happen for instance on 3.9 if pipe syntax was used for Union types\n            # would be nice to provide a friendlier error though\n            raise TypeNotSupported(type_=Type, reason='failed to get type hints') from te\n        fields = tuple((k, build_schema(t)) for k, t in hints.items())\n        return SDataclass(\n            type=Type,\n            fields=fields,\n        )\n\n    args = get_args(Type)\n    is_union = origin is Union or origin is types.UnionType\n\n    if is_union:\n        # We're 'erasing' types here (generic types don't work with isinstance checks).\n        # So we need the argument types to be unique, otherwise we can't deserialise unambiguously.\n        schemas = [build_schema(a) for a in args]\n        union_types = [s.type for s in schemas if s.type is not Real]\n        if len(set(union_types)) != len(union_types):\n            raise TypeNotSupported(type_=Type, reason=f'runtime union arguments are not unique: {union_types}')\n        return SUnion(\n            type=origin,\n            args=tuple(\n                (tidx, s)\n                for tidx, s in enumerate(schemas)\n            ),\n        )  # fmt: skip\n\n    is_listish = origin is list\n    if is_listish:\n        (t,) = args\n        return SList(\n            type=origin,\n            arg=build_schema(t),\n        )\n\n    # hmm, checking against typing.Sequence doesn't pass for some reason\n    # perhaps because it's a deprecated alias?\n    is_tuplish = origin is tuple or origin is abc.Sequence\n    if is_tuplish:\n        if origin is tuple:\n            # this is for Tuple[()], which is the way to represent an empty tuple\n            # before python 3.11, get_args for that gives ((),) instead of an empty tuple () as one might expect\n            if args == ((),):\n                args = ()\n            return STuple(\n                type=origin,\n                args=tuple(build_schema(a) for a in args),\n            )\n        else:\n            (t,) = args\n            return SSequence(\n                type=origin,\n                arg=build_schema(t),\n            )\n\n    is_dictish = origin is dict\n    if is_dictish:\n        (ft, tt) = args\n        fts = build_schema(ft)\n        tts = build_schema(tt)\n        assert isinstance(fts, SPrimitive)\n        return SDict(\n            type=origin,\n            ft=fts,\n            tt=tts,\n        )\n\n    raise RuntimeError(f\"unsupported: {Type=} {origin=} {args=}\")\n\n\n######### tests\n\n\ndef _test_identity(obj, Type_, expected=None):\n    if expected is None:\n        expected = obj\n\n    m = CachewMarshall(Type_)\n\n    j = m.dump(obj)\n    obj2 = m.load(j)\n\n    # Exceptions don't support equality normally, so we need to do some hacks..\n    def normalise(x):\n        if isinstance(x, Exception):\n            return (type(x), x.args)\n        if type(x) is list:\n            return [(type(i), i.args) if isinstance(i, Exception) else i for i in x]\n        return x\n\n    # ugh that doesn't work\n    # def exc_eq(s, other):\n    #     return (type(s), s.args) == (type(other), other.args)\n    # Exception.__eq__ = exc_eq\n\n    assert normalise(expected) == normalise(obj2), (expected, obj2)\n    return (j, obj2)\n\n\n## this is used for the test below...\n# however if we define this inside the test function, it fails if from __future__ 
import annotations is present in the file..\ntype _IntType = int\ntype _StrIntType = str | int\n##\n\n\n# TODO customise with cattrs\ndef test_serialize_and_deserialize() -> None:\n    import pytest\n\n    helper = _test_identity\n\n    # primitives\n    helper(1, int)\n    helper('aaa', str)\n    helper(None, type(None))\n    # TODO emit other value as none type? not sure what should happen\n\n    # implicit casts, simple version\n    helper(None, int)\n    helper(None, str)\n    helper(1, float)\n\n    # implicit casts, inside other types\n    # technically not type safe, but might happen in practice\n    # doesn't matter how to deserialize None anyway so let's allow this\n    helper(None, str | int)\n    # old syntax\n    helper(None, Union[str, int])  # noqa: UP007\n\n    # even though 1 is not an instance of float, it often ends up as float in data\n    # see https://github.com/karlicoss/cachew/issues/54\n    helper(1, float | str)\n    helper(2, float | int)\n    helper(2.0, float | int)\n    helper((1, 2), tuple[int, float])\n\n    # optionals\n    helper('aaa', str | None)\n    helper(None, str | None)\n    # old syntax\n    helper('aaa', Optional[str])  # noqa: UP045\n    helper('aaa', Union[str, None])  # noqa: UP007\n    helper(None, Union[str, None])  # noqa: UP007\n\n    # lists/tuples/sequences\n    # TODO test with from __future__ import annotations..\n    helper([1, 2, 3], list[int])\n    helper([1, 2, 3], Optional[List[int]])  # noqa: UP006,UP045\n    helper([1, 2, 3], Sequence[int], expected=(1, 2, 3))\n    helper((1, 2, 3), Sequence[int])\n    helper((1, 2, 3), tuple[int, int, int])\n    # old syntax\n    helper([1, 2, 3], List[int])  # noqa: UP006\n    helper((1, 2, 3), Tuple[int, int, int])  # noqa: UP006\n    helper((1, 2, 3), Optional[tuple[int, int, int]])  # noqa: UP045\n\n    # dicts\n    helper({'a': 'aa', 'b': 'bb'}, dict[str, str])\n    helper({'a': None, 'b': 'bb'}, dict[str, str | None])\n    helper({'a': 'aa', 'b': 'bb'}, dict[str, str])\n    # old syntax\n    helper({'a': None, 'b': 'bb'}, Dict[str, Optional[str]])  # noqa: UP006,UP045\n\n    # unions\n    helper('aaa', str | int)\n    # old syntax\n    helper(1, Union[str, int])  # noqa: UP007\n\n    # compounds of simple types\n    helper(['1', 2, '3'], list[str | int])\n    # old syntax\n    helper(['1', 2, '3'], list[Union[str, int]])  # noqa: UP007\n\n    # TODO need to add test for equivalent dataclasses\n\n    @dataclass\n    class Point:\n        x: int\n        y: int\n\n    # dataclasses\n    helper(Point(x=1, y=2), Point)\n\n    # Namedtuple\n    class NT(NamedTuple):\n        first: str\n        last: str\n\n    helper(NT(first='aaa', last='bbb'), NT)\n\n    @dataclass\n    class WithJson:\n        id: int\n        raw_data: dict[str, Any]\n\n    ## type aliases including new 3.12 type aliases\n    # this works..\n    StrInt = str | int\n    helper('aaa', StrInt)\n\n    helper('aaa', _StrIntType)\n    helper([1, 2, 3], list[_IntType])\n\n    @dataclass\n    class TestTypeAlias:\n        x: _IntType\n        value: _StrIntType\n\n    helper(TestTypeAlias(x=1, value='aaa'), TestTypeAlias)\n    ##\n\n    # json-ish stuff\n    helper({}, dict[str, Any])\n    helper(WithJson(id=123, raw_data={'payload': 'whatever', 'tags': ['a', 'b', 'c']}), WithJson)\n    helper([], list[Any])\n\n    # exceptions\n    helper(RuntimeError('whatever!'), RuntimeError)\n    # fmt: off\n    helper([\n        RuntimeError('I', 'am', 'exception', 123),\n        Point(x=1, y=2),\n        Point(x=11, y=22),\n        
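# mixed union list: SUnion has to dispatch on the runtime type via isinstance here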
        RuntimeError('more stuff'),\n        RuntimeError(),\n    ], list[RuntimeError | Point])\n\n    exc_with_datetime     = Exception('I happened on', datetime.fromisoformat('2021-04-03T10:11:12'))\n    exc_with_datetime_exp = Exception('I happened on', '2021-04-03T10:11:12')\n    helper(exc_with_datetime, Exception, expected=exc_with_datetime_exp)\n    # fmt: on\n\n    # datetimes\n    import pytz\n\n    tz_london = pytz.timezone('Europe/London')\n    dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S')\n    dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S')\n    dwinter_tz = tz_london.localize(dwinter)\n    dsummer_tz = tz_london.localize(dsummer)\n\n    dates_tz = [\n        dwinter_tz,\n        dsummer_tz,\n    ]\n\n    tz_sydney = ZoneInfo('Australia/Sydney')\n    ## these will have the same local time (2025-04-06 02:01:00) in Sydney due to DST shift!\n    ## the second one will have fold=1 set to disambiguate\n    utc_before_shift = datetime.fromisoformat('2025-04-05T15:01:00+00:00')\n    utc_after__shift = datetime.fromisoformat('2025-04-05T16:01:00+00:00')\n    ##\n    sydney_before = utc_before_shift.astimezone(tz_sydney)\n    sydney__after = utc_after__shift.astimezone(tz_sydney)\n\n    dates_tz.extend([sydney_before, sydney__after])\n\n    dates = [\n        *dates_tz,\n        dwinter,\n        dsummer,\n        dsummer.replace(tzinfo=UTC),\n    ]\n    for d in dates:\n        _jj, dd = helper(d, datetime)\n        assert str(d) == str(dd)\n\n        # test that we preserve zone names\n        if d in dates_tz:\n            # this works both with pytz and zoneinfo, without having to access .zone or .key attributes\n            assert str(d.tzinfo) == str(dd.tzinfo)\n\n    assert helper(dsummer_tz, datetime)[0] == ('2020-08-03T01:02:03+01:00', 'Europe/London', _TZTAG_PYTZ)\n    assert helper(dwinter, datetime)[0] == ('2020-02-03T01:02:03', None, None)\n\n    assert helper(sydney_before, datetime)[0] == ('2025-04-06T02:01:00+11:00', 'Australia/Sydney', _TZTAG_ZONEINFO)\n    assert helper(sydney__after, datetime)[0] == ('2025-04-06T02:01:00+10:00', 'Australia/Sydney', _TZTAG_ZONEINFO)\n\n    assert helper(dwinter.date(), date)[0] == '2020-02-03'\n\n    # unsupported types\n    class NotSupported:\n        pass\n\n    with pytest.raises(RuntimeError, match=r\".*NotSupported.* isn't supported by cachew\"):\n        helper([NotSupported()], list[NotSupported])\n\n    # edge cases\n    helper((), tuple[()])\n\n    # unions of generic sequences and such\n    # these don't work because the erased type of both is just 'list'..\n    # so there is no way to tell which one we need to construct :(\n    with pytest.raises(TypeNotSupported, match=r\".*runtime union arguments are not unique\"):\n        helper([1, 2, 3], list[int] | list[Exception])\n    with pytest.raises(TypeNotSupported, match=r\".*runtime union arguments are not unique\"):\n        helper([1, 2, 3], list[Exception] | list[int])\n"
  },
  {
    "path": "src/cachew/marshall/common.py",
    "content": "from abc import abstractmethod\nfrom typing import Any\n\ntype Json = dict[str, Any] | tuple[Any, ...] | str | float | int | bool | None\n\n\nclass AbstractMarshall[T]:\n    @abstractmethod\n    def dump(self, obj: T) -> Json:\n        raise NotImplementedError\n\n    @abstractmethod\n    def load(self, dct: Json) -> T:\n        raise NotImplementedError\n"
  },
  {
    "path": "src/cachew/py.typed",
    "content": ""
  },
  {
    "path": "src/cachew/pytest.py",
    "content": "\"\"\"\nHelpers to prevent depending on pytest in runtime\n\"\"\"\n\nimport sys\nimport typing\n\nunder_pytest = 'pytest' in sys.modules\n\nif typing.TYPE_CHECKING or under_pytest:\n    import pytest\n\n    parametrize = pytest.mark.parametrize\nelse:\n\n    def parametrize(*_args, **_kwargs):\n        def wrapper(f):\n            return f\n\n        return wrapper\n"
  },
  {
    "path": "src/cachew/tests/marshall.py",
    "content": "# ruff: noqa: ARG001  # ruff thinks pytest fixtures are unused arguments\nimport shutil\nimport sqlite3\nimport sys\nfrom dataclasses import dataclass\nfrom datetime import UTC, datetime\nfrom pathlib import Path\nfrom typing import Any, Literal\n\nimport orjson\nimport pytest\n\nfrom ..marshall.cachew import CachewMarshall\nfrom ..marshall.common import Json\nfrom .utils import (\n    gc_control,  # noqa: F401\n    profile,\n    running_on_ci,\n    timer,\n)\n\nImpl = Literal[\n    'cachew',  # our custom deserialization\n    'cattrs',\n    'legacy',  # our legacy deserialization\n]\n# don't include legacy by default, it's only here just for the sake of comparing once before switch\nImpls: list[Impl] = ['cachew', 'cattrs']\n\n\ndef do_test(*, test_name: str, Type, factory, count: int, impl: Impl = 'cachew') -> None:\n    if count > 100 and running_on_ci:\n        pytest.skip(\"test too heavy for CI, only meant to run manually\")\n\n    to_json: Any\n    from_json: Any\n    if impl == 'cachew':\n        marshall = CachewMarshall(Type_=Type)\n        to_json = marshall.dump\n        from_json = marshall.load\n    elif impl == 'legacy':\n        from ..legacy import NTBinder\n\n        # NOTE: legacy binder emits a tuple which can be inserted directly into the database\n        # so 'json dump' and 'json load' should really be disregarded for this flavor\n        # if you're comparing with <other> implementation, you should compare\n        # legacy serializing as the sum of <other> serializing + <other> json dump\n        # that said, this way legacy will have a bit of an advantage since custom types (e.g. datetime)\n        # would normally be handled by sqlalchemy instead\n        binder = NTBinder.make(Type)\n        to_json = binder.to_row\n        from_json = binder.from_row\n    elif impl == 'cattrs':\n        from cattrs import Converter\n\n        converter = Converter()\n\n        from typing import get_args\n\n        # TODO use later\n        # from typing import Union, get_origin\n        # import types\n        # def is_union(type_) -> bool:\n        #     origin = get_origin(type_)\n        #     return origin is Union or origin is types.UnionType\n\n        def union_structure_hook_factory(_):\n            def union_hook(data, type_):\n                args = get_args(type_)\n\n                if data is None:  # we don't try to coerce None into anything\n                    return None\n\n                for t in args:\n                    try:\n                        res = converter.structure(data, t)\n                    except Exception:\n                        continue\n                    else:\n                        return res\n                raise ValueError(f\"Could not cast {data} to {type_}\")\n\n            return union_hook\n\n        # borrowed from https://github.com/python-attrs/cattrs/issues/423\n        # uhh, this doesn't really work straightaway...\n        # likely need to combine what cattr does with configure_tagged_union\n        # converter.register_structure_hook_factory(is_union, union_structure_hook_factory)\n        # configure_tagged_union(\n        #     union=Type,\n        #     converter=converter,\n        # )\n        # NOTE: this seems to give a bit of speedup... 
maybe raise an issue or something?\n        # fmt: off\n        unstruct_func = converter._unstructure_func.dispatch(Type)  # type: ignore[call-arg, misc]  # about 20% speedup\n        struct_func   = converter._structure_func  .dispatch(Type)  # type: ignore[call-arg, misc]  # TODO speedup\n        # fmt: on\n\n        to_json = unstruct_func\n        # todo would be nice to use partial? but how do we bind a positional arg?\n        from_json = lambda x: struct_func(x, Type)\n    else:\n        raise RuntimeError(impl)\n\n    print(file=sys.stderr)  # kinda annoying, pytest starts printing on the same line as test name\n\n    with profile(test_name + ':baseline'), timer(f'building      {count} objects of type {Type}'):\n        objects = list(factory(count=count))\n\n    jsons: list[Json] = [None for _ in range(count)]\n    with profile(test_name + ':serialize'), timer(f'serializing   {count} objects of type {Type}'):\n        for i in range(count):\n            jsons[i] = to_json(objects[i])  # ty: ignore[invalid-assignment]\n\n    strs: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]\n    with profile(test_name + ':json_dump'), timer(f'json dump     {count} objects of type {Type}'):\n        for i in range(count):\n            # TODO any orjson options to speed up?\n            strs[i] = orjson.dumps(jsons[i])\n\n    db = Path('/tmp/cachew_test/db.sqlite')\n    if db.parent.exists():\n        shutil.rmtree(db.parent)\n    db.parent.mkdir()\n\n    with profile(test_name + ':sqlite_dump'), timer(f'sqlite dump   {count} objects of type {Type}'):\n        with sqlite3.connect(db) as conn:\n            conn.execute('CREATE TABLE data (value BLOB)')\n            conn.executemany('INSERT INTO data (value) VALUES (?)', [(s,) for s in strs])\n        conn.close()\n\n    strs2: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]\n    with profile(test_name + ':sqlite_load'), timer(f'sqlite load   {count} objects of type {Type}'):\n        with sqlite3.connect(db) as conn:\n            i = 0\n            for (value,) in conn.execute('SELECT value FROM data'):\n                strs2[i] = value\n                i += 1\n        conn.close()\n\n    cache = db.parent / 'cache.jsonl'\n\n    with profile(test_name + ':jsonl_dump'), timer(f'jsonl dump    {count} objects of type {Type}'):\n        with cache.open('wb') as fw:\n            for s in strs:\n                fw.write(s + b'\\n')\n\n    strs3: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]\n    with profile(test_name + ':jsonl_load'), timer(f'jsonl load    {count} objects of type {Type}'):\n        i = 0\n        with cache.open('rb') as fr:\n            for l in fr:\n                l = l.rstrip(b'\\n')\n                strs3[i] = l\n                i += 1\n\n    assert strs2[:100] + strs2[-100:] == strs3[:100] + strs3[-100:]  # just in case\n\n    jsons2: list[Json] = [None for _ in range(count)]\n    with profile(test_name + ':json_load'), timer(f'json load     {count} objects of type {Type}'):\n        for i in range(count):\n            # TODO any orjson options to speed up?\n            jsons2[i] = orjson.loads(strs2[i])\n\n    objects2 = [None for _ in range(count)]\n    with profile(test_name + ':deserialize'), timer(f'deserializing {count} objects of type {Type}'):\n        for i in range(count):\n            objects2[i] = from_json(jsons2[i])  # ty: ignore[invalid-argument-type]\n\n    assert objects[:100] + objects[-100:] == objects2[:100] + objects2[-100:]\n\n\n@dataclass\nclass Name:\n 
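   # minimal two-field payload: the str | Name benchmark below alternates between plain strings and these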
    first: str\n    last: str\n\n\n@pytest.mark.parametrize('impl', Impls)\n@pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000])\n@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])\ndef test_union_str_dataclass(impl: Impl, count: int, gc_control, request) -> None:\n    # NOTE: previously was union_str_namedtuple, but adapted to work with cattrs for now\n    # perf difference between dataclass/namedtuple here seems negligible so old benchmark results should apply\n\n    if impl == 'cattrs':\n        pytest.skip('TODO need to adjust the handling of Union types..')\n\n    def factory(count: int):\n        objects: list[str | Name] = []\n        for i in range(count):\n            if i % 2 == 0:\n                objects.append(str(i))\n            else:\n                objects.append(Name(first=f'first {i}', last=f'last {i}'))\n        return objects\n\n    do_test(test_name=request.node.name, Type=str | Name, factory=factory, count=count, impl=impl)\n\n\n# OK, performance with calling this manually (not via pytest) is the same\n# do_test_union_str_dataclass(count=1_000_000, test_name='adhoc')\n\n\n@pytest.mark.parametrize('impl', Impls)\n@pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000])\n@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])\ndef test_datetimes(impl: Impl, count: int, gc_control, request) -> None:\n    if impl == 'cattrs':\n        pytest.skip('TODO support datetime with pytz for cattrs')\n\n    import pytz\n\n    def factory(*, count: int):\n        tzs = [\n            pytz.timezone('Europe/Berlin'),\n            UTC,\n            pytz.timezone('America/New_York'),\n        ]\n        start = datetime.fromisoformat('1990-01-01T00:00:00')\n        end = datetime.fromisoformat('2030-01-01T00:00:00')\n        step = (end - start) / count\n        for i in range(count):\n            dt = start + step * i\n            tz = tzs[i % len(tzs)]\n            yield dt.replace(tzinfo=tz)\n\n    do_test(test_name=request.node.name, Type=datetime, factory=factory, count=count, impl=impl)\n\n\n@pytest.mark.parametrize('impl', Impls)\n@pytest.mark.parametrize('count', [99, 1_000_000])\n@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])\ndef test_nested_dataclass(impl: Impl, count: int, gc_control, request) -> None:\n    # NOTE: was previously named test_many_from_cachew\n    @dataclass\n    class UUU:\n        xx: int\n        yy: int\n\n    @dataclass\n    class TE2:\n        value: int\n        uuu: UUU\n        value2: int\n\n    def factory(*, count: int):\n        for i in range(count):\n            yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i)\n\n    do_test(test_name=request.node.name, Type=TE2, factory=factory, count=count, impl=impl)\n\n\n# TODO next test should probs be runtimeerror?\n"
  },
  {
    "path": "src/cachew/tests/test_cachew.py",
    "content": "# ruff: noqa: ARG001  # ruff thinks pytest fixtures are unused arguments\nimport hashlib\nimport inspect\nimport platform\nimport string\nimport sys\nimport time\nimport timeit\nfrom collections.abc import Iterable, Iterator, Sequence\nfrom concurrent.futures import ProcessPoolExecutor\nfrom contextlib import nullcontext\nfrom dataclasses import asdict, dataclass\nfrom datetime import UTC, date, datetime\nfrom itertools import chain, islice\nfrom pathlib import Path\nfrom random import Random\nfrom subprocess import check_call, check_output, run\nfrom time import sleep\nfrom typing import (\n    Any,\n    NamedTuple,\n    cast,\n)\n\nimport patchy\nimport pytest\nfrom more_itertools import ilen, last, one, unique_everseen\n\nfrom .. import (\n    Backend,\n    CachewException,\n    cachew,\n    callable_name,\n    get_logger,\n    settings,\n)\nfrom .utils import (\n    gc_control,  # noqa: F401\n    running_on_ci,\n)\n\nlogger = get_logger()\n\n\n@pytest.fixture(autouse=True)\ndef set_default_cachew_dir(tmp_path: Path):\n    tpath = tmp_path / 'cachew_default'\n    settings.DEFAULT_CACHEW_DIR = tpath\n\n\n@pytest.fixture(autouse=True)\ndef throw_on_errors():\n    # NOTE: in tests we always throw on errors, it's a more reasonable default for testing.\n    # we still check defensive behaviour in test_defensive\n    settings.THROW_ON_ERROR = True\n    # TODO restore it?\n\n\n@pytest.fixture(autouse=True, params=['sqlite', 'file'])\ndef set_backend(restore_settings, request):\n    backend = request.param\n    settings.DEFAULT_BACKEND = backend\n    # TODO restore it??\n\n\n@pytest.fixture\ndef restore_settings():\n    orig = {k: v for k, v in settings.__dict__.items() if not k.startswith('__')}\n    try:\n        yield\n    finally:\n        for k, v in orig.items():\n            setattr(settings, k, v)\n\n\nclass UUU(NamedTuple):\n    xx: int\n    yy: int\n\n\ndef test_simple() -> None:\n    # just make sure all the high level cachew stuff is working\n    @cachew\n    def fun() -> Iterable[UUU]:\n        yield from []\n\n    list(fun())\n\n\ndef test_string_annotation_old() -> None:\n    \"\"\"\n    For some reason collections.abc.Iterable doesn't seem to work here on python <= 3.11\n     , it only sees 'UUU' as a string\n    Keeping this just as a demonstration, probably not worth trying to support as it's fairly esoteric combo.\n    \"\"\"\n    from typing import Iterable as typing_Iterable  # noqa: UP035\n\n    @cachew\n    def fun() -> typing_Iterable['UUU']:\n        yield from []\n\n    # should properly infer UUU type\n    list(fun())\n\n\ndef test_string_annotation_new() -> None:\n    @cachew\n    def fun() -> Iterable['UUU']:\n        yield from []\n\n    # should properly infer UUU type\n    list(fun())\n\n\ndef test_custom_hash(tmp_path: Path) -> None:\n    \"\"\"\n    Demo of using argument's modification time to determine if underlying data changed\n    \"\"\"\n    src = tmp_path / 'source'\n    src.write_text('0')\n\n    entities = [\n        UUU(xx=1, yy=1),\n        UUU(xx=2, yy=2),\n        UUU(xx=3, yy=3),\n    ]\n    calls = 0\n\n    def get_path_version(path: Path):\n        ns = path.stat().st_mtime_ns\n        # hmm, this might be unreliable, sometimes mtime doesn't change even after modifications?\n        # I suppose it takes some time for them to sync or something...\n        # so let's compute md5 or something in addition..\n        md5 = hashlib.md5(path.read_bytes()).digest()\n        return str((ns, md5))\n\n    @cachew(\n        
cache_path=tmp_path,\n        depends_on=get_path_version,  # when path is updated, underlying cache would be discarded\n    )\n    def data(path: Path) -> Iterable[UUU]:\n        nonlocal calls\n        calls += 1\n        count = int(path.read_text())\n        return entities[:count]\n\n    ldata = lambda: list(data(path=src))\n\n    assert len(ldata()) == 0\n    assert len(ldata()) == 0\n    assert len(ldata()) == 0\n    assert calls == 1\n\n    src.write_text('1')\n    assert ldata() == entities[:1]\n    assert ldata() == entities[:1]\n    assert calls == 2\n\n    src.write_text('3')\n    assert ldata() == entities\n    assert ldata() == entities\n    assert calls == 3\n\n\ndef test_caching(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def data() -> Iterator[UUU]:\n        time.sleep(1)\n        for i in range(5):\n            yield UUU(xx=i, yy=i)\n            time.sleep(1)\n\n    # https://stackoverflow.com/a/40385994/706389\n    template = \"\"\"\ndef inner(_it, _timer{init}):\n    {setup}\n    _t0 = _timer()\n    for _i in _it:\n        retval = {stmt}\n    _t1 = _timer()\n    return _t1 - _t0, retval\n\"\"\"\n    timeit.template = template  # type: ignore[attr-defined]\n\n    timer = timeit.Timer(lambda: len(list(data())))\n    t, cnt = cast(tuple[float, int], timer.timeit(number=1))\n    assert cnt == 5\n    assert t > 5.0, 'should take at least 5 seconds'\n\n    t, cnt = cast(tuple[float, int], timer.timeit(number=1))\n    assert cnt == 5\n    assert t < 2.0, 'should be pretty much instantaneous'\n\n\ndef test_error(tmp_path: Path) -> None:\n    '''\n    Test behaviour when the first time cache is initialized it ends up with an error\n    '''\n    cache_file = tmp_path / 'cache'\n    assert not cache_file.exists(), cache_file  # just precondition\n\n    should_raise = True\n\n    @cachew(cache_file, force_file=True)\n    def fun() -> Iterator[str]:\n        yield 'string1'\n        if should_raise:\n            raise RuntimeError('oops')\n        yield 'string2'\n\n    with pytest.raises(RuntimeError, match='oops'):\n        list(fun())\n\n    # vvv this would be nice but might be tricky because of the way sqlite works (i.e. 
wal mode creates a file)\n    # assert not cache_file.exists(), cache_file\n\n    # perhaps doesn't hurt either way as long as this vvv works properly\n    # shouldn't cache anything and crash again\n    with pytest.raises(RuntimeError, match='oops'):\n        list(fun())\n\n    should_raise = False\n    assert list(fun()) == ['string1', 'string2']\n\n\ndef test_cache_path(tmp_path: Path) -> None:\n    '''\n    Tests various ways of specifying cache path\n    '''\n    calls = 0\n\n    def orig() -> Iterable[int]:\n        nonlocal calls\n        yield 1\n        yield 2\n        calls += 1\n\n    fun = cachew(tmp_path / 'non_existent_dir' / 'cache_dir')(orig)\n    assert list(fun()) == [1, 2]\n    assert calls == 1\n    assert list(fun()) == [1, 2]\n    assert calls == 1\n\n    # dir by default\n    cdir = tmp_path / 'non_existent_dir' / 'cache_dir'\n    assert cdir.is_dir()\n    cfile = one(cdir.glob('*'))\n    assert cfile.name.startswith('cachew.tests.test_cachew:test_cache_path.')\n\n    # treat None as \"don't cache\"\n    fun = cachew(cache_path=None)(orig)\n    assert list(fun()) == [1, 2]\n    assert calls == 2\n    assert list(fun()) == [1, 2]\n    assert calls == 3\n\n    f = tmp_path / 'a_file'\n    f.touch()\n    fun = cachew(cache_path=f)(orig)\n    assert list(fun()) == [1, 2]\n    assert calls == 4\n    assert list(fun()) == [1, 2]\n    assert calls == 4\n\n    fun = cachew(tmp_path / 'name', force_file=True)(orig)\n    assert list(fun()) == [1, 2]\n    assert calls == 5\n    assert list(fun()) == [1, 2]\n    assert calls == 5\n\n    # if passed force_file, also treat as file\n    assert (tmp_path / 'name').is_file()\n\n    # treat None as \"don't cache\" ('factory')\n    # hmm not sure why mypy complains here.. might be better if we get to use ParamSpec?\n    fun = cachew(cache_path=lambda *args: None)(orig)  # type: ignore[arg-type]  # noqa: ARG005\n    assert list(fun()) == [1, 2]\n    assert calls == 6\n    assert list(fun()) == [1, 2]\n    assert calls == 7\n    # TODO this won't work at the moment\n    # f.write_text('garbage')\n    # not sure... on the one hand could just delete the garbage file and overwrite with db\n    # on the other hand, wouldn't want to delete some user file by accident\n\n\nclass UGood(NamedTuple):\n    x: int\n\n\nclass UBad:\n    pass\n\n\ndef test_unsupported_class(tmp_path: Path) -> None:\n    with pytest.raises(CachewException, match=r'.*failed to infer cache type.*'):\n\n        @cachew(cache_path=tmp_path)\n        def fun() -> list[UBad]:\n            return [UBad()]\n\n    with pytest.raises(CachewException, match=r\".*can't infer type from.*\"):\n\n        @cachew(cache_path=tmp_path)\n        def fun2() -> Iterable[UGood | UBad]:\n            yield UGood(x=1)\n            yield UBad()\n            yield UGood(x=2)\n\n\nclass TE2(NamedTuple):\n    value: int\n    uuu: UUU\n    value2: int\n\n\n# you can run one specific test (e.g. to profile) by passing it as -k to pytest\n# e.g. 
-k 'test_many[500000-False]'\n@pytest.mark.parametrize('count', [99, 500_000, 1_000_000])\n@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])\ndef test_many(count: int, tmp_path: Path, gc_control) -> None:\n    if count > 99 and running_on_ci:\n        pytest.skip(\"test would be too slow on CI, only meant to run manually\")\n    # should be a parametrized test perhaps\n    src = tmp_path / 'source'\n    src.touch()\n\n    cache_path = tmp_path / 'test_many'\n\n    @cachew(cache_path=cache_path, force_file=True)\n    def iter_data() -> Iterator[TE2]:\n        for i in range(count):\n            # TODO also profile datetimes?\n            yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i)\n\n    a = time.time()\n    assert ilen(iter_data()) == count  # initial\n    b = time.time()\n    print(f'test_many: initial write to cache took {b - a:.1f}s', file=sys.stderr)\n\n    print(f'test_many: cache size is {cache_path.stat().st_size / 10**6}Mb', file=sys.stderr)\n\n    a = time.time()\n    assert ilen(iter_data()) == count  # hitting cache\n    b = time.time()\n    print(f'test_many: reading from cache took {b - a:.1f}s', file=sys.stderr)\n\n    assert last(iter_data()) == TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1)\n\n    # serializing to db\n    # in-memory: 16 seconds\n\n    # without transaction: 22secs\n    # without transaction and size 100 chunks -- some crazy amount of time, as expected\n\n    # with transaction:\n    # about 17 secs to write 1M entries (just None)\n    # chunking by 20K doesn't seem to help\n    # chunking by 100 also gives same perf\n\n    # with to_row binding: 21 secs for dummy NamedTuple with None inside, 22 for less trivial class\n\n    # deserializing from db:\n    # initially, took 20 secs to load 1M entries (TE2)\n    # 9 secs currently\n    # 6 secs if we instantiate namedtuple directly via indices\n    # 3.5 secs if we just return None from row\n\n\nclass BB(NamedTuple):\n    xx: int\n    yy: int\n\n\nclass AA(NamedTuple):\n    value: int\n    b: BB | None\n    value2: int\n\n\ndef test_return_type_inference(tmp_path: Path) -> None:\n    \"\"\"\n    Tests that return type (BB) is inferred from the type annotation\n    \"\"\"\n\n    @cachew(tmp_path)\n    def data() -> Iterator[BB]:\n        yield BB(xx=1, yy=2)\n        yield BB(xx=3, yy=4)\n\n    assert len(list(data())) == 2\n    assert len(list(data())) == 2\n\n\ndef test_return_type_mismatch(tmp_path: Path) -> None:\n    # even though user got invalid type annotation here, they specified correct type, and it's the one that should be used\n    @cachew(tmp_path, cls=AA)\n    def data2() -> list[BB]:\n        return [  # ty: ignore[invalid-return-type]\n            AA(value=1, b=None, value2=123),  # type: ignore[list-item]\n        ]\n\n    # TODO hmm, this is kinda a downside that it always returns\n    # could preserve the original return type, but too much trouble for now\n\n    assert list(data2()) == [AA(value=1, b=None, value2=123)]  # type: ignore[comparison-overlap]\n\n\ndef test_return_type_none(tmp_path: Path) -> None:\n    with pytest.raises(CachewException):\n\n        @cachew(tmp_path)\n        def data():\n            return []\n\n\ndef test_callable_cache_path(tmp_path: Path) -> None:\n    \"\"\"\n    Cache path can be function dependent on wrapped function's arguments\n    \"\"\"\n    called: set[str] = set()\n\n    @cachew(cache_path=lambda kind: tmp_path / f'{kind}.cache')\n    def get_data(kind: str) -> Iterator[BB]:\n        assert 
kind not in called\n        called.add(kind)\n        if kind == 'first':\n            yield BB(xx=1, yy=1)\n        else:\n            yield BB(xx=2, yy=2)\n\n    # fmt: off\n    assert list(get_data('first'))  == [BB(xx=1, yy=1)]\n    assert list(get_data('second')) == [BB(xx=2, yy=2)]\n    assert list(get_data('first'))  == [BB(xx=1, yy=1)]\n    assert list(get_data('second')) == [BB(xx=2, yy=2)]\n    # fmt: on\n\n\ndef test_nested(tmp_path: Path) -> None:\n    d1 = AA(\n        value=1,\n        b=BB(xx=2, yy=3),\n        value2=4,\n    )\n    d2 = AA(\n        value=3,\n        b=None,\n        value2=5,\n    )\n\n    def data():\n        yield d1\n        yield d2\n\n    @cachew(cache_path=tmp_path, cls=AA)\n    def get_data():\n        yield from data()\n\n    assert list(get_data()) == [d1, d2]\n    assert list(get_data()) == [d1, d2]\n\n\nclass BBv2(NamedTuple):\n    xx: int\n    yy: int\n    zz: float\n\n\ndef test_schema_change(tmp_path: Path) -> None:\n    \"\"\"\n    Should discard cache on schema change (BB to BBv2) in this example\n    \"\"\"\n    b = BB(xx=2, yy=3)\n\n    @cachew(cache_path=tmp_path, cls=BB)\n    def get_data():\n        return [b]\n\n    assert list(get_data()) == [b]\n\n    # TODO make type part of key?\n    b2 = BBv2(xx=3, yy=4, zz=5.0)\n\n    @cachew(cache_path=tmp_path, cls=BBv2)\n    def get_data_v2():\n        return [b2]\n\n    assert list(get_data_v2()) == [b2]\n\n\ndef test_transaction(tmp_path: Path) -> None:\n    \"\"\"\n    Should keep old cache and not leave it in some broken state in case of errors\n    \"\"\"\n    # logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)\n\n    class TestError(Exception):\n        pass\n\n    @cachew(cache_path=tmp_path, cls=BB, chunk_by=1)\n    def get_data(version: int):\n        for i in range(3):\n            yield BB(xx=2, yy=i)\n            if version == 2:\n                raise TestError\n\n    exp = [BB(xx=2, yy=0), BB(xx=2, yy=1), BB(xx=2, yy=2)]\n    assert list(get_data(1)) == exp\n    assert list(get_data(1)) == exp\n\n    # TODO test that hash is unchanged?\n    with pytest.raises(TestError):\n        list(get_data(2))\n\n    assert list(get_data(1)) == exp\n\n\nclass Job(NamedTuple):\n    company: str\n    title: str | None\n\n\ndef test_optional(tmp_path: Path) -> None:\n    \"\"\"\n    Tests support for typing.Optional\n    \"\"\"\n\n    @cachew(tmp_path)\n    def data() -> Iterator[Job]:\n        # fmt: off\n        yield Job('google'      , title='engineed')\n        yield Job('selfemployed', title=None)\n        # fmt: on\n\n    list(data())  # trigger cachew\n    # fmt: off\n    assert list(data()) == [\n        Job('google'      , title='engineed'),\n        Job('selfemployed', title=None),\n    ]\n    # fmt: on\n\n\n# TODO add test for optional for misleading type annotation\n\n\nclass Person(NamedTuple):\n    name: str\n    secondname: str\n    age: int\n    job: Job | None\n\n\ndef make_people_data(count: int) -> Iterator[Person]:\n    g = Random(124)\n    chars = string.ascii_uppercase + string.ascii_lowercase\n\n    randstr = lambda len_: ''.join(g.choices(chars, k=len_))\n\n    for _ in range(count):\n        has_job = g.choice([True, False])\n        maybe_job: Job | None = None\n        if has_job:\n            maybe_job = Job(company=randstr(12), title=randstr(8))\n\n        yield Person(\n            name=randstr(5),\n            secondname=randstr(10),\n            age=g.randint(20, 60),\n            job=maybe_job,\n        )\n\n\ndef test_stats(tmp_path: Path) -> 
None:\n    cache_file = tmp_path / 'cache'\n\n    # 4 + things are string lengths\n    one = (4 + 5) + (4 + 10) + 4 + (4 + 12 + 4 + 8)\n    N = 10000\n\n    @cachew(cache_path=cache_file, cls=Person)\n    def get_people_data() -> Iterator[Person]:\n        yield from make_people_data(count=N)\n\n    list(get_people_data())\n    print(\n        f\"Cache db size for {N} entries: estimated size {one * N // 1024} Kb, actual size {cache_file.stat().st_size // 1024} Kb;\"\n    )\n\n\n@dataclass\nclass Test:\n    field: int\n\n\ndef test_dataclass(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def get_dataclasses() -> Iterator[Test]:\n        yield from [Test(field=i) for i in range(5)]\n\n    assert list(get_dataclasses()) == [Test(field=i) for i in range(5)]\n    assert list(get_dataclasses()) == [Test(field=i) for i in range(5)]\n\n\ndef test_inner_class(tmp_path: Path) -> None:\n    # NOTE: this doesn't work at the moment if from __future__ import annotations is used in client code (e.g. on top of this test)\n    # see test_future_annotations for more info\n\n    @dataclass\n    class InnerDataclass:\n        field: int\n\n    @cachew(tmp_path)\n    def fun() -> Iterator[InnerDataclass]:\n        yield from []\n\n    # should manage to infer type and not crash at least\n    list(fun())\n    list(fun())\n\n\n@dataclass\nclass Dates:\n    d1: datetime\n    d2: datetime\n    d3: datetime\n    d4: datetime\n    d5: datetime\n\n\ndef test_dates(tmp_path: Path) -> None:\n    from zoneinfo import ZoneInfo\n\n    tz = ZoneInfo('Europe/London')\n    dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S')\n    dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S')\n\n    x = Dates(\n        d1=dwinter.replace(tzinfo=tz),\n        d2=dsummer.replace(tzinfo=tz),\n        d3=dwinter,\n        d4=dsummer,\n        d5=dsummer.replace(tzinfo=UTC),\n    )\n\n    @cachew(tmp_path)\n    def fun() -> Iterable[Dates]:\n        yield x\n\n    assert one(fun()) == x\n    assert one(fun()) == x\n\n    # make sure the actual tzinfo is preserved... otherwise we might end up with raw offsets and lose some info\n    r = one(fun())\n    assert str(r.d1.tzinfo) == str(x.d1.tzinfo)\n    assert str(r.d2.tzinfo) == str(x.d2.tzinfo)\n    assert r.d3.tzname() is None\n    assert r.d4.tzname() is None\n    assert r.d5.tzinfo is UTC\n\n\n# fmt: off\n@dataclass\nclass AllTypes:\n    a_str   : str\n    an_int  : int\n    a_float : float\n    a_bool  : bool\n    a_dt    : datetime\n    a_date  : date\n    a_dict  : dict[str, Any]\n    a_list  : list[Any]\n    a_tuple : tuple[float, str]\n    an_exc  : Exception\n    an_opt  : str | None\n# fmt: on\n\n# TODO support vararg tuples?\n\n\ndef test_types(tmp_path: Path) -> None:\n    import pytz\n\n    tz = pytz.timezone('Europe/Berlin')\n    # fmt: off\n    obj = AllTypes(\n        a_str   = 'abac',\n        an_int  = 1123,\n        a_float = 3.131,\n        a_bool  = True,\n        a_dt    = datetime.now(tz=tz),\n        a_date  = datetime.now().replace(year=2000).date(),\n        a_dict  = {'a': True, 'x': {'whatever': 3.14}},\n        a_list  = ['aba', 123, None],\n        a_tuple = (1.23, '3.2.1'),\n        an_exc  = RuntimeError('error!', 123),\n        an_opt  = 'hello',\n    )\n    # fmt: on\n\n    @cachew(tmp_path)\n    def get() -> Iterator[AllTypes]:\n        yield obj\n\n    def helper(t: AllTypes):\n        # Exceptions can't be directly compared.. 
so this kinda helps\n        d = asdict(t)\n        d['an_exc'] = d['an_exc'].args\n        return d\n\n    assert helper(one(get())) == helper(obj)\n    assert helper(one(get())) == helper(obj)\n\n\n# TODO if I do perf tests, look at this https://docs.sqlalchemy.org/en/13/_modules/examples/performance/large_resultsets.html\n# TODO should be possible to iterate anonymous tuples too? or just sequences of primitive types?\n\n\ndef test_primitive(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def fun() -> Iterator[str]:\n        yield 'aba'\n        yield 'caba'\n\n    assert list(fun()) == ['aba', 'caba']\n    assert list(fun()) == ['aba', 'caba']\n\n\ndef test_single_value(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def fun_int() -> int:\n        return 123\n\n    assert fun_int() == 123\n    assert fun_int() == 123\n\n    @cachew(tmp_path, cls=('single', str))\n    def fun_str():\n        return 'whatever'\n\n    assert fun_str() == 'whatever'\n    assert fun_str() == 'whatever'\n\n    @cachew(tmp_path)\n    def fun_opt_namedtuple(none: bool) -> UUU | None:  # noqa: FBT001\n        if none:\n            return None\n        else:\n            return UUU(xx=1, yy=2)\n\n    assert fun_opt_namedtuple(none=False) == UUU(xx=1, yy=2)\n    assert fun_opt_namedtuple(none=False) == UUU(xx=1, yy=2)\n    assert fun_opt_namedtuple(none=True) is None\n    assert fun_opt_namedtuple(none=True) is None\n\n\nclass O(NamedTuple):\n    x: int\n\n\nclass _HackHash:\n    def __init__(self, x: int) -> None:\n        self.x = x\n\n    def __repr__(self):\n        return repr(self.x)\n\n\ndef test_default_arguments(tmp_path: Path) -> None:\n    hh = _HackHash(1)\n\n    calls = 0\n\n    def orig(a: int, param: _HackHash = hh) -> Iterator[O]:\n        yield O(hh.x)\n        nonlocal calls\n        calls += 1\n\n    def depends_on(a: int, param: _HackHash) -> str:\n        # hmm. in principle this should be str according to typing\n        # in practice though we always convert hash to str, so maybe type should be changed to Any?\n        return (a, param.x)  # type: ignore[return-value]\n\n    fun = cachew(tmp_path, depends_on=depends_on)(orig)\n\n    list(fun(123))\n    assert list(fun(123)) == [O(1)]\n    assert calls == 1\n\n    # now, change hash. That should cause the composite hash to invalidate and recompute\n    hh.x = 2\n    assert list(fun(123)) == [O(2)]\n    assert calls == 2\n\n    # should be ok with passing it explicitly\n    assert list(fun(123, param=_HackHash(2))) == [O(2)]\n    assert calls == 2\n\n    # we don't have to handle the default param in the default hash key\n    fun = cachew(tmp_path)(fun)\n    assert list(fun(456)) == [O(2)]\n    assert calls == 3\n    assert list(fun(456)) == [O(2)]\n    assert calls == 3\n\n    # changing the default should trigger the default (i.e. 
kwargs) key function to invalidate the cache\n    hh.x = 3\n    assert list(fun(456)) == [O(3)]\n    assert calls == 4\n\n    # you don't have to pass the default parameter explicitly\n    fun = cachew(tmp_path, depends_on=lambda a: a)(orig)\n    assert list(fun(456)) == [O(3)]\n    assert calls == 5\n\n    # but watch out if you forget to handle it!\n    hh.x = 4\n    assert list(fun(456)) == [O(3)]\n    assert calls == 5\n\n\nclass U(NamedTuple):\n    x: str | O\n\n\ndef test_union(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def fun() -> Iterator[U]:\n        yield U('hi')\n        yield U(O(123))\n\n    list(fun())\n    assert list(fun()) == [U('hi'), U(O(123))]\n\n\n# NOTE: empty dataclass doesn't have __annotations__ ??? not sure if need to handle it...\n@dataclass\nclass DD:\n    x: int\n\n\ndef test_union_with_dataclass(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def fun() -> Iterator[int | DD]:\n        yield 123\n        yield DD(456)\n\n    assert list(fun()) == [123, DD(456)]\n\n\n# ugh. we need to pass backend here explicitly since it might not get picked up from the fixture\n# that sets it in settings. due to multiprocess stuff\ndef _concurrent_helper(cache_path: Path, count: int, backend: Backend, sleep_s=0.1):\n    @cachew(cache_path, backend=backend)\n    def test(count: int) -> Iterator[int]:\n        for i in range(count):\n            print(f\"{count}: GENERATING {i}\")\n            sleep(sleep_s)\n            yield i * i\n\n    return list(test(count=count))\n\n\n@pytest.fixture\ndef fuzz_cachew_impl():\n    \"\"\"\n    Insert random sleeps in cachew_impl to increase likelihood of concurrency issues\n    \"\"\"\n    from .. import cachew_wrapper\n\n    patch = '''\\\n@@ -189,6 +189,11 @@\n             old_hash = backend.get_old_hash()\n             logger.debug(f'old hash: {old_hash}')\n\n+            from random import random\n+            rs = random() * 2\n+            print(\"sleeping for: \", rs)\n+            from time import sleep; sleep(rs)\n+\n             if new_hash == old_hash:\n                 logger.debug('hash matched: loading from cache')\n                 yield from cached_items()\n'''\n    patchy.patch(cachew_wrapper, patch)\n    yield\n    patchy.unpatch(cachew_wrapper, patch)\n\n\n# TODO fuzz when they start so they enter transaction at different times?\n# TODO how to run it enough times on CI and increase likelihood of failing?\n# for now, stress testing manually:\n# while PYTHONPATH=src pytest -s cachew -k concurrent_writes ; do sleep 0.5; done\n@pytest.mark.xfail(condition=platform.system() == 'Darwin', reason='seems like file writes might not be atomic on osx?')\ndef test_concurrent_writes(tmp_path: Path, fuzz_cachew_impl) -> None:\n    cache_path = tmp_path / 'cache.sqlite'\n\n    # warm up to create the database\n    # FIXME ok, that will be fixed separately with atomic move I suppose\n    _concurrent_helper(cache_path, 1, settings.DEFAULT_BACKEND)\n\n    processes = 5\n    with ProcessPoolExecutor() as pool:\n        futures = [\n            pool.submit(_concurrent_helper, cache_path, count, settings.DEFAULT_BACKEND) for count in range(processes)\n        ]\n\n        for count, f in enumerate(futures):\n            assert f.result() == [i * i for i in range(count)]\n\n\n# TODO ugh. 
need to keep two processes around to test for yield holding transaction lock\n\n\ndef test_concurrent_reads(tmp_path: Path, fuzz_cachew_impl):\n    cache_path = tmp_path / 'cache.sqlite'\n\n    count = 10\n    # warm up\n    _concurrent_helper(cache_path, count, settings.DEFAULT_BACKEND, sleep_s=0)\n\n    processes = 4\n\n    start = time.time()\n    with ProcessPoolExecutor() as pool:\n        futures = [\n            pool.submit(_concurrent_helper, cache_path, count, settings.DEFAULT_BACKEND, 1) for _ in range(processes)\n        ]\n\n        for f in futures:\n            print(f.result())\n    end = time.time()\n\n    taken = end - start\n    # should be pretty instantaneous\n    # if it takes more, most likely means that helper was called again\n    assert taken < 5\n\n\ndef test_mcachew(tmp_path: Path):\n    # TODO how to test for defensive behaviour?\n    from cachew.extra import mcachew\n\n    # TODO check throw on error\n    @mcachew(cache_path=tmp_path / 'cache')\n    def func() -> Iterator[str]:\n        yield 'one'\n        yield 'two'\n\n    assert list(func()) == ['one', 'two']\n    assert list(func()) == ['one', 'two']\n\n\ndef test_defensive(restore_settings) -> None:\n    '''\n    Make sure that cachew doesn't crash on misconfiguration\n    '''\n\n    def orig() -> Iterator[int]:\n        yield 123\n\n    def orig2():\n        yield \"x\"\n        yield 123\n\n    fun = cachew(bad_arg=123)(orig)  # type: ignore[call-overload]\n    assert list(fun()) == [123]\n    assert list(fun()) == [123]\n\n    for throw in [True, False]:\n        ctx = pytest.raises(Exception) if throw else nullcontext()\n        settings.THROW_ON_ERROR = throw\n\n        with ctx:\n            fun = cachew(cache_path=lambda: 1 + 'bad_path_provider')(orig)  # type: ignore[arg-type,misc,operator]\n            assert list(fun()) == [123]\n            assert list(fun()) == [123]\n\n            fun = cachew(cache_path=lambda p: '/tmp/' + str(p))(orig)\n            assert list(fun()) == [123]\n            assert list(fun()) == [123]\n\n            fun = cachew(orig2)\n            assert list(fun()) == ['x', 123]\n            assert list(fun()) == ['x', 123]\n\n            settings.DEFAULT_CACHEW_DIR = '/dev/nonexistent'\n            fun = cachew(orig)\n            assert list(fun()) == [123]\n            assert list(fun()) == [123]\n\n\n@pytest.mark.parametrize('throw', [False, True])\ndef test_bad_annotation(*, tmp_path: Path, throw: bool) -> None:\n    \"\"\"\n    this will work in runtime without cachew if from __future__ import annotations is used\n    so should work with cachew decorator as well\n    \"\"\"\n    src = tmp_path / 'src.py'\n    src.write_text(\n        f'''\nfrom __future__ import annotations\n\nfrom cachew import settings, cachew\nsettings.THROW_ON_ERROR = {throw}\n\n@cachew\ndef fun() -> BadType:\n    print(\"called!\")\n    return 0\n\nfun()\n'''.lstrip()\n    )\n\n    ctx = pytest.raises(Exception) if throw else nullcontext()\n    with ctx:\n        assert check_output([sys.executable, src], text=True).strip() == \"called!\"\n\n\ndef test_recursive_simple(tmp_path: Path) -> None:\n    d0 = 0\n    d1 = 1000\n    calls = 0\n\n    @cachew(tmp_path)\n    def factorials(n: int) -> Iterable[int]:\n        nonlocal calls, d0, d1\n        calls += 1\n\n        if n == 0:\n            d0 = len(inspect.stack(0))\n        if n == 1:\n            d1 = len(inspect.stack(0))\n\n        if n == 0:\n            yield 1\n            return\n        prev = factorials(n - 1)\n        last = 1\n        
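# note: each level replays the previous level's cached stream, so a fresh factorials(n) does O(n^2) item reads overall\n        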
# TODO potentially quadratic? measure perf perhaps?\n        for x in prev:\n            yield x\n            last = x\n        yield last * n\n\n    assert calls == 0\n    assert list(factorials(3)) == [1, 1, 2, 6]\n\n    # make sure the recursion isn't eating too much stack\n    # ideally would have 1? not sure if possible without some insane hacking?\n    # todo maybe check stack frame size as well?\n    assert abs(d0 - d1) <= 2\n\n    assert calls == 4\n    assert list(factorials(3)) == [1, 1, 2, 6]\n    assert calls == 4\n    assert list(factorials(5)) == [1, 1, 2, 6, 24, 120]\n    assert calls == 6\n    assert list(factorials(3)) == [1, 1, 2, 6]\n    assert calls == 10\n\n\ndef test_recursive_deep(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def numbers(n: int) -> Iterable[int]:\n        if n == 0:\n            yield 0\n            return\n        yield from numbers(n - 1)\n        yield n\n\n    @cachew(cache_path=None)\n    def numbers_cache_disabled(n: int) -> Iterable[int]:\n        if n == 0:\n            yield 0\n            return\n        yield from numbers(n - 1)\n        yield n\n\n    rlimit = sys.getrecursionlimit()\n\n    # NOTE in reality it has to do with the number of file descriptors (ulimit -Sn, e.g. 1024?)\n    # but it seems that during the error unrolling, pytest or something else actually hits the recursion limit somehow\n    # pytest ends up with an internal error in that case... which is good enough as far as the tests are concerned, I guess.\n    sys.setrecursionlimit(2 * 800 + 100)\n    try:\n        # at the moment each recursive call takes two frames (one for the original call, one for cachew_wrapper)\n        # + allow 100 calls for random constant overhead like pytest etc\n        list(numbers(800))\n        list(numbers(800))\n\n        list(numbers_cache_disabled(800))\n        list(numbers_cache_disabled(800))\n    finally:\n        sys.setrecursionlimit(rlimit)\n\n\ndef test_recursive_error(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def rec(n: int) -> Iterable[int]:\n        if n == 0:\n            yield 0\n            return\n        yield from rec(n - 1)\n        yield n\n\n    rlimit = sys.getrecursionlimit()\n    try:\n        sys.setrecursionlimit(50)\n        list(rec(100))\n        raise AssertionError('Expecting recursion error')\n    except RecursionError:\n        pass\n    finally:\n        sys.setrecursionlimit(rlimit)\n\n    # todo not sure if cache file should exist??\n    # either way, at least check that the db is not completely messed up\n    assert len(list(rec(100))) == 101\n\n\ndef test_exceptions(tmp_path: Path) -> None:\n    class X(NamedTuple):\n        a: int\n\n    d = datetime.strptime('20200102 03:04:05', '%Y%m%d %H:%M:%S')\n\n    @cachew(tmp_path)\n    def fun() -> Iterator[Exception]:\n        yield RuntimeError('whatever', 123, d, X(a=123))\n\n    list(fun())\n    [e] = fun()\n    # not sure if there is anything that can be done to preserve type information?\n    assert type(e) is Exception\n    assert e.args == ('whatever', 123, '2020-01-02T03:04:05', 'X(a=123)')\n\n\n# see https://beepb00p.xyz/mypy-error-handling.html#kiss\ndef test_result(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def fun() -> Iterator[Exception | int]:\n        yield 1\n        yield RuntimeError(\"sad!\")\n        yield 123\n\n    list(fun())\n    [v1, ve, v123] = fun()\n    assert v1 == 1\n    assert v123 == 123\n    assert isinstance(ve, Exception)\n    assert ve.args == ('sad!',)\n\n\ndef test_version_change(tmp_path: 
Path) -> None:\n    calls = 0\n\n    @cachew(tmp_path, logger=logger)\n    def fun() -> Iterator[str]:\n        nonlocal calls\n        calls += 1\n\n        yield from ['a', 'b', 'c']\n\n    list(fun())\n    list(fun())\n    assert calls == 1\n\n    # todo ugh. not sure how to do this as a relative import??\n    import cachew as cachew_module\n\n    old_version = cachew_module.CACHEW_VERSION\n\n    try:\n        cachew_module.CACHEW_VERSION = old_version + '_whatever'\n        # should invalidate cachew now\n        list(fun())\n        assert calls == 2\n        list(fun())\n        assert calls == 2\n    finally:\n        cachew_module.CACHEW_VERSION = old_version\n\n    # and now again, back to the old version\n    list(fun())\n    assert calls == 3\n    list(fun())\n    assert calls == 3\n\n\ndef dump_old_cache(tmp_path: Path) -> None:\n    # call this if you want to get an sql script for version upgrade tests..\n    oc = tmp_path / 'old_cache.sqlite'\n\n    @cachew(oc)\n    def fun() -> Iterator[int]:\n        yield from [1, 2, 3]\n\n    list(fun())\n    assert oc.exists(), oc\n\n    sql = check_output(['sqlite3', oc, '.dump']).decode('utf8')\n    print(sql, file=sys.stderr)\n\n\ndef test_old_cache_v0_6_3(tmp_path: Path) -> None:\n    if settings.DEFAULT_BACKEND != 'sqlite':\n        pytest.skip('this test only makes sense for sqlite backend')\n\n    sql = '''\nPRAGMA foreign_keys=OFF;\nBEGIN TRANSACTION;\nCREATE TABLE hash (\n\tvalue VARCHAR\n);\nINSERT INTO hash VALUES('cachew: 1, schema: {''_'': <class ''int''>}, hash: ()');\nCREATE TABLE IF NOT EXISTS \"table\" (\n\t_cachew_primitive INTEGER\n);\nINSERT INTO \"table\" VALUES(1);\nINSERT INTO \"table\" VALUES(2);\nINSERT INTO \"table\" VALUES(3);\nCOMMIT;\n    '''\n    db = tmp_path / 'cache.sqlite'\n    check_call(['sqlite3', db, sql])\n\n    @cachew(db)\n    def fun() -> Iterator[int]:\n        yield from [1, 2, 3]\n\n    # this tests that it doesn't crash\n    # for actual version upgrade test see test_version_change\n    assert list(fun()) == [1, 2, 3]\n\n\ndef test_disabled(tmp_path: Path) -> None:\n    calls = 0\n\n    @cachew(tmp_path)\n    def fun() -> Iterator[int]:\n        yield 1\n        yield 2\n        nonlocal calls\n        calls += 1\n\n    assert list(fun()) == [1, 2]\n    assert list(fun()) == [1, 2]\n    assert calls == 1\n\n    from cachew.extra import disabled_cachew\n\n    with disabled_cachew():\n        assert list(fun()) == [1, 2]\n        assert calls == 2\n        assert list(fun()) == [1, 2]\n        assert calls == 3\n\n\ndef test_early_exit_simple(tmp_path: Path) -> None:\n    # cachew works on iterators and we'd prefer not to cache if the iterator hasn't been exhausted\n    calls_f = 0\n\n    @cachew(tmp_path)\n    def f() -> Iterator[int]:\n        yield from range(20)\n        nonlocal calls_f\n        calls_f += 1\n\n    calls_g = 0\n\n    @cachew(tmp_path)\n    def g() -> Iterator[int]:\n        yield from f()\n        nonlocal calls_g\n        calls_g += 1\n\n    # only consume 10/20 items\n    assert len(list(islice(g(), 0, 10))) == 10\n    # precondition\n    assert calls_f == 0  # f hasn't been fully exhausted\n    assert calls_g == 0  # g hasn't been fully exhausted\n\n    # todo not sure if need to check that db is empty?\n    assert len(list(g())) == 20\n    assert calls_f == 1\n    assert calls_g == 1\n\n    # should be cached now\n    assert len(list(g())) == 20\n    assert calls_f == 1\n    assert calls_g == 1\n\n\n# see 
https://github.com/sqlalchemy/sqlalchemy/issues/5522#issuecomment-705156746\ndef test_early_exit_shutdown(tmp_path: Path) -> None:\n    # don't ask... otherwise the exception doesn't appear :shrug:\n    import_hack = '''\nfrom sqlalchemy import Column\n\nimport re\nre.hack = lambda: None\n    '''\n    Path(tmp_path / 'import_hack.py').write_text(import_hack)\n\n    prog = f'''\nimport sys\nsys.path.insert(0, '')\nimport import_hack\n\nimport cachew\ncachew.settings.THROW_ON_ERROR = True # todo check with both?\n@cachew.cachew('{tmp_path}', cls=int)\ndef fun():\n    yield 0\n\ng = fun()\ne = next(g)\n\nprint(\"FINISHED\")\n    '''\n    r = run([sys.executable, '-c', prog], cwd=tmp_path, capture_output=True, check=True)\n    assert r.stdout.strip() == b'FINISHED'\n    assert b'Traceback' not in r.stderr\n\n\n# tests both modes side by side to demonstrate the difference\n@pytest.mark.parametrize('use_synthetic', [False, True])\ndef test_synthetic_keyset(*, tmp_path: Path, use_synthetic: bool) -> None:\n    # just to keep track of which data we had to compute from scratch\n    _recomputed: list[str] = []\n\n    # assume key i is responsible for numbers i and i-1\n    # in reality this could be some slow function we'd like to avoid calling if its result is already cached\n    # e.g. the key would typically be a filename (e.g. isoformat timestamp)\n    # and the returned values could be the results of an export over the month prior to the timestamp, or something like that\n    # see https://beepb00p.xyz/exports.html#synthetic for more on the motivation\n    def compute(key: str) -> Iterator[str]:\n        _recomputed.append(key)\n        n = int(key)\n        yield str(n - 1)\n        yield str(n)\n\n    # fmt: off\n    # should result in 01 + 12 + 45                     == 01245\n    keys125         = ['1', '2', '5'                    ]\n    # should result in 01 + 12 + 45 + 56 + 67           == 0124567\n    keys12567       = ['1', '2', '5', '6', '7'          ]\n    # should result in 01 + 12 + 45 + 56      + 78 + 89 == 012456789\n    keys125689      = ['1', '2', '5', '6',      '8', '9']\n    # should result in           45 + 56      + 78 + 89 ==    456789\n    keys5689        = [          '5', '6',      '8', '9']\n    # fmt: on\n\n    def recomputed() -> list[str]:\n        r = list(_recomputed)\n        _recomputed.clear()\n        return r\n\n    ## 'cachew_cached' will just be [] if synthetic key is not used, so no impact on data\n    @cachew(tmp_path, synthetic_key=('keys' if use_synthetic else None))\n    def fun_aux(keys: Sequence[str], *, cachew_cached: Iterable[str] = []) -> Iterator[str]:\n        yield from unique_everseen(\n            chain(\n                cachew_cached,\n                *(compute(key) for key in keys),\n            )\n        )\n\n    def fun(keys: Sequence[str]) -> set[str]:\n        return set(fun_aux(keys=keys))\n\n    ##\n\n    # preserve formatting of string arguments, it makes the test easier to read\n    # fmt: off\n    assert fun(keys125) == set('01' '12' '45')\n    assert recomputed() == keys125\n    assert fun(keys125) == set('01' '12' '45')\n    assert recomputed() == []  # should be cached\n\n    assert fun(keys12567) == set('01' '12' '45' '56' '67')\n    if use_synthetic:\n        # 1, 2 and 5 should already be cached from the previous call\n        assert recomputed() == ['6', '7']\n    else:\n        # but without synthetic key this would cause everything to recompute\n        assert recomputed() == keys12567\n    assert fun(keys12567) == 
set('01' '12' '45' '56' '67')\n    assert recomputed() == []  # should be cached\n\n    assert fun(keys125689) == set('01' '12' '45' '56' '78' '89')\n    if use_synthetic:\n        # similarly, 1 2 5 6 7 are cached from the previous call\n        assert recomputed() == ['8', '9']\n    else:\n        # and we need to call against all keys otherwise\n        assert recomputed() == keys125689\n    assert fun(keys125689) == set('01' '12' '45' '56' '78' '89')\n    assert recomputed() == []  # should be cached\n\n    assert fun(keys5689) == set('45' '56' '78' '89')\n    # now the prefix has changed, so if we returned cached items it might return too much\n    # so have to recompute everything\n    assert recomputed() == keys5689\n    assert fun(keys5689) == set('45' '56' '78' '89')\n    assert recomputed() == []  # should be cached\n    # fmt: on\n\n    # TODO maybe call combined function? so it could return total result and last cached?\n    # TODO another option is:\n    # the function yields all cached stuff first\n    # then the user yields stuff from new\n    # and then external function does merging\n    # TODO test with kwargs hash?...\n    # TODO try without and with simultaneously?\n    # TODO check what happens when errors happen?\n    # FIXME check what happens if we switch between modes? (synthetic/non-synthetic)\n    # FIXME make sure this thing works if len(keys) > chunk size?\n    # TODO check what happens when we forget to set 'cachew_cached' argument\n    # TODO check what happens when keys are not str but e.g. Path\n\n\ndef test_db_path_matches_fun_name(tmp_path: Path) -> None:\n    @cachew(tmp_path)\n    def fun_single() -> int:\n        return 123\n\n    @cachew(tmp_path)\n    def fun_multiple() -> Iterable[int]:\n        return [123]\n\n    # write to cache\n    fun_single()\n    list(fun_multiple())\n\n    assert (tmp_path / callable_name(fun_single)).exists()\n    assert (tmp_path / callable_name(fun_multiple)).exists()\n\n\ndef test_type_alias_type_1(tmp_path: Path) -> None:\n    type Int = int\n\n    @cachew(tmp_path)\n    def fun() -> Iterator[Int]:\n        yield 123\n\n    assert list(fun()) == [123]\n    assert list(fun()) == [123]\n\n\ndef test_type_alias_type_2(tmp_path: Path) -> None:\n    type IteratorInt = Iterator[int]\n\n    @cachew(tmp_path)\n    def fun() -> IteratorInt:\n        yield 123\n\n    assert list(fun()) == [123]\n    assert list(fun()) == [123]\n\n\ndef test_type_alias_generic(tmp_path: Path) -> None:\n    type Res[T] = T | Exception\n    type IntRes = Res[int]\n\n    @cachew(tmp_path)\n    def fun() -> Iterator[IntRes]:\n        yield 123\n\n    assert list(fun()) == [123]\n    assert list(fun()) == [123]\n"
  },
  {
    "path": "src/cachew/tests/test_future_annotations.py",
    "content": "from __future__ import annotations\n\nimport os\nimport sys\nimport textwrap\nfrom collections.abc import Iterator\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom subprocess import check_output\nfrom typing import Any\n\nimport pytest\nfrom more_itertools import one\n\nfrom .. import cachew\n\ntype _Str = str  # deliberate, to test 3.12 'type ... = ...' type definitions\n\n\n# fmt: off\n@dataclass\nclass NewStyleTypes1:\n    a_str   : str\n    a_dict  : dict[str, Any]\n    a_list  : list[Any]\n    a_tuple : tuple[float, _Str]\n# fmt: on\n\n\ndef test_types1(tmp_path: Path) -> None:\n    obj = NewStyleTypes1(\n        a_str   = 'abac',\n        a_dict  = {'a': True, 'x': {'whatever': 3.14}},\n        a_list  = ['aba', 123, None],\n        a_tuple = (1.23, '3.2.1'),\n    )  # fmt: skip\n\n    @cachew(tmp_path)\n    def get() -> Iterator[NewStyleTypes1]:\n        yield obj\n\n    assert one(get()) == obj\n    assert one(get()) == obj\n\n\n# fmt: off\n@dataclass\nclass NewStyleTypes2:\n    an_opt  : str | None\n    a_union : _Str | int\n# fmt: on\n\n\ndef test_types2(tmp_path: Path) -> None:\n    obj = NewStyleTypes2(\n        an_opt  = 'hello',\n        a_union = 999,\n    )  # fmt: skip\n\n    @cachew(tmp_path)\n    def get() -> Iterator[NewStyleTypes2]:\n        yield obj\n\n    assert one(get()) == obj\n    assert one(get()) == obj\n\n\n@pytest.mark.parametrize('use_future_annotations', [False, True])\n@pytest.mark.parametrize('local', [False, True])\n@pytest.mark.parametrize('throw', [False, True])\ndef test_future_annotations(\n    *,\n    use_future_annotations: bool,\n    local: bool,\n    throw: bool,\n    tmp_path: Path,\n) -> None:\n    \"\"\"\n    Checks handling of postponed evaluation of annotations (from __future__ import annotations)\n    \"\"\"\n\n    # NOTE: to avoid weird interactions with existing interpreter in which pytest is running\n    #  , we compose a program and running in python directly instead\n    #  (also not sure if it's even possible to tweak postponed annotations without doing that)\n\n    if use_future_annotations and local and throw:\n        # when annotation is local (like inner class), then they end up as strings\n        #  so we can't eval it as we don't have access to a class defined inside function\n        #  keeping this test just to keep track of whether this is fixed at some point\n        #  possibly relevant:\n        #  - https://peps.python.org/pep-0563/#keeping-the-ability-to-use-function-local-state-when-defining-annotations\n        pytest.skip(\"local aliases/classses don't work with from __future__ import annotations\")\n\n    _PREAMBLE = f'''\nfrom pathlib import Path\nimport tempfile\n\nfrom cachew import cachew, settings\nsettings.THROW_ON_ERROR = {throw}\n\ntemp_dir = tempfile.TemporaryDirectory()\ntd = Path(temp_dir.name)\n\n'''\n\n    _TEST = '''\n\ntype Identity[T] = T\nI = int\ntype S = Identity[str]\n\n@cachew(td)\ndef fun() -> list[I | S]:\n    print(\"called\")\n    return [1, \"2\"]\n\nassert list(fun()) == [1, \"2\"]\nassert list(fun()) == [1, \"2\"]\n'''\n\n    if use_future_annotations:\n        code = '''\nfrom __future__ import annotations\n'''\n    else:\n        code = ''\n\n    code += _PREAMBLE\n\n    if local:\n        code += f'''\ndef test() -> None:\n{textwrap.indent(_TEST, prefix=\" \")}\n\ntest()\n'''\n    else:\n        code += _TEST\n\n    run_py = tmp_path / 'run.py'\n    run_py.write_text(code)\n\n    cache_dir = tmp_path / 'cache'\n    cache_dir.mkdir()\n\n    res = 
check_output(\n        [sys.executable, run_py],\n        env={'TMPDIR': str(cache_dir), **os.environ},\n        text=True,\n    )\n    called = int(res.count('called'))\n    if use_future_annotations and local and not throw:\n        # cachew fails to set up, so no caching but at least it works otherwise\n        assert called == 2\n    else:\n        assert called == 1\n"
  },
  {
    "path": "src/cachew/tests/test_resolve_type_parameters.py",
    "content": "from ..utils import resolve_type_parameters\n\n\ndef test_simple_generic_alias() -> None:\n    # if you define types ad-hoc, they resolve to GenericAlias, not TypeAliasType\n    assert resolve_type_parameters(int) == int  # noqa: E721\n    assert resolve_type_parameters(list[bool]) == list[bool]\n    assert resolve_type_parameters(dict[str, list[float]]) == dict[str, list[float]]\n\n\ndef test_simple_type_keyword() -> None:\n    type Int = int\n\n    assert resolve_type_parameters(Int) == int  # noqa: E721\n    assert resolve_type_parameters(list[Int]) == list[int]\n    assert resolve_type_parameters(dict[str, list[Int]]) == dict[str, list[int]]\n\n\ndef test_generic_collections() -> None:\n    type ListInt = list[int]\n    assert resolve_type_parameters(ListInt) == list[int]\n    assert resolve_type_parameters(dict[str, ListInt]) == dict[str, list[int]]\n\n    type TupleInt = tuple[int, bool]\n    assert resolve_type_parameters(TupleInt) == tuple[int, bool]\n    type TupleIntStr = tuple[TupleInt, str]\n    assert resolve_type_parameters(TupleIntStr) == tuple[tuple[int, bool], str]\n\n    type SetStr = set[str]\n    assert resolve_type_parameters(SetStr) == set[str]\n\n    type DictAlias[K, V] = dict[K, V]\n    assert resolve_type_parameters(DictAlias[str, int]) == dict[str, int]\n    assert resolve_type_parameters(DictAlias[int, list[str]]) == dict[int, list[str]]\n\n    type ComplexDict = dict[str, tuple[ListInt, SetStr]]\n    assert resolve_type_parameters(ComplexDict) == dict[str, tuple[list[int], set[str]]]\n\n\ndef test_generic_type_keyword() -> None:\n    type Id[T] = T\n    type IdInt = Id[int]\n\n    assert resolve_type_parameters(IdInt) == int  # noqa: E721\n    assert resolve_type_parameters(list[IdInt]) == list[int]\n\n    # check multiple uses of type params\n    type Pair[T] = tuple[T, T]\n    type PairInt = Pair[int]\n    assert resolve_type_parameters(PairInt) == tuple[int, int]\n    assert resolve_type_parameters(Pair[str]) == tuple[str, str]\n    assert resolve_type_parameters(list[Pair[int]]) == list[tuple[int, int]]\n\n    # check if type params aren't used\n    type NotUsing1[T, V] = int\n    type NotUsing2[V, W] = NotUsing1[bool, float]\n    type ListInt1 = list[NotUsing2[list, str]]\n    assert resolve_type_parameters(ListInt1) == list[int]\n\n    # Test generic alias with alias as parameter\n    type Container[T] = list[T]\n    type Int = int\n    assert resolve_type_parameters(Container[Int]) == list[int]\n\n\ndef test_chaining() -> None:\n    type Int = int\n    type Int2 = Int\n    type Int3 = Int2\n    assert resolve_type_parameters(Int3) == int  # noqa: E721\n\n    type ListInt3 = list[Int3]\n    assert resolve_type_parameters(ListInt3) == list[int]\n\n    type Box[T] = list[T]\n    type DoubleBox[T] = Box[Box[T]]\n    type DoubleBoxFloat = DoubleBox[float]\n    assert resolve_type_parameters(DoubleBoxFloat) == list[list[float]]\n\n\ndef test_optional_and_union() -> None:\n    type Int = int\n    type MaybeInt = int | None\n    assert resolve_type_parameters(MaybeInt) == (int | None)\n    assert resolve_type_parameters(list[MaybeInt]) == list[int | None]\n\n    type Str = str  # FIXME extract outside?\n\n    type StrOrInt = Str | Int\n    assert resolve_type_parameters(StrOrInt) == (str | int)\n\n    type UnionWithAlias = int | Str\n    assert resolve_type_parameters(UnionWithAlias) == (int | str)\n\n    # Test union in generic contexts\n    type OptionalList[T] = list[T] | None\n    assert resolve_type_parameters(OptionalList[int]) == 
(list[int] | None)\n    assert resolve_type_parameters(OptionalList[str]) == (list[str] | None)\n\n    # Test nested unions with aliases\n    type Bool = bool\n    type StrOrIntOrBool = StrOrInt | Bool\n    assert resolve_type_parameters(StrOrIntOrBool) == (int | str | bool)\n\n    # Test union with complex aliased types\n    type ListInt = list[int]\n    type DictStrInt = dict[str, int]\n    type ComplexUnion = ListInt | DictStrInt | None\n    assert resolve_type_parameters(ComplexUnion) == (list[int] | dict[str, int] | None)\n\n\ndef test_old_aliases() -> None:\n    \"\"\"\n    Old style typing.* aliases get 'normalised' by the typing.get_origin call.\n    This shouldn't really be a problem, so just highlighting it here.\n    \"\"\"\n    from typing import Dict, List, Optional  # noqa: UP035\n\n    type OptionalInt = Optional[int]  # noqa: UP045\n    assert resolve_type_parameters(OptionalInt) == int | None\n\n    type ListInt = List[int]  # noqa: UP006\n    assert resolve_type_parameters(ListInt) == list[int]\n\n    type DictIntStr = Dict[int, str]  # noqa: UP006\n    assert resolve_type_parameters(DictIntStr) == dict[int, str]\n\n\ndef test_old_union() -> None:\n    from typing import Union\n\n    type IntUnion[T] = Union[int, T, bool]  # noqa: UP007\n\n    assert resolve_type_parameters(IntUnion[str]) == (int | str | bool)\n\n\ndef test_typevar() -> None:\n    from typing import TypeVar\n\n    X = TypeVar('X')\n\n    ListX = list[X]\n    type ListInt = ListX[int]\n    assert resolve_type_parameters(ListInt) == list[int]\n\n    SetX = set[X]\n    SetFloat = SetX[float]\n    assert resolve_type_parameters(SetFloat) == set[float]\n\n\ndef test_misc() -> None:\n    \"\"\"\n    Miscellaneous more complex tests.\n    \"\"\"\n\n    # Test union inside list/dict\n    type MaybeStr = str | None\n    assert resolve_type_parameters(list[MaybeStr]) == list[str | None]\n    assert resolve_type_parameters(dict[str, MaybeStr]) == dict[str, str | None]\n\n    # Test union with nested generic aliases\n    type Container[T] = list[T]\n    type OptionalContainer[T] = Container[T] | None\n    assert resolve_type_parameters(OptionalContainer[int]) == (list[int] | None)\n\n    # Test union with multiple aliased generics\n    type ListAlias[T] = list[T]\n    type SetAlias[T] = set[T]\n    type CollectionUnion[T] = ListAlias[T] | SetAlias[T]\n    assert resolve_type_parameters(CollectionUnion[str]) == (list[str] | set[str])\n\n    # Test union in tuple\n    type IntOrStr = int | str\n    assert resolve_type_parameters(tuple[IntOrStr, bool]) == tuple[int | str, bool]\n\n    # Test deeply nested union with aliases\n    type Middle = list[IntOrStr]\n    type Outer = Middle | None\n    assert resolve_type_parameters(Outer) == (list[int | str] | None)\n\n    # Test union with chained aliases\n    type Level1 = int\n    type Level2 = Level1\n    type Level3 = Level2\n    type UnionChained = Level3 | str | None\n    assert resolve_type_parameters(UnionChained) == (int | str | None)\n\n    # Test union with generic that resolves to union\n    type MaybeList[T] = list[T] | None\n    type NestedMaybe = MaybeList[int | str]\n    assert resolve_type_parameters(NestedMaybe) == (list[int | str] | None)\n\n    # Test union with aliased union\n    type NumberOrStr = int | float | str\n    type ExtendedUnion = NumberOrStr | bool\n    assert resolve_type_parameters(ExtendedUnion) == (int | float | str | bool)\n\n    # Test union in dict values and keys\n    type FlexibleKey = str | int\n    type FlexibleValue = list[int] | 
dict[str, str] | None\n    assert (\n        resolve_type_parameters(dict[FlexibleKey, FlexibleValue]) == dict[str | int, list[int] | dict[str, str] | None]\n    )\n\n    # Test union with same type repeated (Python may or may not normalize this)\n    type RepeatUnion = int | int | str  # noqa: PYI016\n    # Python's union implementation may deduplicate, so we accept both\n    assert resolve_type_parameters(RepeatUnion) == (int | str) or resolve_type_parameters(RepeatUnion) == (int | int | str)  # fmt: skip\n\n    # Test union with TypeAliasType in multiple positions\n    type AliasA = list[int]\n    type AliasB = dict[str, int]\n    type AliasC = set[str]\n    type MultiAliasUnion = AliasA | AliasB | AliasC\n    assert resolve_type_parameters(MultiAliasUnion) == (list[int] | dict[str, int] | set[str])\n\n    # Test generic union with substitution\n    type Result[T, E] = T | E\n    assert resolve_type_parameters(Result[int, str]) == (int | str)\n    assert resolve_type_parameters(Result[list[int], dict[str, str]]) == (list[int] | dict[str, str])\n\n    # Test union with None (Optional pattern) in various positions\n    type OptionalInt = int | None\n    type ListOfOptional = list[OptionalInt]\n    assert resolve_type_parameters(ListOfOptional) == list[int | None]\n\n    # Test union with multiple levels of aliased unions\n    type UnionA = int | str\n    type UnionB = bool | float\n    type CombinedUnion = UnionA | UnionB\n    assert resolve_type_parameters(CombinedUnion) == (int | str | bool | float)\n\n    # Test union as generic parameter with nested aliases\n    type NestedAlias = list[int]\n    type UnionParam[T] = dict[str, T | None]\n    assert resolve_type_parameters(UnionParam[NestedAlias]) == dict[str, list[int] | None]\n\n    # Test complex scenario: generic alias that returns a union, used in another union\n    type ComplexUnion[T] = MaybeList[T] | dict[str, T]\n    assert resolve_type_parameters(ComplexUnion[int]) == (list[int] | None | dict[str, int])\n\n    # Test union in tuple with multiple aliased elements\n    type AliasInt = int\n    type AliasStr = str\n    type TupleWithUnions = tuple[AliasInt | None, list[AliasStr | bool]]\n    assert resolve_type_parameters(TupleWithUnions) == tuple[int | None, list[str | bool]]\n\n    # Test three-way union with all aliased types\n    type TypeA = list[int]\n    type TypeB = dict[str, str]\n    type TypeC = set[bool]\n    type ThreeWayUnion = TypeA | TypeB | TypeC\n    assert resolve_type_parameters(ThreeWayUnion) == (list[int] | dict[str, str] | set[bool])\n\n    # Test union where members themselves contain unions\n    type InnerUnion1 = int | str\n    type InnerUnion2 = bool | float\n    type OuterUnion = list[InnerUnion1] | dict[str, InnerUnion2]\n    assert resolve_type_parameters(OuterUnion) == (list[int | str] | dict[str, bool | float])\n\n    # Test generic union with nested type aliases in parameters\n    type Box[T] = list[T]\n    type OptionBox[T] = Box[T] | None\n    assert resolve_type_parameters(OptionBox[int | str]) == (list[int | str] | None)\n\n    # Test union with mix of generic and non-generic aliases\n    type SimpleAlias = int\n    type GenericAlias[T] = list[T]\n    type MixedUnion[T] = SimpleAlias | GenericAlias[T]\n    assert resolve_type_parameters(MixedUnion[str]) == (int | list[str])\n\n    # Test generic alias that returns the parameter unchanged\n    type Same[T] = T\n    assert resolve_type_parameters(Same[int]) == int  # noqa: E721\n    assert resolve_type_parameters(Same[list[str]]) == list[str]\n  
  assert resolve_type_parameters(Same[Same[int]]) == int  # noqa: E721\n\n    # Test deeply nested generics\n    type Deep = dict[str, list[tuple[int, set[str]]]]\n    assert resolve_type_parameters(Deep) == dict[str, list[tuple[int, set[str]]]]\n\n    # Test union in complex nested structure\n    type Data[T] = dict[str, list[T] | None]\n    assert resolve_type_parameters(Data[int | str]) == dict[str, list[int | str] | None]\n\n    # Test alias in tuple with mixed types\n    type Mixed = tuple[int, list[str], dict[str, int]]\n    assert resolve_type_parameters(Mixed) == tuple[int, list[str], dict[str, int]]\n"
  },
  {
    "path": "src/cachew/tests/utils.py",
    "content": "import gc\nimport os\nimport sys\nfrom contextlib import contextmanager\nfrom pathlib import Path\n\nimport pytest\n\nPROFILES = Path(__file__).absolute().parent / 'profiles'\n\n\n@contextmanager\ndef profile(name: str):\n    # ugh. seems like pyinstrument slows down code quite a bit?\n    if os.environ.get('PYINSTRUMENT') is None:\n        yield\n        return\n\n    from pyinstrument import Profiler\n\n    with Profiler() as profiler:\n        yield\n\n    PROFILES.mkdir(exist_ok=True)\n    results_file = PROFILES / f\"{name}.html\"\n\n    print(\"results for \" + name, file=sys.stderr)\n    profiler.print()\n\n    results_file.write_text(profiler.output_html())\n\n\ndef timer(name: str):\n    from codetiming import Timer\n\n    return Timer(name=name, text=name + ': ' + '{:.2f}s')\n\n\n@pytest.fixture\ndef gc_control(*, gc_on: bool):\n    if gc_on:\n        # no need to do anything, should be on by default\n        yield\n        return\n\n    gc.disable()\n    try:\n        yield\n    finally:\n        gc.enable()\n\n\nrunning_on_ci = 'CI' in os.environ\n"
  },
  {
    "path": "src/cachew/utils.py",
    "content": "from collections.abc import Mapping\nfrom types import UnionType\nfrom typing import TypeAliasType, TypeVar, get_args, get_origin\n\n\n# https://stackoverflow.com/a/2166841/706389\ndef is_namedtuple(t) -> bool:\n    b = getattr(t, '__bases__', None)\n    if b is None:\n        return False\n    if len(b) != 1 or b[0] is not tuple:\n        return False\n    f = getattr(t, '_fields', None)\n    if not isinstance(f, tuple):\n        return False\n    return all(type(n) == str for n in f)  # noqa: E721\n\n\ndef resolve_type_parameters(t) -> type:\n    return _resolve_type_parameters_aux(t, typevar_to_type={})\n\n\ndef _resolve_type_parameters_aux(t, *, typevar_to_type: Mapping[TypeVar, type]) -> type:\n    if isinstance(t, TypeVar):\n        return typevar_to_type[t]\n\n    # This is the 'left hand side' case, i.e. in type ... =\n    if isinstance(t, TypeAliasType):\n        return _resolve_type_parameters_aux(t.__value__, typevar_to_type=typevar_to_type)\n\n    # note: args is never none\n    raw_args = get_args(t)\n    resolved_args = tuple(_resolve_type_parameters_aux(arg, typevar_to_type=typevar_to_type) for arg in raw_args)\n\n    # UnionType: resolve each member of the union\n    if isinstance(t, UnionType):\n        # Reconstruct the union with resolved args\n        result = resolved_args[0]\n        for arg in resolved_args[1:]:\n            result = result | arg  # type: ignore[assignment]\n        return result\n\n    origin = get_origin(t)\n\n    # Must be a non-generic type\n    if origin is None:\n        return t\n\n    # This is the 'right hand side', e.g. '... = Id[int]' matches this\n    if isinstance(origin, TypeAliasType):\n        type_params = origin.__type_params__\n        new_typevar_to_type: Mapping[TypeVar, type] = {\n            **typevar_to_type,\n            **dict(zip(type_params, resolved_args, strict=True)),  # type: ignore[arg-type]\n        }\n        return _resolve_type_parameters_aux(origin.__value__, typevar_to_type=new_typevar_to_type)\n\n    # Just a regular generic type\n    return origin[resolved_args]\n"
  },
  {
    "path": "tox.ini",
    "content": "[tox]\nminversion = 4\n\n# relies on the correct version of Python installed\n# (we rely on CI for the test matrix)\nenvlist = ruff,tests,mypy,ty\n\n# https://github.com/tox-dev/tox/issues/20#issuecomment-247788333\n# hack to prevent .tox from crapping to the project directory\ntoxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox\n\n[testenv]\n# TODO how to get package name from setuptools?\npackage_name = \"cachew\"\npass_env =\n# useful for tests to know they are running under ci\n    CI\n    CI_*\n# respect user's cache dirs to prevent tox from crapping into project dir\n    PYTHONPYCACHEPREFIX\n    MYPY_CACHE_DIR\n    RUFF_CACHE_DIR\n\nset_env =\n# do not add current working directory to pythonpath\n# generally this is more robust and safer, prevents weird issues later on\n    PYTHONSAFEPATH=1\n\nrunner = uv-venv-lock-runner\nuv_sync_locked = false\n\n\n[testenv:ruff]\nskip_install = true\ndependency_groups = testing\ncommands =\n    {envpython} -m ruff check \\\n        {posargs}\n\n\n[testenv:tests]\ndependency_groups = testing\ncommands =\n    # posargs allow test filtering, e.g. tox ... -- -k test_name\n    {envpython} -m pytest \\\n        --pyargs {[testenv]package_name} \\\n        {posargs}\n\n\n[testenv:mypy]\ndependency_groups = typecheck\ncommands =\n    {envpython} -m mypy --no-install-types \\\n        -p {[testenv]package_name}       \\\n        --txt-report           .coverage.mypy \\\n        --html-report          .coverage.mypy \\\n        # this is for github actions to upload to codecov.io\n        # sadly xml coverage crashes on windows... so we need to disable it\n        {env:CI_MYPY_COVERAGE} \\\n        {posargs}\n\n\n[testenv:ty]\ndependency_groups = typecheck\ncommands =\n    {envpython} -m ty \\\n        check \\\n        {posargs}\n"
  },
  {
    "path": "ty.toml",
    "content": "[src]\nexclude = [\n    \"doc/test_serialization.py\",\n]\n"
  }
]