Repository: karlicoss/cachew Branch: master Commit: 7e785aac758f Files: 46 Total size: 251.7 KB Directory structure: gitextract_k61syhvn/ ├── .ci/ │ ├── release │ └── run ├── .gitattributes ├── .github/ │ └── workflows/ │ └── main.yml ├── .gitignore ├── .idea/ │ └── dictionaries/ │ └── karlicos.xml ├── LICENSE.txt ├── README.ipynb ├── README.md ├── benchmarks/ │ ├── 20230912-comparison-with-legacy.org │ ├── 20230912.org │ └── 20230917.org ├── doc/ │ ├── cachew_disable.md │ ├── serialization.org │ └── test_serialization.py ├── generate-readme ├── github-issues.org ├── misc/ │ ├── profile.py │ └── test_redis/ │ ├── docker-compose.yml │ └── test.py ├── mypy.ini ├── pyproject.toml ├── pytest.ini ├── ruff.toml ├── src/ │ └── cachew/ │ ├── __init__.py │ ├── backend/ │ │ ├── common.py │ │ ├── file.py │ │ └── sqlite.py │ ├── common.py │ ├── compat.py │ ├── experimental.py │ ├── extra.py │ ├── legacy.py │ ├── logging_helper.py │ ├── marshall/ │ │ ├── cachew.py │ │ └── common.py │ ├── py.typed │ ├── pytest.py │ ├── tests/ │ │ ├── marshall.py │ │ ├── test_cachew.py │ │ ├── test_future_annotations.py │ │ ├── test_resolve_type_parameters.py │ │ └── utils.py │ └── utils.py ├── tox.ini └── ty.toml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .ci/release ================================================ #!/usr/bin/env python3 ''' Deploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]]. - running manually You'll need =UV_PUBLISH_TOKEN= env variable - running on Github Actions Instead of env variable, relies on configuring github as Trusted publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi It's running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]]. 
Packages are deployed on: - every master commit, onto test pypi - every new tag, onto production pypi ''' UV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN' import argparse import os from pathlib import Path from subprocess import check_call is_ci = os.environ.get('CI') is not None def main() -> None: p = argparse.ArgumentParser() p.add_argument('--use-test-pypi', action='store_true') args = p.parse_args() publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else [] root = Path(__file__).absolute().parent.parent os.chdir(root) # just in case check_call(['uv', 'build', '--clear']) if not is_ci: # CI relies on trusted publishers so doesn't need env variable assert UV_PUBLISH_TOKEN in os.environ, f'no {UV_PUBLISH_TOKEN} passed' check_call(['uv', 'publish', *publish_url]) if __name__ == '__main__': main() ================================================ FILE: .ci/run ================================================ #!/bin/bash set -eu cd "$(dirname "$0")" cd .. # git root if ! command -v sudo; then # CI or Docker sometimes doesn't have it, so useful to have a dummy function sudo { "$@" } fi # --parallel-live to show outputs while it's running tox_cmd='run-parallel --parallel-live' if [ -n "${CI-}" ]; then # install OS specific stuff here case "$OSTYPE" in darwin*) # macos : ;; cygwin* | msys* | win*) # windows # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that tox_cmd='run' ;; *) # must be linux? 
: ;; esac fi # NOTE: expects uv installed uv tool run --with tox-uv tox $tox_cmd "$@" ================================================ FILE: .gitattributes ================================================ *.ipynb filter=nbstripout *.ipynb diff=ipynb ================================================ FILE: .github/workflows/main.yml ================================================ # see https://github.com/karlicoss/pymplate for up-to-date reference name: CI on: push: branches: '*' tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug: # Needed to trigger on others' PRs. # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". pull_request: # Needed to trigger workflows manually. workflow_dispatch: inputs: debug_enabled: type: boolean description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)' required: false default: false schedule: - cron: '31 18 * * 5' # run every Friday jobs: build: strategy: fail-fast: false matrix: platform: [ubuntu-latest, macos-latest] # windows-latest python-version: ['3.12', '3.13', '3.14'] # vvv just an example of excluding stuff from matrix # exclude: [{platform: macos-latest, python-version: '3.6'}] runs-on: ${{ matrix.platform }} # useful for 'optional' pipelines # continue-on-error: ${{ matrix.platform == 'windows-latest' }} steps: # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation - run: echo "$HOME/.local/bin" >> $GITHUB_PATH - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 # nicer to have all git history when debugging/for tests - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - uses: astral-sh/setup-uv@v7 with: enable-cache: false # we don't have lock files, so can't use them as cache key - uses: 
mxschmitt/action-tmate@v3 if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }} # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd... - run: bash .ci/run env: # only compute lxml coverage on ubuntu; it crashes on windows CI_MYPY_COVERAGE: ${{ matrix.platform == 'ubuntu-latest' && '--cobertura-xml-report .coverage.mypy' || '' }} - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms uses: codecov/codecov-action@v5 with: fail_ci_if_error: true # default false token: ${{ secrets.CODECOV_TOKEN }} flags: mypy-${{ matrix.python-version }} files: .coverage.mypy/cobertura.xml pypi: # Do not run it for PRs/cron schedule etc. # NOTE: release tags are guarded by on: push: tags on the top. if: github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') || (github.event.ref == format('refs/heads/{0}', github.event.repository.master_branch))) # Ugh, I tried using matrix or something to explicitly generate only test pypi or prod pypi pipelines. # But github actions is so shit, it's impossible to do any logic at all, e.g. doesn't support conditional matrix, if/else statements for variables etc. 
needs: [build] # add all other jobs here runs-on: ubuntu-latest permissions: # necessary for Trusted Publishing id-token: write steps: # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation - run: echo "$HOME/.local/bin" >> $GITHUB_PATH - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 # pull all commits to correctly infer vcs version - uses: actions/setup-python@v6 with: python-version: '3.12' - uses: astral-sh/setup-uv@v7 with: enable-cache: false # we don't have lock files, so can't use them as cache key - name: 'release to test pypi' # always deploy merged master to test pypi if: github.event.ref == format('refs/heads/{0}', github.event.repository.master_branch) run: .ci/release --use-test-pypi - name: 'release to prod pypi' # always deploy tags to release pypi if: startsWith(github.event.ref, 'refs/tags/') run: .ci/release ================================================ FILE: .gitignore ================================================ # Created by https://www.gitignore.io/api/python,emacs # Edit at https://www.gitignore.io/?templates=python,emacs ### Emacs ### # -*- mode: gitignore; -*- *~ \#*\# /.emacs.desktop /.emacs.desktop.lock *.elc auto-save-list tramp .\#* # Org-mode .org-id-locations *_archive # flymake-mode *_flymake.* # eshell files /eshell/history /eshell/lastdir # elpa packages /elpa/ # reftex files *.rel # AUCTeX auto folder /auto/ # cask packages .cask/ dist/ # Flycheck flycheck_*.el # server auth directory /server/ # projectiles files .projectile # directory configuration .dir-locals.el # network security /network-security.data ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python 
script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # End of https://www.gitignore.io/api/python,emacs untracked/ ================================================ FILE: .idea/dictionaries/karlicos.xml ================================================ cachew dataclassish pylint typecheck ================================================ FILE: LICENSE.txt ================================================ The MIT License (MIT) Copyright (c) 2019 Dima Gerasimov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import ast\n", "from pathlib import Path\n", "\n", "import jedi # ty: ignore[unresolved-import]\n", "\n", "\n", "def git_root() -> Path:\n", " import subprocess\n", "\n", " path_s = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], text=True).strip()\n", " path = Path(path_s)\n", " assert path.is_absolute(), path # just in case\n", " return path\n", "\n", "\n", "src_dir = git_root() / 'src'\n", "assert src_dir.exists(), src_dir # seems like jedi is pretty quiet about missing dirs..\n", "\n", "project = jedi.Project(src_dir)\n", "\n", "\n", "def _find(name: str):\n", " # ugh. 
sometimes it returns exact dupes for no apparent reason??\n", " completions = set(project.search(name, all_scopes=True))\n", " assert len(completions) == 1, f\"Expected one completion for {name}, got {completions}\"\n", " [c] = completions\n", " [c] = c.goto() # todo what is this for?\n", " return c\n", "\n", "\n", "def rlink(name: str) -> str:\n", " c = _find(name)\n", " if c.module_path is None:\n", " # TODO ugh raise an issue on tracker or something??\n", " # seems to only happen for namsepace packages..\n", " assert c.description.startswith('namespace '), c\n", " res = name.replace('.', '/')\n", " assert (src_dir / res).exists(), res\n", " return f'src/{res}'\n", " else:\n", " rpath = Path(c.module_path).relative_to(src_dir)\n", " return f'src/{rpath}#L{c.line}'\n", "\n", "\n", "# TODO ugh.. annoying, seems like Jedi can't get the functions source?\n", "# maybe because it's doing partial parsing or something?\n", "# there is c._get_module_context().code_lines, but it returns all lines in a source file??\n", "def getsource(symbol: str) -> str:\n", " c = _find(symbol)\n", " p = Path(c.module_path)\n", " # TODO check that it's a function?\n", " function_name = symbol.split('.')[-1]\n", " assert p.exists(), p\n", " src = p.read_text()\n", " src_lines = src.splitlines(keepends=True)\n", " for x in ast.walk(ast.parse(src)):\n", " if isinstance(x, ast.FunctionDef) and x.name == function_name:\n", " break\n", " else:\n", " raise RuntimeError(f'Function not found: {symbol}')\n", "\n", " # ugh lineno is 1-indexed, and seems like a closed interval?\n", " return ''.join(src_lines[x.lineno - 1 : x.end_lineno])\n", "\n", "\n", "def getdoc(symbol: str) -> str:\n", " c = _find(symbol)\n", " doc = c.docstring()\n", " assert doc is not None, symbol\n", " return doc" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# TODO just get rid of this in favor of native markdown + rlink?\n", "def flink(title: str, name: str | None = None) -> 
str:\n", " if name is None:\n", " name = title.replace('`', '') # meh\n", " if name.startswith('tests'):\n", " name = name.replace('tests', 'cachew.tests.test_cachew')\n", " # FIXME just replace in code..\n", "\n", " return f\"[{title}]({rlink(name)})\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Markdown as md # ty: ignore[unresolved-import]\n", "\n", "dmd = lambda x: display(md(x.strip())) # ty: ignore[unresolved-reference]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": false, "ein.hycell": false, "ein.tags": "worksheet-0", "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "dmd('''\n", "\n", "''')" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": "worksheet-0", "slideshow": { "slide_type": "-" } }, "source": [ "# What is Cachew?\n", "TLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). 
The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.\n", "Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.\n", "\n", "In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.\n", "\n", "A simple type is defined as:\n", "\n", "- primitive: `str`/`int`/`float`/`bool`\n", "- JSON-like types (`dict`/`list`/`tuple`)\n", "- `datetime`\n", "- `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )\n", "- [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)\n", "- [dataclasses](https://docs.python.org/3/library/dataclasses.html)\n", "\n", "\n", "That allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing.\n", "Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.\n", "\n", "## Motivation\n", "\n", "I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting few bits I'm interested at. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.\n", "\n", "Conventional way of dealing with it is serializing results along with some sort of hash (e.g. 
md5) of input files,\n", "comparing on the next run and returning cached data if nothing changed.\n", "\n", "Simple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task.\n", "\n", "\n", "# Examples\n", "## Processing Wikipedia\n", "Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive.\n", "Parsing it (`extract_links` function) takes hours, however, as long as the archive is the same you will always get the same results. So it would be nice to be able to cache the results somehow.\n", "\n", "\n", "With this library you can achieve it through a single `@cachew` decorator." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": false, "ein.hycell": false, "ein.tags": "worksheet-0", "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "# FIXME hmm seems like this doesn't work if there are type annotations on cachew_impl? odd\n", "# likely this? https://github.com/davidhalter/jedi/issues/2025\n", "doc = getdoc('cachew_impl').split('Usage example:')[-1].lstrip()\n", "dmd(f\"\"\"```python\n", "{doc}\n", "```\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.\n", "\n", "When you use a newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately."
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Incremental data exports\n", "This is my most common usecase of cachew, which I'll illustrate with example.\n", "\n", "I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.\n", "Data is synchronized via bluetooth in the sqlite database, which is easy to access. However sensor has limited memory (e.g. 1000 latest measurements).\n", "That means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.:\n", "\n", " ...\n", " 20190715100026.db\n", " 20190716100138.db\n", " 20190717101651.db\n", " 20190718100118.db\n", " 20190719100701.db\n", " ...\n", "\n", "To access **all** of historic temperature data, I have two options:\n", "\n", "- Go through all the data chunks every time I wan to access them and 'merge' into a unified stream of measurements, e.g. something like:\n", " \n", " def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n", " for chunk in chunks:\n", " # read measurements from 'chunk' and yield unseen ones\n", "\n", " This is very **easy, but slow** and you waste CPU for no reason every time you need data.\n", "\n", "- Keep a 'master' database and write code to merge chunks in it.\n", "\n", " This is very **efficient, but tedious**:\n", " \n", " - requires serializing/deserializing data -- boilerplate\n", " - requires manually managing sqlite database -- error prone, hard to get right every time\n", " - requires careful scheduling, ideally you want to access new data without having to refresh cache\n", "\n", " \n", "Cachew gives the best of two worlds and makes it both **easy and efficient**. 
The only thing you have to do is to decorate your function:\n", "\n", " @cachew \n", " def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n", " # ...\n", " \n", "- as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast\n", "- you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. you got new data)\n", "\n", " All the complexity of handling database is hidden in `cachew` implementation." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": false, "ein.hycell": false, "ein.tags": "worksheet-0", "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "link = rlink('composite_hash')\n", "\n", "dmd(f'''\n", "# How it works\n", "\n", "- first your objects get {flink('converted', 'cachew.marshall.cachew.CachewMarshall')} into a simpler JSON-like representation\n", "- after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).\n", "\n", "When the function is called, cachew [computes the hash of your function's arguments ]({link})\n", "and compares it against the previously stored hash value.\n", "\n", "- If they match, it would deserialize and yield whatever is stored in the cache database\n", "- If the hash mismatches, the original function is called and new data is stored along with the new hash\n", "''')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": false, "ein.hycell": false, "ein.tags": "worksheet-0", "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "dmd('# Features')\n", "types = [f'`{t}`' for t in ['str', 'int', 'float', 'bool', 'datetime', 'date', 'Exception']]\n", "dmd(f\"\"\"\n", "* automatic schema inference: {flink('1', 'tests.test_return_type_inference')}, {flink('2', 'tests.test_return_type_mismatch')}\n", "* supported types:\n", "\n", " * primitive: {', '.join(types)}\n", "\n", " See {flink('tests.test_types')}, 
{flink('tests.test_primitive')}, {flink('tests.test_dates')}, {flink('tests.test_exceptions')}\n", " * {flink('@dataclass and NamedTuple', 'tests.test_dataclass')}\n", " * {flink('Optional', 'tests.test_optional')} types\n", " * {flink('Union', 'tests.test_union')} types\n", " * {flink('nested datatypes', 'tests.test_nested')}\n", "\n", "* detects {flink('datatype schema changes', 'tests.test_schema_change')} and discards old data automatically\n", "\"\"\")\n", "# * custom hash function TODO example with mtime?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Performance\n", "Updating cache takes certain overhead, but that would depend on how complicated your datatype is in the first place, so I'd suggest measuring if you're not sure.\n", "\n", "During reading cache all that happens is reading blobs from sqlite/decoding as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.\n", "\n", "It would almost certainly make your program faster if your computations take more than several seconds.\n", "\n", "You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py)."
] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": false, "ein.hycell": false, "ein.tags": "worksheet-0", "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "dmd(f\"\"\"\n", "# Using\n", "See {flink('docstring', 'cachew_impl')} for up-to-date documentation on parameters and return types.\n", "You can also use {flink('extensive unit tests', 'tests')} as a reference.\n", "\n", "Some useful (but optional) arguments of `@cachew` decorator:\n", "\n", "* `cache_path` can be a directory, or a callable that {flink('returns a path', 'tests.test_callable_cache_path')} and depends on function's arguments.\n", "\n", " By default, `settings.DEFAULT_CACHEW_DIR` is used.\n", "\n", "* `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.\n", "\n", " By default it just uses string representation of the arguments, you can also specify a custom callable.\n", "\n", " For instance, it can be used to {flink('discard cache', 'tests.test_custom_hash')} if the input file was modified.\n", "\n", "* `cls` is the type that would be serialized.\n", "\n", " By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": "worksheet-0", "slideshow": { "slide_type": "-" } }, "source": [ "# Installing\n", "Package is available on [pypi](https://pypi.org/project/cachew/).\n", "\n", " pip3 install --user cachew\n", " \n", "## Developing\n", "I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI." 
] }, { "cell_type": "markdown", "metadata": { "ein.tags": "worksheet-0", "slideshow": { "slide_type": "-" } }, "source": [ "# Implementation\n", "\n", "* why NamedTuples and dataclasses?\n", " \n", " `NamedTuple` and `dataclass` provide a very straightforward and self documenting way to represent data in Python.\n", " Very compact syntax makes it extremely convenient even for one-off means of communicating between couple of functions.\n", " \n", " If you want to find out more why you should use more dataclasses in your code I suggest these links:\n", " \n", " - [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)\n", " - [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)\n", " \n", "* why not `pandas.DataFrame`?\n", "\n", " DataFrames are great and can be serialised to csv or pickled.\n", " They are good to have as one of the ways you can interface with your data, however hardly convenient to think about it abstractly due to their dynamic nature.\n", " They also can't be nested.\n", "\n", "* why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?\n", " \n", " ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat an overkill for such a specific purpose.\n", "\n", " * E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires you using custom sqlalchemy specific types and inheriting a base class.\n", " Also it doesn't support nested types.\n", " \n", "* why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?\n", "\n", " Pickling is kinda heavyweigh for plain data class, it's slower just using JSON. 
Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface.\n", "\n", " Marshmallow is a common way to map data into db-friendly format, but it requires explicit schema which is an overhead when you have it already in the form of type annotations. I've looked at existing projects to utilize type annotations, but didn't find them covering all I wanted:\n", " \n", " * https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api\n", " * https://pypi.org/project/marshmallow-dataclass\n", " \n", " I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).\n", " So far looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite.\n", "\n", "* why `sqlite` database for storage?\n", "\n", " It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.\n", "\n", " There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cache(backend='file')`, or via `settings.DEFAULT_BACKEND`.\n", " It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.\n", " \n", " It would also be interesting to experiment with in-RAM storages.\n", "\n", " I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tips and tricks\n", "## Optional dependency\n", "You can benefit from `cachew` even if you don't want to bloat your app's dependencies. 
Just use the following snippet:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dmd(f\"\"\"```python\n", "{getsource('cachew.extra.mcachew')}\n", "```\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.\n", "\n", "## Settings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dmd(f'''\n", "{flink('cachew.settings')} exposes some parameters that allow you to control `cachew` behaviour:\n", "- `ENABLE`: set to `False` if you want to disable caching without removing the decorators (useful for testing and debugging).\n", " You can also use {flink('cachew.extra.disabled_cachew')} context manager to do it temporarily.\n", "- `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the \"user cache directory\" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).\n", "- `THROW_ON_ERROR`: by default, cachew is defensive and simply attempts to call the original function on caching issues.\n", " Set to `True` to catch errors earlier.\n", "- `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).\n", "\n", "''')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Updating this readme\n", "This is a literate readme, implemented as a Jupyter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use [generate-readme](generate-readme) script."
] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { "display_name": "cachew", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" }, "name": "README.ipynb" }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: README.md ================================================ # What is Cachew? TLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache. Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it. In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types. A simple type is defined as: - primitive: `str`/`int`/`float`/`bool` - JSON-like types (`dict`/`list`/`tuple`) - `datetime` - `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) ) - [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple) - [dataclasses](https://docs.python.org/3/library/dataclasses.html) That allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing. Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries. 
## Motivation I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting a few bits I'm interested in. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases. Conventional way of dealing with it is serializing results along with some sort of hash (e.g. md5) of input files, comparing on the next run and returning cached data if nothing changed. Simple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task. # Examples ## Processing Wikipedia Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive. Parsing it (`extract_links` function) takes hours, however, as long as the archive is the same you will always get the same results. So it would be nice to be able to cache the results somehow. With this library you can achieve it through a single `@cachew` decorator. ```python >>> from typing import NamedTuple, Iterator >>> class Link(NamedTuple): ... url : str ... text: str ... >>> @cachew ... def extract_links(archive_path: str) -> Iterator[Link]: ... for i in range(5): ... # simulate slow IO ... # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours ... import time; time.sleep(1) ... yield Link(url=f'http://link{i}.org', text=f'text {i}') ...
>>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run [Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')] >>> from timeit import Timer >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1) ... # second run is cached, so should take less time >>> print(f"call took {int(res)} seconds") call took 0 seconds >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1) ... # now file has changed, so the cache will be discarded >>> print(f"call took {int(res)} seconds") call took 5 seconds ``` When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it. When you use a newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately. ## Incremental data exports This is my most common usecase of cachew, which I'll illustrate with an example. I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity. Data is synchronized via bluetooth in the sqlite database, which is easy to access. However the sensor has limited memory (e.g. 1000 latest measurements). That means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.: ... 20190715100026.db 20190716100138.db 20190717101651.db 20190718100118.db 20190719100701.db ... To access **all** of historic temperature data, I have two options: - Go through all the data chunks every time I want to access them and 'merge' into a unified stream of measurements, e.g.
something like: def measurements(chunks: List[Path]) -> Iterator[Measurement]: for chunk in chunks: # read measurements from 'chunk' and yield unseen ones This is very **easy, but slow** and you waste CPU for no reason every time you need data. - Keep a 'master' database and write code to merge chunks in it. This is very **efficient, but tedious**: - requires serializing/deserializing data -- boilerplate - requires manually managing sqlite database -- error prone, hard to get right every time - requires careful scheduling, ideally you want to access new data without having to refresh cache Cachew gives the best of two worlds and makes it both **easy and efficient**. The only thing you have to do is to decorate your function: @cachew def measurements(chunks: List[Path]) -> Iterator[Measurement]: # ... - as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast - you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. you got new data) All the complexity of handling database is hidden in `cachew` implementation. # How it works - first your objects get [converted](src/cachew/marshall/cachew.py#L29) into a simpler JSON-like representation - after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson). When the function is called, cachew [computes the hash of your function's arguments ](src/cachew/__init__.py#L580) and compares it against the previously stored hash value. 
- If they match, it would deserialize and yield whatever is stored in the cache database - If the hash mismatches, the original function is called and new data is stored along with the new hash # Features * automatic schema inference: [1](src/cachew/tests/test_cachew.py#L381), [2](src/cachew/tests/test_cachew.py#L395) * supported types: * primitive: `str`, `int`, `float`, `bool`, `datetime`, `date`, `Exception` See [tests.test_types](src/cachew/tests/test_cachew.py#L682), [tests.test_primitive](src/cachew/tests/test_cachew.py#L720), [tests.test_dates](src/cachew/tests/test_cachew.py#L632), [tests.test_exceptions](src/cachew/tests/test_cachew.py#L1124) * [@dataclass and NamedTuple](src/cachew/tests/test_cachew.py#L597) * [Optional](src/cachew/tests/test_cachew.py#L524) types * [Union](src/cachew/tests/test_cachew.py#L827) types * [nested datatypes](src/cachew/tests/test_cachew.py#L440) * detects [datatype schema changes](src/cachew/tests/test_cachew.py#L470) and discards old data automatically # Performance Updating cache takes certain overhead, but that would depend on how complicated your datatype is in the first place, so I'd suggest measuring if you're not sure. During reading the cache, all that happens is reading blobs from sqlite/decoding as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps. It would almost certainly make your program faster if your computations take more than several seconds. You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py). # Using See [docstring](src/cachew/__init__.py#L279) for up-to-date documentation on parameters and return types. You can also use [extensive unit tests](src/cachew/tests/test_cachew.py#L1) as a reference.
Some useful (but optional) arguments of `@cachew` decorator: * `cache_path` can be a directory, or a callable that [returns a path](src/cachew/tests/test_cachew.py#L417) and depends on function's arguments. By default, `settings.DEFAULT_CACHEW_DIR` is used. * `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated. By default it just uses string representation of the arguments, you can also specify a custom callable. For instance, it can be used to [discard cache](src/cachew/tests/test_cachew.py#L115) if the input file was modified. * `cls` is the type that would be serialized. By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache. # Installing Package is available on [pypi](https://pypi.org/project/cachew/). pip3 install --user cachew ## Developing I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI. # Implementation * why NamedTuples and dataclasses? `NamedTuple` and `dataclass` provide a very straightforward and self documenting way to represent data in Python. Very compact syntax makes it extremely convenient even for one-off means of communicating between couple of functions. If you want to find out more why you should use more dataclasses in your code I suggest these links: - [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes) - [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes) * why not `pandas.DataFrame`? DataFrames are great and can be serialised to csv or pickled. They are good to have as one of the ways you can interface with your data, however hardly convenient to think about it abstractly due to their dynamic nature. They also can't be nested. * why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)? 
ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat an overkill for such a specific purpose. * E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires you to use custom sqlalchemy specific types and inheriting a base class. Also it doesn't support nested types. * why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`? Pickling is kinda heavyweight for a plain data class, and it's slower than just using JSON. Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface. Marshmallow is a common way to map data into a db-friendly format, but it requires explicit schema which is an overhead when you have it already in the form of type annotations. I've looked at existing projects to utilize type annotations, but didn't find them covering all I wanted: * https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api * https://pypi.org/project/marshmallow-dataclass I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org). So far looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite. * why `sqlite` database for storage? It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees. There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cachew(backend='file')`, or via `settings.DEFAULT_BACKEND`. It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable. It would also be interesting to experiment with in-RAM storages.
I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance. # Tips and tricks ## Optional dependency You can benefit from `cachew` even if you don't want to bloat your app's dependencies. Just use the following snippet: ```python def mcachew(*args, **kwargs): """ Stands for 'Maybe cachew'. Defensive wrapper around @cachew to make it an optional dependency. """ try: import cachew except ModuleNotFoundError: import warnings warnings.warn( 'cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew', stacklevel=2, ) return lambda orig_func: orig_func else: return cachew.cachew(*args, **kwargs) ``` Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing. ## Settings [cachew.settings](src/cachew/__init__.py#L55) exposes some parameters that allow you to control `cachew` behaviour: - `ENABLE`: set to `False` if you want to disable caching without removing the decorators (useful for testing and debugging). You can also use [cachew.extra.disabled_cachew](src/cachew/extra.py#L25) context manager to do it temporarily. - `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the "user cache directory" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)). - `THROW_ON_ERROR`: by default, cachew is defensive and simply attempts to call the original function on caching issues. Set to `True` to catch errors earlier. - `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too). ## Updating this readme This is a literate readme, implemented as a Jupyter notebook: [README.ipynb](README.ipynb).
To update the (autogenerated) [README.md](README.md), use [generate-readme](generate-readme) script. ================================================ FILE: benchmarks/20230912-comparison-with-legacy.org ================================================ Running on @karlicoss desktop PC, =python3.10=. This is basically to justify switching to the new serialization method - old way, =legacy= used to 'flatten' the type into an sqlite row - new way, =cachew=, just dumps it as a dict, then to bytes via =orjson= and stores in a single sqlite column The numbers between legacy and cachew can't be directly compared though. Legacy =serializing= step emits a tuple, which can be inserted directly into the database. So to compare it with the new way, we need to compare with the sum of =serializing= + =json dump=. That said this won't be an exact comparison either, since legacy binder relied on sqlalchemy to dump custom types to sqlite types (e.g. =datetime= or =Exception=). So legacy will have a slight advantage this way, but it's fine. So we can see that for: - =test_union_str_dataclass= - new implementation: =0.53 + 0.45s= to serialize; =0.29 + 0.48= to deserialize - old implementation: =2.38s= to serialize; =1.92= to deserialize - =test_nested_dataclass= - new implementation: =1.05 + 0.26s= to serialize; =0.50 + 1.42= to deserialize - old implementation: =1.92s= to serialize; =1.88= to deserialize For both tests, serialization is quite a bit faster with the new implementation. On the second test, they are on par for deserialization, but as I mention these numbers are in favor of the legacy implementation. In addition, keeping everything in one column unlocks some other optimizations which wouldn't be possible with multiple columns.
#+begin_example $ pytest --pyargs cachew.tests.marshall -k 'gc_off and 1000000 and not cattrs' -s =========================================================== test session starts ============================================================ platform linux -- Python 3.10.12, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3 cachedir: .pytest_cache rootdir: /code/cachew_jsonpickle configfile: pytest.ini plugins: anyio-3.6.2 collected 100 items / 95 deselected / 5 selected src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew] building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.34s serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.53s json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.08s sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.18s jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.13s json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.48s PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-legacy] building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.35s serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 2.38s json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.22s sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.06s sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s jsonl load 1000000 objects of type 
typing.Union[str, cachew.tests.marshall.Name]: 0.12s json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.23s deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.92s PASSED src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-cachew] building 1000000 objects of type .TE2'>: 0.58s serializing 1000000 objects of type .TE2'>: 1.05s json dump 1000000 objects of type .TE2'>: 0.26s sqlite dump 1000000 objects of type .TE2'>: 1.03s sqlite load 1000000 objects of type .TE2'>: 0.30s jsonl dump 1000000 objects of type .TE2'>: 0.14s jsonl load 1000000 objects of type .TE2'>: 0.14s json load 1000000 objects of type .TE2'>: 0.50s deserializing 1000000 objects of type .TE2'>: 1.42s PASSED src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-legacy] building 1000000 objects of type .TE2'>: 0.56s serializing 1000000 objects of type .TE2'>: 1.92s json dump 1000000 objects of type .TE2'>: 0.21s sqlite dump 1000000 objects of type .TE2'>: 0.99s sqlite load 1000000 objects of type .TE2'>: 0.29s jsonl dump 1000000 objects of type .TE2'>: 0.12s jsonl load 1000000 objects of type .TE2'>: 0.12s json load 1000000 objects of type .TE2'>: 0.24s deserializing 1000000 objects of type .TE2'>: 1.88s PASSED #+end_example ================================================ FILE: benchmarks/20230912.org ================================================ Running on @karlicoss desktop PC, =python3.10= - serializing/deserializing here refers to converting object to json-ish python dictionary (not actual json string!) 
- json dump/json load refers to converting the dict above to a json string and back - sqlite dump/jsonl dump refers to saving/loading these strings to a persistent storage #+begin_example $ pytest --pyargs --ignore-glob '*test_cachew*' -k marshall -s =========================================================== test session starts ============================================================ platform linux -- Python 3.10.6, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3 cachedir: .pytest_cache configfile: pytest.ini plugins: anyio-3.6.2 collected 37 items / 8 deselected / 29 selected src/cachew/marshall/cachew.py::test_serialize_and_deserialize PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew] building 1000000 objects of type str | cachew.tests.marshall.Name: 0.60s serializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.85s json dump 1000000 objects of type str | cachew.tests.marshall.Name: 0.46s sqlite dump 1000000 objects of type str | cachew.tests.marshall.Name: 1.11s sqlite load 1000000 objects of type str | cachew.tests.marshall.Name: 0.31s jsonl dump 1000000 objects of type str | cachew.tests.marshall.Name: 0.13s jsonl load 1000000 objects of type str | cachew.tests.marshall.Name: 0.13s json load 1000000 objects of type str | cachew.tests.marshall.Name: 1.04s deserializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.86s PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...) 
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew] building 5000000 objects of type str | cachew.tests.marshall.Name: 3.00s serializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.38s json dump 5000000 objects of type str | cachew.tests.marshall.Name: 2.14s sqlite dump 5000000 objects of type str | cachew.tests.marshall.Name: 5.43s sqlite load 5000000 objects of type str | cachew.tests.marshall.Name: 1.47s jsonl dump 5000000 objects of type str | cachew.tests.marshall.Name: 0.62s jsonl load 5000000 objects of type str | cachew.tests.marshall.Name: 0.64s json load 5000000 objects of type str | cachew.tests.marshall.Name: 4.74s deserializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.06s PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...) src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...) src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew] building 5000000 objects of type str | cachew.tests.marshall.Name: 1.77s serializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.59s json dump 5000000 objects of type str | cachew.tests.marshall.Name: 1.22s sqlite dump 5000000 objects of type str | cachew.tests.marshall.Name: 5.28s sqlite load 5000000 objects of type str | cachew.tests.marshall.Name: 1.58s jsonl dump 5000000 objects of type str | cachew.tests.marshall.Name: 0.64s jsonl load 5000000 objects of type str | cachew.tests.marshall.Name: 0.66s json load 5000000 objects of type str | cachew.tests.marshall.Name: 1.53s deserializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.60s PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...) 
src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew] building 1000000 objects of type : 1.05s serializing 1000000 objects of type : 1.28s json dump 1000000 objects of type : 0.22s sqlite dump 1000000 objects of type : 1.14s sqlite load 1000000 objects of type : 0.30s jsonl dump 1000000 objects of type : 0.14s jsonl load 1000000 objects of type : 0.14s json load 1000000 objects of type : 0.70s deserializing 1000000 objects of type : 2.20s PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs) src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew] building 5000000 objects of type : 5.08s serializing 5000000 objects of type : 6.35s json dump 5000000 objects of type : 1.13s sqlite dump 5000000 objects of type : 5.58s sqlite load 5000000 objects of type : 1.47s jsonl dump 5000000 objects of type : 0.69s jsonl load 5000000 objects of type : 0.70s json load 5000000 objects of type : 6.85s deserializing 5000000 objects of type : 11.10s PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs) src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew] building 1000000 objects of type : 1.37s serializing 1000000 objects of type : 1.25s json dump 1000000 objects of type : 0.24s sqlite dump 1000000 objects of type : 1.12s sqlite load 1000000 objects of type : 0.29s jsonl dump 1000000 objects of type : 0.14s jsonl load 1000000 objects of type : 0.14s json load 1000000 objects of type : 0.24s deserializing 1000000 objects of type : 2.17s PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs) src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew] building 5000000 objects of type : 5.10s serializing 5000000 objects of type : 6.22s json dump 5000000 objects of type : 1.17s sqlite dump 5000000 objects of type : 5.43s sqlite 
load 5000000 objects of type : 1.54s jsonl dump 5000000 objects of type : 0.70s jsonl load 5000000 objects of type : 0.71s json load 5000000 objects of type : 1.22s deserializing 5000000 objects of type : 10.97s PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs) src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew] building 1000000 objects of type .TE2'>: 1.64s serializing 1000000 objects of type .TE2'>: 1.43s json dump 1000000 objects of type .TE2'>: 0.30s sqlite dump 1000000 objects of type .TE2'>: 1.16s sqlite load 1000000 objects of type .TE2'>: 0.30s jsonl dump 1000000 objects of type .TE2'>: 0.15s jsonl load 1000000 objects of type .TE2'>: 0.15s json load 1000000 objects of type .TE2'>: 1.02s deserializing 1000000 objects of type .TE2'>: 2.78s PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs] building 1000000 objects of type .TE2'>: 1.88s serializing 1000000 objects of type .TE2'>: 0.80s json dump 1000000 objects of type .TE2'>: 0.31s sqlite dump 1000000 objects of type .TE2'>: 1.39s sqlite load 1000000 objects of type .TE2'>: 0.31s jsonl dump 1000000 objects of type .TE2'>: 0.15s jsonl load 1000000 objects of type .TE2'>: 0.15s json load 1000000 objects of type .TE2'>: 1.03s deserializing 1000000 objects of type .TE2'>: 2.61s PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew] building 1000000 objects of type .TE2'>: 0.57s serializing 1000000 objects of type .TE2'>: 1.08s json dump 1000000 objects of type .TE2'>: 0.29s sqlite dump 1000000 objects of type .TE2'>: 1.09s sqlite load 1000000 objects of type .TE2'>: 0.30s jsonl dump 1000000 objects of type .TE2'>: 0.15s jsonl load 1000000 objects of type .TE2'>: 0.15s json load 1000000 objects of type .TE2'>: 0.50s deserializing 1000000 objects of type .TE2'>: 1.43s PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs] building 
1000000 objects of type .TE2'>: 0.57s serializing 1000000 objects of type .TE2'>: 0.39s json dump 1000000 objects of type .TE2'>: 0.29s sqlite dump 1000000 objects of type .TE2'>: 1.16s sqlite load 1000000 objects of type .TE2'>: 0.32s jsonl dump 1000000 objects of type .TE2'>: 0.16s jsonl load 1000000 objects of type .TE2'>: 0.15s json load 1000000 objects of type .TE2'>: 0.50s deserializing 1000000 objects of type .TE2'>: 1.29s PASSED ============================================================ slowest durations ============================================================= 44.87s call src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew] 38.76s call src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew] 28.65s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew] 20.05s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew] 9.82s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew] 9.51s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs] 8.37s call src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew] 8.20s call src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew] 6.45s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew] 5.93s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew] 5.78s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs] 3.98s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew] 0.01s call src/cachew/marshall/cachew.py::test_serialize_and_deserialize (68 durations < 0.005s hidden. Use -vv to show these durations.) ========================================================= short test summary info ========================================================== SKIPPED [6] src/cachew/tests/marshall.py:171: TODO need to adjust the handling of Union types.. 
SKIPPED [4] src/cachew/tests/marshall.py:194: TODO support datetime with pytz for cattrs PASSED src/cachew/marshall/cachew.py::test_serialize_and_deserialize PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew] PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew] PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew] PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew] PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew] PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew] PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew] PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew] PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew] PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs] PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew] PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs] #+end_example ================================================ FILE: benchmarks/20230917.org ================================================ Running on @karlicoss desktop PC, =python3.10= Just a comparison of =sqlite= and =file= backends. 
#+begin_example $ pytest --pyargs -k 'test_many and gc_off and 3000000' -s src/cachew/tests/test_cachew.py::test_many[sqlite-gc_off-3000000] [INFO 2023-09-17 02:02:09,946 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many..iter_data: wrote 3000000 objects to cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many) test_many: initial write to cache took 13.6s test_many: cache size is 229.220352Mb [INFO 2023-09-17 02:02:10,780 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many..iter_data: loading 3000000 objects from cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many) test_many: reading from cache took 7.0s PASSED src/cachew/tests/test_cachew.py::test_many[file-gc_off-3000000] [INFO 2023-09-17 02:02:23,944 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many..iter_data: wrote 3000000 objects to cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many) test_many: initial write to cache took 6.1s test_many: cache size is 202.555667Mb [INFO 2023-09-17 02:02:23,945 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many..iter_data: loading objects from cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many) test_many: reading from cache took 5.4s #+end_example ================================================ FILE: doc/cachew_disable.md ================================================ Can put this in the README.md once its been tested a bit ### Disable through Environment Variables To disable a `cachew` function in some module, you can use the `CACHEW_DISABLE` environment variable. This is a colon-delimited (like a `$PATH`) list of modules to disable. It disables modules given some name recursively, and supports [unix-style globs](https://docs.python.org/3/library/fnmatch.html) For example, say you were using [HPI](https://github.com/karlicoss/HPI) which internally uses a snippet like `mcachew` above. 
You may want to enable `cachew` for _most_ modules, but disable it for specific ones. For example take: ``` my/browser ├── active_browser.py ├── all.py ├── common.py └── export.py my/reddit ├── __init__.py ├── all.py ├── common.py ├── pushshift.py └── rexport.py ``` To disable `cachew` in all of these files: `export CACHEW_DISABLE=my.browser:my.reddit` (disables for all submodules) To disable just for a particular module: `export CACHEW_DISABLE='my.browser.export'` Similarly to `$PATH` manipulations, you can do this in your shell configuration incrementally: ``` CACHEW_DISABLE='my.reddit.rexport' if some condition...; then CACHEW_DISABLE="my.browser.export:$CACHEW_DISABLE" fi export CACHEW_DISABLE ``` You can also use globs, e.g. `CACHEW_DISABLE='my.*.gdpr'` To disable `cachew` everywhere, you could set `export CACHEW_DISABLE='*'` ================================================ FILE: doc/serialization.org ================================================ Cachew works kinda like =functools.lru_cache=, but it also works in-between program runs. For that, it needs to somehow persist the objects on the disk (unlike =lru_cache= which just keeps references to the objects already in process memory). While persisting objects to the cache, essentially cachew needs to map them into simpler types, i.e. ones you can keep in a database like strings/ints/binary blobs. At the moment (as of =v0.13.0=), we use sqlite as the cache store, with =sqlalchemy= as the interface to interact with it.
The way cachew works now is, to save the object in cache: - first it's "flattened out" to conform to the database row model, so individual fields (including recursive fields) become database columns - python types are mapped into sqlalchemy types, with extra =sqlalchemy.TypeDecorator= instances to support custom types like =datetime= or =Exception= You can find a more detailed example [[https://github.com/karlicoss/cachew/blob/175afade0a417bfd533ced174365d246b8a7dabc/src/cachew/__init__.py#L319-L353][here]]. A big problem is that in general it's not really possible to serialize, and especially to deserialize back an arbitrary object in Python, unless you resort to binary serialization like =pickle= (which is very slow and comes with its own hosts of issues). However in cachew we require the user to supply the *type signature* for the functions that are cached, so we can benefit from it for serializing and deserializing. Few years ago, when I implemented =cachew= at first, there weren't really many options for serialization driven by type signatures, so I implemented the custom code I mentioned above to support that. In 2023, however, more and more libraries are benefiting from type signatures, in particular for serializing stuff. So I decided to give it another go, in hope of using some mature library, simplifying cachew's code, and possibly getting a perfromance boost. It's possible that I missed some documentation so if you think the problems I am describing can actually be worked around, please don't hesitate to let me know. * Comparison In cachew the very minimum we're aiming to support are: - all json-ish types, e.g. =int=/=str=/=dict=/=list= etc - =dataclass= and =NamedTuple= - =Optional= and =Union= - custom types, e.g. =datetime=, =Exception= (e.g. at least preserve exception message) See [[file:test_serialization.py]] for more specific examples and supporting evidence for my summary here. 
** [[https://docs.python.org/3.10/library/pickle.html][pickle]] Builtin pickle module can handle any objects, without even needing type annotations. However, it's [[https://www.benfrederickson.com/dont-pickle-your-data/][famously very slow]], so I even didn't consider using it. It's also not secure in general, although in our case we control the objects we save/load from cache, so it's not a big issue. ** [[https://github.com/jsonpickle/jsonpickle#readme][jsonpickle]] Jsonpickle -- similar to pickle, can handle any types. I [[https://github.com/karlicoss/cachew/commit/048df33e65560205d63845f022b027a27719ff48][gave it a go]] just in case, and it's an order of magnitude slower than custom serialization code I already had, which is a no-go. ** [[https://github.com/lidatong/dataclasses-json/#readme][dataclasses-json]] # TODO link to code - CON: requires annotating all dataclasses involved with =@dataclass_json=, recursively. This is a blocker from using it in =cachew=. - CON: requires the type to be a =@dataclass= to annotate So if you have something simpler you'll have to wrap it into a dummy dataclass or something. - PRO: supports =Union= correctly ** [[https://github.com/marshmallow-code/marshmallow][marshmallow]] By default marshmallow doesn't support dataclasses or unions, but there are some extra packages - for dataclasses https://github.com/lovasoa/marshmallow_dataclass - PRO: doesn't require modifying the original class, handles recursion out of the box - CON: doesn't handle =Union= correctly This is a blocker for cachew. In addition it has a custom implementation of Union handling (rather than e.g. relying on =python-marshmallow-union=). - https://github.com/adamboche/python-marshmallow-union I didn't even get to try it since if dataclasses don't work marshmallow is a no-go for me. Plus for some reason =marshmallow_dataclass= has a custom Union handling implementation which is different from this one, so it's going to be a huge mess. 
** [[https://github.com/pydantic/pydantic#readme][pydantic]] - PRO: if you use =TypeAdapter=, you can serialize/deserialize arbitrary types without decorating/inheriting from =BaseModel= - CON: doesn't handle =Union= correctly Again, this is a bit blocker. I've created an issue on pydantic bug tracker here: https://github.com/pydantic/pydantic/issues/7391 Kind of sad, because otherwise pydantic seemed promising! ** [[https://github.com/python-attrs/cattrs#features][cattrs]] - PRO: doesn't require modifying the classes you serialise - PRO: rich feature set, clearly aiming to comply with standard python's typing annotations - CON: there is an issue with handling =NamedTuple= It isn't converted to a dictionary like =dataclass= does, [[https://github.com/python-attrs/cattrs/issues/425][likely a bug]]? - =Union= types are supported, but require some extra configuration Unions work, but you have to 'register' them first. A bit annoying that this is necessary even for simple unions like =int | str=, although [[https://github.com/python-attrs/cattrs/issues/423][possible]] to workaround. The plus side is that cattr has a builtin utility for Union type discrimination. I guess for my application I could traverse the type and register all necessary Unions with =catrrs=? # TODO create an issue to support opting in everywhere by default? Since the above seems quite good, I did a quick cachew hack on [[https://github.com/karlicoss/cachew/tree/cattrs][cattrs branch]] to try and use it. The pipeline is the following: - serialize type to a dictionary with primitive types via =cattrs= - serialize dictionary to a byte string via =orjson= - persist the byte string as an sqlite database row (for deserializing we just do the same in reverse) You can find the results [[https://github.com/karlicoss/cachew/commit/82691b10cd1d4ced4862dff21cf038fb83f9525c][here]] -- cattrs proved to be quite a huge speedup over my custom serialization code! 
It needs a bit more work and evaluation for use in =cachew=, however it's super promising! # TODO https://catt.rs/en/stable/preconf.html#orjson Some interesting reading about cattrs: - https://threeofwands.com/why-cattrs-is-so-fast/#v2-the-genconverter - https://threeofwands.com/why-i-use-attrs-instead-of-pydantic * Verdict The biggest shared issues are that most of this libraries: - require modifying the original class definitions, either by inheriting or decorating - don't handle =Union= at all or don't handle it corectly (usually relying on the structural equivalence rather than actual types) So for most of them, I even didn't get to trying to support custom types and measuing performance with =cachew=. Of all of them only =cattrs= stood out, it takes builtin python typing and performance very seriously, and very configurable. So if you need no bullshit serialization in python, I can definitely recommend it. I might switch to it in [[https://github.com/karlicoss/promnesia][promnesia]] (where we have full control over the type we serialize in the database), and could potentially be used in HPI for [[https://github.com/karlicoss/HPI/blob/master/my/core/serialize.py][my.core.serialize]]. ================================================ FILE: doc/test_serialization.py ================================================ #!/usr/bin/env python3 from dataclasses import dataclass from typing import NamedTuple, Union def test_dataclasses_json(): # pip install dataclasses-json from dataclasses_json import dataclass_json @dataclass class Inner: value: int @dataclass class Outer: inner: Inner ### issue 1: requires @dataclass_json annotation on all involved dataclasses obj = Outer(inner=Inner(value=123)) # noqa: F841 # we don't control the types that are passed to us, so we can't use the @dataclass_json # but we can just call the decorator directly # HOWEVER: this modifies the original class, Outer!! 
OuterJson = dataclass_json(Outer) # noqa: F841 # it adds 'from_dict', 'from_json', 'schema', 'to_dict', 'to_json' attributes to it # now if you try # print(OuterJson.schema().dump(obj)) # you get a warning that it wants you to add annotations to Inner classes too. # this isn't really an option for us. ### ### issue 2: can't dump anything unless the top level type is a dataclass? ### could wrap into a dummy dataclass or something, but is wasteful in terms of performance ### ### nice thing: correctly serializes Union types, even if they share the same attributes @dataclass_json @dataclass class City: name: str @dataclass_json @dataclass class Country: name: str @dataclass_json @dataclass class WithUnion: union: Union[City, Country] # noqa: UP007 objs = [ WithUnion(union=City(name='London')), WithUnion(union=Country(name='UK')), ] schema = WithUnion.schema() json = schema.dumps(objs, many=True) objs2 = schema.loads(json, many=True) print("objects ", objs) print("json ", json) # NOTE: it dumps [{"union": {"name": "London", "__type": "City"}}, {"union": {"name": "UK", "__type": "Country"}}] # so types are correctly distinguished print("restored ", objs2) assert objs == objs2, (objs, objs2) ### def test_marshmallow_dataclass(): # pip3 install --user marshmallow-dataclass[union] import marshmallow_dataclass ### issue 1: the top level type has to be a dataclass? 
### although possible that we could use regular marshmallow for that instead ### ### issue 2: doesn't handle unions correctly @dataclass class City: name: str @dataclass class Country: name: str @dataclass class WithUnion: union: Union[City, Country] # noqa: UP007 objs = [ WithUnion(union=City(name="London")), WithUnion(union=Country(name="UK")), ] # NOTE: good, doesn't require adding annotations on the original classes schema = marshmallow_dataclass.class_schema(WithUnion)() json = schema.dumps(objs, many=True) objs2 = schema.loads(json, many=True) print("objects ", objs) print("json ", json) # NOTE: it dumps [{"union": {"value": 123}}, {"union": {"value": 123}}] # so it doesn't distingush based on types => won't deserialize correctly print("restored ", objs2) # assert objs == objs2, (objs, objs2) # ^ this assert fails! ### def test_pydantic(): from pydantic import TypeAdapter ### issue: doesn't handle Unions correctly @dataclass class City: name: str @dataclass class Country: name: str @dataclass class WithUnion: union: Union[City, Country] # noqa: UP007 objs = [ WithUnion(union=City(name="London")), WithUnion(union=Country(name="UK")), ] # NOTE: nice, doesn't require annotating the original classes with anything Schema = TypeAdapter(list[WithUnion]) json = Schema.dump_python( objs, # round_rtip: Whether to output the serialized data in a way that is compatible with deserialization # not sure, doesn't seem to impact anything.. round_trip=True, ) objs2 = Schema.validate_python(json) print("objects ", objs) print("json ", json) print("restored ", objs2) # assert objs == objs2, (objs, objs2) # ^ this assert fails! # created an issue https://github.com/pydantic/pydantic/issues/7391 ### def test_cattrs(): from cattrs import Converter from cattrs.strategies import configure_tagged_union converter = Converter() ### issue: NamedTuples aren't unstructured? 
asked here https://github.com/python-attrs/cattrs/issues/425 class X(NamedTuple): value: int d = converter.unstructure(X(value=123), X) # noqa: F841 # NOTE: this assert doesn't pass! # assert isinstance(d, dict) ### ### good: handles Union correctly (although some extra configuring required) @dataclass class City: name: str @dataclass class Country: name: str @dataclass class WithUnion: union: Union[City, Country] # noqa: UP007 objs = [ WithUnion(union=City(name="London")), WithUnion(union=Country(name="UK")), ] configure_tagged_union( union=City | Country, converter=converter, ) # NOTE: nice -- doesn't require decorating original classes json = converter.unstructure(objs, list[WithUnion]) assert isinstance(json, list) objs2 = converter.structure(json, list[WithUnion]) print("objects ", objs) # NOTE: dumps it as [{'union': {'name': 'London', '_type': 'City'}}, {'union': {'name': 'UK', '_type': 'Country'}}] print("json ", json) print("restored ", objs2) assert objs == objs2, (objs, objs2) ### ### issue: unions of simple types aren't supported? # see https://github.com/python-attrs/cattrs/issues/423 mixed: list[int | str] = [ 123, 'Jakarta', ] json = converter.unstructure(mixed, list[int | str]) # NOTE: this fails # mixed2 = converter.structure(json , list[int | str]) ### test_dataclasses_json() test_marshmallow_dataclass() test_pydantic() test_cattrs() ================================================ FILE: generate-readme ================================================ #!/bin/bash set -eu cd "$(dirname "$0")" # --no-input seems to work well # but if need more targeted approach, pparently can mark certain cells with tag and use '--TagRemovePreprocessor.remove_cell_tags={"noexport"}' ? exec uvx --with jupyter --from jupyter-core jupyter nbconvert --execute --to markdown --no-input README.ipynb # TODO run it on CI to make sure it renders and up to date? 
================================================ FILE: github-issues.org ================================================ #+todo: OPEN | CLOSED * Issues of cachew :PROPERTIES: :since: :url: https://api.github.com/repos/karlicoss/cachew :END: ** OPEN keep hash along each cached entity instead of separate table? :PROPERTIES: :tags: ("prio-B") :id: 15 :date-modification: 2020-01-08T22:26:04+0000 :date-creation: 2020-01-08T22:26:04+0000 :author: "karlicoss" :END: : At the moment there are two separate tables: one for latest hash value, another for cached entities. : It might be simpler and safer to keep a single table, with hash along with each cached entity. : ** OPEN support multiple cached values? :PROPERTIES: :tags: ("prio-B") :id: 14 :date-modification: 2020-01-08T22:26:03+0000 :date-creation: 2020-01-08T22:26:02+0000 :author: "karlicoss" :END: : At the moment it's LRU(1) cache, it some usecases it makes sense to cache more values though : ** OPEN support pathlib.Path :PROPERTIES: :tags: ("prio-C") :id: 13 :date-modification: 2020-01-08T22:26:02+0000 :date-creation: 2020-01-08T22:26:01+0000 :author: "karlicoss" :END: : Path is a trivial wrapper around str. I guess generally think of a good way to allow adhoc mapping of simple types. : Perhaps current Exception makes sense. : ** OPEN support defensive behaviour :PROPERTIES: :tags: ("prio-C") :id: 12 :date-modification: 2020-01-08T22:26:01+0000 :date-creation: 2020-01-08T22:26:00+0000 :author: "karlicoss" :END: : E.g. 
if we can't serialize for some reason, bail the database but at least yield values anyway : ** OPEN Add Redis support :PROPERTIES: :id: 9 :date-modification: 2020-01-06T00:48:59+0000 :date-creation: 2020-01-06T00:48:59+0000 :author: "softinio" :END: : Add Redis support as an alternative to sqlite : : This would be a great feature as it will make this solution easier to use in an enterprise production environment as getting a redis instance shared amonst multiple instances of your app is very easy and cost effective to use. : ** OPEN better pytz support? :PROPERTIES: :tags: ("prio-C") :id: 6 :date-modification: 2020-01-05T13:34:51+0000 :date-creation: 2020-01-05T13:33:25+0000 :author: "karlicoss" :END: ** CLOSED Optional feature: Exception support :PROPERTIES: :id: 11 :date-modification: 2020-01-08T21:56:56+0000 :date-creation: 2020-01-08T21:34:03+0000 :author: "karlicoss" :END: ** CLOSED Add doc on defensive/optional usage :PROPERTIES: :id: 10 :date-modification: 2020-01-06T23:48:54+0000 :date-creation: 2020-01-06T23:47:39+0000 :author: "karlicoss" :END: ** CLOSED Safer concurrent writes handling :PROPERTIES: :id: 8 :date-modification: 2020-01-05T22:32:13+0000 :date-creation: 2020-01-05T22:08:24+0000 :author: "karlicoss" :END: ** CLOSED Update readme :PROPERTIES: :id: 7 :date-modification: 2020-01-05T15:29:37+0000 :date-creation: 2020-01-05T15:24:38+0000 :author: "karlicoss" :END: ** CLOSED support for dataclasses :PROPERTIES: :id: 1 :date-modification: 2020-01-05T13:34:50+0000 :date-creation: 2019-07-30T21:45:30+0100 :author: "karlicoss" :END: ** CLOSED Fix Json support for python3.6 :PROPERTIES: :id: 2 :date-modification: 2020-01-05T13:33:28+0000 :date-creation: 2019-12-08T12:21:58+0000 :author: "karlicoss" :END: ** CLOSED Fix bug when default argument is explicitly specified :PROPERTIES: :id: 3 :date-modification: 2020-01-05T13:33:27+0000 :date-creation: 2019-12-08T17:56:51+0000 :author: "karlicoss" :END: ** CLOSED Union types :PROPERTIES: :id: 4 
:date-modification: 2020-01-05T13:33:27+0000 :date-creation: 2019-12-19T23:32:55+0000 :author: "karlicoss" :END: ** CLOSED support top level primitive types :PROPERTIES: :id: 5 :date-modification: 2020-01-05T13:33:26+0000 :date-creation: 2019-12-20T00:09:00+0000 :author: "karlicoss" :END: ================================================ FILE: misc/profile.py ================================================ #!/usr/bin/env python3 import sqlite3 from collections.abc import Iterator from pathlib import Path import sqlalchemy from codetiming import Timer from more_itertools import ilen from cachew import cachew # todo not sure it really helps much? import gc # isort: skip gc.disable() def timer(name: str) -> Timer: return Timer(name=name, text=name + ': ' + '{:.2f}s') def test_ints() -> None: N = 5_000_000 base = Path('/tmp/cachew_profiling/') # shutil.rmtree(base) base.mkdir(exist_ok=True, parents=True) cache_path = base / 'ints' def fun_nocachew(n) -> Iterator[int]: yield from range(n) @cachew(cache_path=cache_path, force_file=True) def fun(n) -> Iterator[int]: yield from range(n) # with timer('no caching'): # ilen(fun_nocachew(N)) # with timer('initial call'): # ilen(fun(N)) assert cache_path.exists() # just in case with timer('reading directly via sqlite'): total = 0 with sqlite3.connect(cache_path) as conn: for (_x,) in conn.execute('SELECT * FROM cache'): total += 1 assert total == N # just in case with timer('reading directly via sqlalchemy'): total = 0 engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}') from sqlalchemy import Column, MetaData, Table meta = MetaData() table_cache = Table('cache', meta, Column('_cachew_primitive', sqlalchemy.Integer)) with engine.connect() as conn: with timer('sqlalchemy querying'): rows = conn.execute(table_cache.select()) for (_x,) in rows: total += 1 engine.dispose() assert total == N # just in case cache_size_mb = cache_path.stat().st_size / 10**6 print(f'cache size: {cache_size_mb:.1f} Mb') with timer('subsequent 
call'): ilen(fun(N)) test_ints() ================================================ FILE: misc/test_redis/docker-compose.yml ================================================ services: redis: image: "redis:alpine" # restart: always command: - "sh" - "-euc" - | exec redis-server # - | # echo "requirepass '$$REDIS_PASSWORD'" > /etc/redis.conf # exec redis-server /etc/redis.conf # environment: # REDIS_PASSWORD: "password" ports: - 6379:6379 volumes: - "redis-cachew:/data:rw" volumes: redis-cachew: ================================================ FILE: misc/test_redis/test.py ================================================ #!/usr/bin/env python3 from time import time import redis # ty: ignore[unresolved-import] from loguru import logger # ty: ignore[unresolved-import] from more_itertools import ilen r = redis.Redis(host='localhost', port=6379, db=0) N = 1_000_000 def items(): yield from map(str, range(N)) TAG = 'keys' def reset(): r.delete(TAG) def write(): for i, obj in enumerate(items()): key = f'obj:{i}' r.hset(key, 'data', obj) r.lpush(TAG, key) def read(): keys = r.lrange(TAG, 0, -1) result = (r.hget(key, 'data') for key in keys) print('total', ilen(result)) # TODO could use lmove for atomic operations? def write2(): for obj in items(): r.lpush(TAG, obj) def read2(): result = r.lrange(TAG, 0, -1) print('total', ilen(result)) reset() a = time() write2() b = time() logger.info(f'writing took {b - a:.1f}s') a = time() read2() b = time() logger.info(f'reading took {b - a:.1f}s') # with read()/write() # 100000 strings: # 2023-09-09 01:50:23.498 | INFO | __main__::37 - writing took 13.1s # 2023-09-09 01:50:30.052 | INFO | __main__::42 - reading took 6.6s # hmm kinda slow.. # with read2/write2, writing about 7secs, and reading is instantaneous?? # for 1M objects, writing took 60 secs, and reading 0.2s? # lol could be promising... # I guess it's not iterative, but could retrieve items in batches? 
================================================ FILE: mypy.ini ================================================ [mypy] pretty = True show_error_context = True show_column_numbers = True show_error_end = True check_untyped_defs = True # see https://mypy.readthedocs.io/en/stable/error_code_list2.html warn_redundant_casts = True strict_equality = True warn_unused_ignores = True enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable # an example of suppressing # [mypy-my.config.repos.pdfannots.pdfannots] # ignore_errors = True ================================================ FILE: pyproject.toml ================================================ # see https://github.com/karlicoss/pymplate for up-to-date reference [project] dynamic = ["version"] # version is managed by build backend name = "cachew" dependencies = [ "platformdirs", # default cache dir "sqlalchemy>=1.0", # cache DB interaction "orjson", # fast json serialization "typing-extensions",# for depreceated decorator ] requires-python = ">=3.12" ## these need to be set if you're planning to upload to pypi # description = "TODO" license = {file = "LICENSE.txt"} authors = [ {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, ] maintainers = [ {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, ] # keywords = [] # # see: http://pypi.python.org/pypi?%3Aaction=list_classifiers # classifiers = [ # ] [project.urls] Homepage = "https://github.com/karlicoss/cachew" ## [project.optional-dependencies] optional = [ "colorlog", ] [dependency-groups] # TODO: not sure, on the one hand could just use 'standard' dev dependency group # On the other hand, it's a bit annoying that it's always included by default? 
# To make sure it's not included, need to use `uv run --exact --no-default-groups ...` testing = [ "pytest>=9", # need version 9 for proper namespace package support "ruff", "pytz", "more-itertools", "patchy", # for injecting sleeps and testing concurrent behaviour "enlighten", # used in logging helper, but not really required "cattrs", # benchmarking alternative marshalling implementation "pyinstrument", # for profiling from within tests "codetiming", # Timer context manager ] typecheck = [ { include-group = "testing" }, "mypy", "lxml", # for mypy html coverage "ty>=0.0.3", "types-pytz", # optional runtime only dependency "cachew[optional]", ] [build-system] requires = ["hatchling", "hatch-vcs"] build-backend = "hatchling.build" # unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894 [tool.hatch.build.targets.wheel] packages = ["src/cachew"] [tool.hatch.version] source = "vcs" [tool.hatch.version.raw-options] version_scheme = "python-simplified-semver" local_scheme = "dirty-tag" ================================================ FILE: pytest.ini ================================================ [pytest] # discover files that don't follow test_ naming. Useful to keep tests along with the source code python_files = *.py # this is necessary for --pyargs to discover implicit namespace packages correctly consider_namespace_packages = true # see https://docs.pytest.org/en/stable/reference/reference.html#confval-strict # disable for now -- some macos tests ('file backend') are flaky # strict = true addopts = # prevent pytest cache from being created... 
it craps into project dir and I never use it anyway -p no:cacheprovider # -rap to print tests summary even when they are successful -rap --verbose # otherwise it won't discover doctests --doctest-modules # show all test durations (unless they are too short) --durations=0 ================================================ FILE: ruff.toml ================================================ line-length = 120 # impacts import sorting lint.extend-select = [ "ALL", ] lint.ignore = [ "D", # annoying nags about docstrings "N", # pep naming "TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives "FIX", # complains about fixmes/todos -- annoying "TD", # complains about todo formatting -- too annoying "ANN", # missing type annotations? seems way to strict though "EM" , # suggests assigning all exception messages into a variable first... pretty annoying ### too opinionated style checks "E501", # too long lines "E731", # assigning lambda instead of using def "E741", # Ambiguous variable name: `l` "E742", # Ambiguous class name: `O "E401", # Multiple imports on one line "F403", # import *` used; unable to detect undefined names ### ### "E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing.. "F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew) ## might be nice .. 
but later and I don't wanna make it strict "E402", # Module level import not at top of file ### these are just nitpicky, we usually know better "PLR0911", # too many return statements "PLR0912", # too many branches "PLR0913", # too many function arguments "PLR0915", # too many statements "PLR1714", # consider merging multiple comparisons "PLR2044", # line with empty comment "PLR5501", # use elif instead of else if "PLR2004", # magic value in comparison -- super annoying in tests ### "PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check "B009", # calling gettattr with constant attribute -- this is useful to convince mypy "B010", # same as above, but setattr "B017", # pytest.raises(Exception) "B023", # seems to result in false positives? # complains about useless pass, but has sort of a false positive if the function has a docstring? # this is common for click entrypoints (e.g. in __main__), so disable "PIE790", # a bit too annoying, offers to convert for loops to list comprehension # , which may heart readability "PERF401", # suggests no using exception in for loops # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost" "PERF203", "RET504", # unnecessary assignment before returning -- that can be useful for readability "RET505", # unnecessary else after return -- can hurt readability "PLW0603", # global variable update.. we usually know why we are doing this "PLW2901", # for loop variable overwritten, usually this is intentional "PT011", # pytest raises is too broad "COM812", # trailing comma missing -- mostly just being annoying with long multiline strings "TRY003", # suggests defining exception messages in exception class -- kinda annoying "TRY201", # raise without specifying exception name -- sometimes hurts readability "TRY400", # a bit dumb, and results in false positives (see https://github.com/astral-sh/ruff/issues/18070) "TRY401", # redundant exception in logging.exception call? 
TODO double check, might result in excessive logging "TID252", # Prefer absolute imports over relative imports from parent modules ## too annoying "T20", # just complains about prints and pprints (TODO maybe consider later?) "Q", # flake quotes, too annoying "C90", # some complexity checking "G004", # logging statement uses f string "ERA001", # commented out code "SLF001", # private member accessed "BLE001", # do not catch 'blind' Exception "INP001", # complains about implicit namespace packages "SIM102", # if statements collapsing, often hurts readability "SIM103", # multiple conditions collapsing, often hurts readability "SIM105", # suggests using contextlib.suppress instad of try/except -- this wouldn't be mypy friendly "SIM108", # suggests using ternary operation instead of if -- hurts readability "SIM110", # suggests using any(...) instead of for look/return -- hurts readability "SIM117", # suggests using single with statement instead of nested -- doesn't work in tests "RSE102", # complains about missing parens in exceptions ## "PLC0415", # "imports should be at the top level" -- not realistic "ISC001", # implicit string concatenation -- we do use it in tests ] extend-exclude = [ "src/cachew/legacy.py", # TODO dunno, remove it for good? ] ================================================ FILE: src/cachew/__init__.py ================================================ import fnmatch import functools import importlib.metadata import inspect import json import logging import os import stat import warnings from collections.abc import Callable, Iterable from dataclasses import dataclass from pathlib import Path from typing import ( TYPE_CHECKING, Any, Literal, cast, get_args, get_origin, get_type_hints, overload, ) try: # orjson might not be available on some architectures, so let's make it defensive just in case from orjson import dumps as orjson_dumps from orjson import loads as orjson_loads except: warnings.warn("orjson couldn't be imported. 
It's _highly_ recommended for better caching performance", stacklevel=2) def orjson_dumps(*args, **kwargs): # type: ignore[misc] # sqlite needs a blob return json.dumps(*args, **kwargs).encode('utf8') orjson_loads = json.loads # ty: ignore[invalid-assignment] import platformdirs from .backend.common import AbstractBackend from .backend.file import FileBackend from .backend.sqlite import SqliteBackend from .common import CachewException, SourceHash, TypeNotSupported from .logging_helper import make_logger from .marshall.cachew import CachewMarshall, build_schema from .utils import resolve_type_parameters # in case of changes in the way cachew stores data, this should be changed to discard old caches CACHEW_VERSION: str = importlib.metadata.version(__name__) type PathIsh = Path | str Backend = Literal['sqlite', 'file'] class settings: ''' Global settings, you can override them after importing cachew ''' ''' Toggle to disable caching ''' ENABLE: bool = True DEFAULT_CACHEW_DIR: PathIsh = Path(platformdirs.user_cache_dir('cachew')) ''' Set to true if you want to fail early. Otherwise falls back to non-cached version ''' THROW_ON_ERROR: bool = False DEFAULT_BACKEND: Backend = 'sqlite' def get_logger() -> logging.Logger: return make_logger(__name__) BACKENDS: dict[Backend, type[AbstractBackend]] = { 'file': FileBackend, 'sqlite': SqliteBackend, } type PathProvider[**P] = PathIsh | Callable[P, PathIsh] type HashFunction[**P] = Callable[P, SourceHash] def default_hash(*args, **kwargs) -> SourceHash: # TODO eh, demand hash? it's not safe either... ugh # can lead to werid consequences otherwise.. return str(args + tuple(sorted(kwargs.items()))) # good enough?? 
# TODO give it as an example in docs def mtime_hash(path: Path, *args, **kwargs) -> SourceHash: mt = path.stat().st_mtime return default_hash(f'{path}.{mt}', *args, **kwargs) Failure = str # deliberately not a type =, used in type checks type Kind = Literal['single', 'multiple'] type Inferred = tuple[Kind, type[Any]] def infer_return_type(func) -> Failure | Inferred: """ >>> def const() -> int: ... return 123 >>> infer_return_type(const) ('single', ) >>> from typing import Optional >>> def first_character(s: str) -> Optional[str]: ... return None if len(s) == 0 else s[0] >>> kind, opt = infer_return_type(first_character) >>> # in 3.8, Optional[str] is printed as Union[str, None], so need to hack around this >>> (kind, opt == Optional[str]) ('single', True) # tuple is an iterable.. but presumably should be treated as a single value >>> from typing import Tuple >>> def a_tuple() -> Tuple[int, str]: ... return (123, 'hi') >>> infer_return_type(a_tuple) ('single', tuple[int, str]) >>> from typing import Collection, NamedTuple >>> class Person(NamedTuple): ... name: str ... age: int >>> def person_provider() -> Collection[Person]: ... return [] >>> infer_return_type(person_provider) ('multiple', ) >>> def single_str() -> str: ... return 'hello' >>> infer_return_type(single_str) ('single', ) >>> def single_person() -> Person: ... return Person(name="what", age=-1) >>> infer_return_type(single_person) ('single', ) >>> from typing import Sequence >>> def int_provider() -> Sequence[int]: ... return (1, 2, 3) >>> infer_return_type(int_provider) ('multiple', ) >>> from typing import Iterator >>> def union_provider() -> Iterator[str | int]: ... yield 1 ... yield 'aaa' >>> infer_return_type(union_provider) ('multiple', str | int) >>> from typing import Iterator >>> type Str = str >>> type Int = int >>> type IteratorStrInt = Iterator[Str | Int] >>> def iterator_str_int() -> IteratorStrInt: ... yield 1 ... 
yield 'aaa' >>> infer_return_type(iterator_str_int) ('multiple', str | int) # a bit of an edge case >>> from typing import Tuple >>> def empty_tuple() -> Iterator[Tuple[()]]: ... yield () >>> infer_return_type(empty_tuple) ('multiple', tuple[()]) ... # doctest: +ELLIPSIS >>> def untyped(): ... return 123 >>> infer_return_type(untyped) 'no return type annotation...' >>> from typing import List >>> class Custom: ... pass >>> def unsupported() -> Custom: ... return Custom() >>> infer_return_type(unsupported) "can't infer type from : can't cache " >>> def unsupported_list() -> List[Custom]: ... return [Custom()] >>> infer_return_type(unsupported_list) "can't infer type from list[cachew.Custom]: can't cache " """ try: hints = get_type_hints(func) except Exception as ne: # get_type_hints might fail if types are forward defined or missing # see test_future_annotation for an example return str(ne) rtype = hints.get('return', None) if rtype is None: return f"no return type annotation on {func}" rtype = resolve_type_parameters(rtype) def bail(reason: str) -> str: return f"can't infer type from {rtype}: " + reason # first we wanna check if the top level type is some sort of iterable that makes sense ot cache # e.g. List/Sequence/Iterator etc return_multiple = _returns_multiple(rtype) if return_multiple: # then the actual type to cache will be the argument of the top level one args = get_args(rtype) if args is None: return bail("has no __args__") if len(args) != 1: return bail(f"wrong number of __args__: {args}") (cached_type,) = args else: cached_type = rtype try: build_schema(Type=cached_type) except TypeNotSupported as ex: return bail(f"can't cache {ex.type_}") return ('multiple' if return_multiple else 'single', cached_type) def _returns_multiple(rtype) -> bool: origin = get_origin(rtype) if origin is None: return False if origin is tuple: # usually tuples are more like single values rather than a sequence? 
(+ this works for namedtuple) return False try: return issubclass(origin, Iterable) except TypeError: # that would happen if origin is not a 'proper' type, e.g. is a Union or something # seems like exception is the easiest way to check return False # https://stackoverflow.com/questions/653368/how-to-create-a-python-decorator-that-can-be-used-either-with-or-without-paramet def doublewrap(f): @functools.wraps(f) def new_dec(*args, **kwargs): if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): # actual decorated function return f(args[0]) else: # decorator arguments return lambda realf: f(realf, *args, **kwargs) return new_dec def cachew_error(e: Exception, *, logger: logging.Logger) -> None: if settings.THROW_ON_ERROR: # TODO would be nice to throw from the original code line -- maybe mess with the stack here? raise e logger.error("error while setting up cache, falling back to non-cached version") logger.exception(e) use_default_path = cast(Path, object()) # using cachew_impl here just to use different signatures during type checking (see below) @doublewrap def cachew_impl[**P]( func=None, # TODO should probably type it after switch to python 3.10/proper paramspec cache_path: PathProvider[P] | None = use_default_path, *, force_file: bool = False, cls: type | tuple[Kind, type] | None = None, depends_on: HashFunction[P] = default_hash, logger: logging.Logger | None = None, chunk_by: int = 100, # NOTE: allowed values for chunk_by depend on the system. # some systems (to be more specific, sqlite builds), it might be too large and cause issues # ideally this would be more defensive/autodetected, maybe with a warning? # you can use 'test_many' to experiment # - too small values (e.g. 10) are slower than 100 (presumably, too many sql statements) # - too large values (e.g. 10K) are slightly slower as well (not sure why?) synthetic_key: str | None = None, backend: Backend | None = None, **kwargs, ): r""" Database-backed cache decorator. TODO more description? 
# TODO use this doc in readme? :param cache_path: if not set, `cachew.settings.DEFAULT_CACHEW_DIR` will be used. :param force_file: if set to True, assume `cache_path` is a regular file (instead of a directory) :param cls: if not set, cachew will attempt to infer it from return type annotation. See :func:`infer_return_type` and :func:`cachew.tests.test_cachew.test_return_type_inference`. :param depends_on: hash function to determine whether the underlying . Can potentially benefit from the use of side effects (e.g. file modification time). TODO link to test? :param logger: custom logger, if not specified will use logger named `cachew`. See :func:`get_logger`. :return: iterator over original or cached items Usage example: >>> from typing import NamedTuple, Iterator >>> class Link(NamedTuple): ... url : str ... text: str ... >>> @cachew ... def extract_links(archive_path: str) -> Iterator[Link]: ... for i in range(5): ... # simulate slow IO ... # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours ... import time; time.sleep(1) ... yield Link(url=f'http://link{i}.org', text=f'text {i}') ... >>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run [Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')] >>> from timeit import Timer >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1) ... # second run is cached, so should take less time >>> print(f"call took {int(res)} seconds") call took 0 seconds >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1) ... 
# now file has changed, so the cache will be discarded >>> print(f"call took {int(res)} seconds") call took 5 seconds """ if logger is None: module_name = getattr(func, '__module__', None) if module_name is not None and module_name in logging.Logger.manager.loggerDict: # if logger for the function's module already exists, reuse it logger = logging.getLogger(module_name) else: # rely on default cachew logger logger = get_logger() class AddFuncName(logging.LoggerAdapter): def process(self, msg, kwargs): extra = self.extra assert extra is not None func_name = extra['func_name'] return f'[{func_name}] {msg}', kwargs assert func is not None func_name = callable_name(func) adapter = AddFuncName(logger, {'func_name': func_name}) logger = cast(logging.Logger, adapter) hashf = kwargs.get('hashf') if hashf is not None: warnings.warn("'hashf' is deprecated. Please use 'depends_on' instead", stacklevel=2) depends_on = hashf # todo not very nice that ENABLE check is scattered across two places if not settings.ENABLE or cache_path is None: logger.debug('cache explicitly disabled (settings.ENABLE is False or cache_path is None)') return func if cache_path is use_default_path: cache_path = settings.DEFAULT_CACHEW_DIR logger.debug(f'no cache_path specified, using the default {cache_path}') use_kind: Kind | None = None use_cls: type | None = None if cls is not None: # defensive here since typing. objects passed as cls might fail on isinstance try: is_tuple = isinstance(cls, tuple) except: is_tuple = False if is_tuple: use_kind, use_cls = cls # type: ignore[misc] else: use_kind = 'multiple' use_cls = cls # type: ignore[assignment] # TODO fuzz infer_return_type, should never crash? inference_res = infer_return_type(func) if isinstance(inference_res, Failure): msg = f"failed to infer cache type: {inference_res}. See https://github.com/karlicoss/cachew#features for the list of supported types." 
if use_cls is None: ex = CachewException(msg) cachew_error(ex, logger=logger) return func else: # it's ok, assuming user knows better logger.debug(msg) assert use_kind is not None else: (inferred_kind, inferred_cls) = inference_res if use_cls is None: logger.debug(f'using inferred type {inferred_kind} {inferred_cls}') (use_kind, use_cls) = (inferred_kind, inferred_cls) else: assert use_kind is not None if (use_kind, use_cls) != inference_res: logger.warning( f"inferred type {inference_res} mismatches explicitly specified type {(use_kind, use_cls)}" ) # TODO not sure if should be more serious error... if use_kind == 'single': # pretend it's an iterable, this is just simpler for cachew_wrapper @functools.wraps(func) def _func(*args, **kwargs): return [func(*args, **kwargs)] else: _func = func assert use_cls is not None ctx = Context( func =_func, cache_path =cache_path, force_file =force_file, cls_ =use_cls, depends_on =depends_on, logger =logger, chunk_by =chunk_by, synthetic_key=synthetic_key, backend =backend, ) # fmt: skip # hack to avoid extra stack frame (see test_recursive*) @functools.wraps(func) def binder(*args, **kwargs): kwargs['_cachew_context'] = ctx res = cachew_wrapper(*args, **kwargs) if use_kind == 'single': lres = list(res) assert len(lres) == 1, lres # shouldn't happen return lres[0] return res return binder if TYPE_CHECKING: # we need two versions due to @doublewrap # this is when we just annotate as @cachew without any args @overload def cachew[F: Callable](fun: F) -> F: ... 
# NOTE: we won't really be able to make sure the args of cache_path are the same as args of the wrapped function # because when cachew() is called, we don't know anything about the wrapped function yet # but at least it works for checking that cachew_path and depdns_on have the same args :shrug: @overload def cachew[F, **P]( cache_path: PathProvider[P] | None = ..., *, force_file: bool = ..., cls: type | tuple[Kind, type] | None = ..., depends_on: HashFunction[P] = ..., logger: logging.Logger | None = ..., chunk_by: int = ..., synthetic_key: str | None = ..., backend: Backend | None = ..., ) -> Callable[[F], F]: ... def cachew(*args, **kwargs): # make ty happy raise NotImplementedError else: cachew = cachew_impl def callable_name(func: Callable) -> str: # some functions don't have __module__ mod = getattr(func, '__module__', None) or '' return f'{mod}:{getattr(func, "__qualname__")}' def callable_module_name(func: Callable) -> str | None: return getattr(func, '__module__', None) # could cache this, but might be worth not to, so the user can change it on the fly? def _parse_disabled_modules(logger: logging.Logger | None = None) -> list[str]: # e.g. 
CACHEW_DISABLE=my.browser:my.reddit if 'CACHEW_DISABLE' not in os.environ: return [] disabled = os.environ['CACHEW_DISABLE'] if disabled.strip() == '': return [] if ',' in disabled and logger: logger.warning( 'CACHEW_DISABLE contains a comma, but this expects a $PATH-like, colon-separated list; ' f'try something like CACHEW_DISABLE={disabled.replace(",", ":")}' ) # remove any empty strings incase did something like CACHEW_DISABLE=my.module:$CACHEW_DISABLE return [p for p in disabled.split(':') if p.strip() != ''] def _matches_disabled_module(module_name: str, pattern: str) -> bool: ''' >>> _matches_disabled_module('my.browser', 'my.browser') True >>> _matches_disabled_module('my.browser', 'my.*') True >>> _matches_disabled_module('my.browser', 'my') True >>> _matches_disabled_module('my.browser', 'my.browse*') True >>> _matches_disabled_module('my.browser.export', 'my.browser') True >>> _matches_disabled_module('mysomething.else', '*') # CACHEW_DISABLE='*' disables everything True >>> _matches_disabled_module('my.browser', 'my.br?????') # fnmatch supports unix-like patterns True >>> _matches_disabled_module('my.browser', 'my.browse') False >>> _matches_disabled_module('mysomething.else', 'my') # since not at '.' boundary, doesn't match False >>> _matches_disabled_module('mysomething.else', '') False >>> _matches_disabled_module('my.browser', 'my.browser.export') False ''' if module_name == pattern: return True module_parts = module_name.split('.') pattern_parts = pattern.split('.') # e.g. 
if pattern is 'module.submod.inner_module' and module is just 'module.submod' # theres no possible way for it to match if len(module_parts) < len(pattern_parts): return False for mp, pp in zip(module_parts, pattern_parts, strict=False): if fnmatch.fnmatch(mp, pp): continue return False return True def _module_is_disabled(module_name: str, logger: logging.Logger) -> bool: disabled_modules = _parse_disabled_modules(logger) for pat in disabled_modules: if _matches_disabled_module(module_name, pat): logger.debug( f"caching disabled for {module_name} (matched '{pat}' from 'CACHEW_DISABLE={os.environ['CACHEW_DISABLE']})'" ) return True return False # fmt: off _CACHEW_CACHED = 'cachew_cached' # TODO add to docs _SYNTHETIC_KEY = 'synthetic_key' _SYNTHETIC_KEY_VALUE = 'synthetic_key_value' _DEPENDENCIES = 'dependencies' # fmt: on @dataclass class Context[**P]: # fmt: off func : Callable cache_path : PathProvider[P] force_file : bool cls_ : type depends_on : HashFunction[P] logger : logging.Logger chunk_by : int synthetic_key: str | None backend : Backend | None def composite_hash(self, *args, **kwargs) -> dict[str, Any]: fsig = inspect.signature(self.func) # defaults wouldn't be passed in kwargs, but they can be an implicit dependency (especially inbetween program runs) defaults = { k: v.default for k, v in fsig.parameters.items() if v.default is not inspect.Parameter.empty } # but only pass default if the user wants it in the hash function? hsig = inspect.signature(self.depends_on) defaults = { k: v for k, v in defaults.items() if k in hsig.parameters or 'kwargs' in hsig.parameters } kwargs = {**defaults, **kwargs} schema = str(self.cls_) hash_parts = { 'cachew' : CACHEW_VERSION, 'schema' : schema, _DEPENDENCIES : str(self.depends_on(*args, **kwargs)), } synthetic_key = self.synthetic_key if synthetic_key is not None: hash_parts[_SYNTHETIC_KEY ] = synthetic_key hash_parts[_SYNTHETIC_KEY_VALUE] = kwargs[synthetic_key] # FIXME assert it's in kwargs in the first place? 
# FIXME support positional args too? maybe extract the name from signature somehow? dunno # need to test it return hash_parts # fmt: on def cachew_wrapper[**P]( *args, _cachew_context: Context[P], **kwargs, ): C = _cachew_context # fmt: off func = C.func cache_path = C.cache_path force_file = C.force_file cls = C.cls_ logger = C.logger chunk_by = C.chunk_by synthetic_key = C.synthetic_key backend_name = C.backend # fmt: on used_backend = backend_name or settings.DEFAULT_BACKEND func_name = callable_name(func) if not settings.ENABLE: logger.debug('cache explicitly disabled (settings.ENABLE is False)') yield from func(*args, **kwargs) return mod_name = callable_module_name(func) if mod_name is not None and _module_is_disabled(mod_name, logger): yield from func(*args, **kwargs) return def get_db_path() -> Path | None: db_path: Path if callable(cache_path): pp = cache_path(*args, **kwargs) if pp is None: logger.debug('cache explicitly disabled (cache_path is None)') # early return, in this case we just yield the original items from the function return None else: db_path = Path(pp) else: db_path = Path(cache_path) db_path.parent.mkdir(parents=True, exist_ok=True) # need to be atomic here, hence calling stat() once and then just using the results try: # note: stat follows symlinks (which is what we want) st = db_path.stat() except FileNotFoundError: # doesn't exist. 
then it's controlled by force_file if force_file: # just use db_path as is pass else: db_path.mkdir(parents=True, exist_ok=True) db_path = db_path / func_name else: # already exists, so just use callable name if it's a dir if stat.S_ISDIR(st.st_mode): db_path = db_path / func_name logger.debug(f'using {used_backend}:{db_path} for cache') return db_path def try_use_synthetic_key() -> None: if synthetic_key is None: return # attempt to use existing cache if possible, as a 'prefix' old_hash_d: dict[str, Any] = {} if old_hash is not None: try: old_hash_d = json.loads(old_hash) except json.JSONDecodeError: # possible if we used old cachew version (<=0.8.1), hash wasn't json pass hash_diffs = { k: new_hash_d.get(k) == old_hash_d.get(k) for k in (*new_hash_d.keys(), *old_hash_d.keys()) # the only 'allowed' differences for hash, otherwise need to recompute (e.g. if schema changed) if k not in {_SYNTHETIC_KEY_VALUE, _DEPENDENCIES} } cache_compatible = all(hash_diffs.values()) if not cache_compatible: return def missing_keys(cached: list[str], wanted: list[str]) -> list[str] | None: # FIXME assert both cached and wanted are sorted? since we rely on it # if not, then the user could use some custom key for caching (e.g. normalise filenames etc) # although in this case passing it into the function wouldn't make sense? if len(cached) == 0: # no point trying to reuse anything, cache should be empty? 
                return None
            if len(wanted) == 0:
                # similar, no way to reuse cache
                return None
            if cached[0] != wanted[0]:
                # there is no common prefix, so no way to reuse cache really
                return None
            last_cached = cached[-1]
            # ok, now actually figure out which items are missing
            for i, k in enumerate(wanted):
                if k > last_cached:
                    # ok, rest of items are missing
                    return wanted[i:]
            # otherwise too many things are cached, and we seem to want less
            return None

        new_values: list[str] = new_hash_d[_SYNTHETIC_KEY_VALUE]
        old_values: list[str] = old_hash_d[_SYNTHETIC_KEY_VALUE]
        missing = missing_keys(cached=old_values, wanted=new_values)
        if missing is not None:
            # can reuse cache: pass the cached items and the reduced key set to the function
            kwargs[_CACHEW_CACHED] = cached_items()
            kwargs[synthetic_key] = missing

    # set when the consumer abandons the generator mid-iteration (GeneratorExit);
    # checked in the outer except handler below
    early_exit = False

    # generator: runs the wrapped function, yields its items to the caller while
    # simultaneously serializing them into the backend in chunks of chunk_by
    def written_to_cache():
        nonlocal early_exit
        datas = func(*args, **kwargs)

        if isinstance(backend, FileBackend):
            # FIXME uhhh.. this is a bit crap
            # but in sqlite mode we don't want to publish new hash before we write new items
            # maybe should use tmp table for hashes as well?
            backend.write_new_hash(new_hash)
        else:
            # happens later for sqlite
            pass

        flush_blobs = backend.flush_blobs

        chunk: list[Any] = []

        def flush() -> None:
            nonlocal chunk
            if len(chunk) > 0:
                flush_blobs(chunk=chunk)
                chunk = []

        total_objects = 0
        for obj in datas:
            try:
                total_objects += 1
                yield obj
            except GeneratorExit:
                early_exit = True
                return

            dct = marshall.dump(obj)
            blob = orjson_dumps(dct)
            chunk.append(blob)
            if len(chunk) >= chunk_by:
                flush()
        flush()

        backend.finalize(new_hash)
        logger.info(f'wrote {total_objects} objects to cachew ({used_backend}:{db_path})')

    # generator: streams previously cached blobs back out, deserializing each one
    def cached_items():
        total_cached = backend.cached_blobs_total()
        total_cached_s = '' if total_cached is None else f'{total_cached} '
        logger.info(f'loading {total_cached_s}objects from cachew ({used_backend}:{db_path})')

        for blob in backend.cached_blobs():
            j = orjson_loads(blob)
            obj = marshall.load(j)
            yield obj

    # NOTE: annoyingly huge try/catch ahead...
# but it lets us save a function call, hence a stack frame # see test_recursive* try: db_path = get_db_path() if db_path is None: yield from func(*args, **kwargs) return BackendCls = BACKENDS[used_backend] new_hash_d = C.composite_hash(*args, **kwargs) new_hash: SourceHash = json.dumps(new_hash_d) logger.debug(f'new hash: {new_hash}') marshall: CachewMarshall[Any] = CachewMarshall(Type_=cls) with BackendCls(cache_path=db_path, logger=logger) as backend: old_hash = backend.get_old_hash() logger.debug(f'old hash: {old_hash}') if new_hash == old_hash: logger.debug('hash matched: loading from cache') yield from cached_items() return logger.debug('hash mismatch: computing data and writing to db') try_use_synthetic_key() got_write = backend.get_exclusive_write() if not got_write: # NOTE: this is the bit we really have to watch out for and not put in a helper function # otherwise it's causing an extra stack frame on every call # the rest (reading from cachew or writing to cachew) happens once per function call? so not a huge deal yield from func(*args, **kwargs) return # at this point we're guaranteed to have an exclusive write transaction yield from written_to_cache() except Exception as e: # sigh... see test_early_exit_shutdown... if early_exit and 'Cannot operate on a closed database' in str(e): return # todo hmm, kinda annoying that it tries calling the function twice? 
# but gonna require some sophisticated cooperation with the cached wrapper otherwise cachew_error(e, logger=logger) yield from func(*args, **kwargs) __all__ = [ 'CachewException', 'HashFunction', 'SourceHash', 'cachew', 'get_logger', ] ================================================ FILE: src/cachew/backend/common.py ================================================ import logging from abc import abstractmethod from collections.abc import Iterator, Sequence from pathlib import Path from ..common import SourceHash class AbstractBackend: @abstractmethod def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None: raise NotImplementedError @abstractmethod def __enter__(self): raise NotImplementedError def __exit__(self, *args) -> None: raise NotImplementedError def get_old_hash(self) -> SourceHash | None: raise NotImplementedError def cached_blobs_total(self) -> int | None: raise NotImplementedError def cached_blobs(self) -> Iterator[bytes]: raise NotImplementedError def get_exclusive_write(self) -> bool: ''' Returns whether it actually managed to get it ''' raise NotImplementedError def write_new_hash(self, new_hash: SourceHash) -> None: raise NotImplementedError def flush_blobs(self, chunk: Sequence[bytes]) -> None: raise NotImplementedError def finalize(self, new_hash: SourceHash) -> None: raise NotImplementedError ================================================ FILE: src/cachew/backend/file.py ================================================ import logging from collections.abc import Iterator, Sequence from pathlib import Path from typing import ( BinaryIO, ) from ..common import SourceHash from .common import AbstractBackend class FileBackend(AbstractBackend): jsonl: Path jsonl_tmp: Path jsonl_fr: BinaryIO | None jsonl_tmp_fw: BinaryIO | None def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None: self.logger = logger self.jsonl = cache_path self.jsonl_tmp = Path(str(self.jsonl) + '.tmp') self.jsonl_fr = None self.jsonl_tmp_fw = None 
    def __enter__(self) -> 'FileBackend':
        # open the existing cache file for reading if it's there;
        # absence just means there is no cache yet (jsonl_fr stays None)
        try:
            self.jsonl_fr = self.jsonl.open('rb')
        except FileNotFoundError:
            self.jsonl_fr = None
        return self

    def __exit__(self, *args) -> None:
        if self.jsonl_tmp_fw is not None:
            # might still exist in case of early exit
            self.jsonl_tmp.unlink(missing_ok=True)
            # NOTE: need to unlink first
            # otherwise possible that someone else might open the file before we unlink it
            self.jsonl_tmp_fw.close()
        if self.jsonl_fr is not None:
            self.jsonl_fr.close()

    def get_old_hash(self) -> SourceHash | None:
        # hash is stored as the first line of the jsonl file
        if self.jsonl_fr is None:
            return None
        hash_line = self.jsonl_fr.readline().rstrip(b'\n')
        return hash_line.decode('utf8')

    def cached_blobs_total(self) -> int | None:
        # not really sure how to support that for a plaintext file?
        # could wc -l but it might be costly..
        return None

    def cached_blobs(self) -> Iterator[bytes]:
        # one serialized object per line, hash line already consumed by get_old_hash
        assert self.jsonl_fr is not None  # should be guaranteed by get_old_hash
        yield from self.jsonl_fr  # yields line by line

    def get_exclusive_write(self) -> bool:
        # NOTE: opening in x (exclusive write) mode just in case, so it throws if file exists
        try:
            self.jsonl_tmp_fw = self.jsonl_tmp.open('xb')
        except FileExistsError:
            self.jsonl_tmp_fw = None
            return False
        else:
            return True

    def write_new_hash(self, new_hash: SourceHash) -> None:
        assert self.jsonl_tmp_fw is not None
        self.jsonl_tmp_fw.write(new_hash.encode('utf8') + b'\n')

    def flush_blobs(self, chunk: Sequence[bytes]) -> None:
        fw = self.jsonl_tmp_fw
        assert fw is not None
        for blob in chunk:
            fw.write(blob)
            fw.write(b'\n')

    def finalize(self, new_hash: SourceHash) -> None:  # noqa: ARG002
        # TODO defensive??
self.jsonl_tmp.rename(self.jsonl) ================================================ FILE: src/cachew/backend/sqlite.py ================================================ import logging import sqlite3 import time import warnings from collections.abc import Iterator, Sequence from pathlib import Path import sqlalchemy import sqlalchemy.exc from sqlalchemy import Column, Table, event, text from sqlalchemy.dialects import sqlite from ..common import SourceHash from .common import AbstractBackend class SqliteBackend(AbstractBackend): def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None: self.logger = logger self.engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}', connect_args={'timeout': 0}) # NOTE: timeout is necessary so we don't lose time waiting during recursive calls # by default, it's several seconds? you'd see 'test_recursive' test performance degrade @event.listens_for(self.engine, 'connect') def set_sqlite_pragma(dbapi_connection, connection_record): # noqa: ARG001 # without wal, concurrent reading/writing is not gonna work # ugh. that's odd, how are we supposed to set WAL if the very fact of setting wal might lock the db? while True: try: dbapi_connection.execute('PRAGMA journal_mode=WAL') break except sqlite3.OperationalError as oe: if 'database is locked' not in str(oe): # ugh, pretty annoying that exception doesn't include database path for some reason raise RuntimeError(f'Error while setting WAL on {cache_path}') from oe time.sleep(0.1) self.connection = self.engine.connect() """ Erm... this is pretty confusing. https://docs.sqlalchemy.org/en/13/dialects/sqlite.html#transaction-isolation-level Somehow without this thing sqlalchemy logs BEGIN (implicit) instead of BEGIN TRANSACTION which actually works in sqlite... Judging by sqlalchemy/dialects/sqlite/base.py, looks like some sort of python sqlite driver problem?? 
test_transaction should check this behaviour """ @event.listens_for(self.connection, 'begin') def do_begin(conn): # NOTE there is also BEGIN CONCURRENT in newer versions of sqlite. could use it later? conn.execute(text('BEGIN DEFERRED')) self.meta = sqlalchemy.MetaData() self.table_hash = Table('hash', self.meta, Column('value', sqlalchemy.String)) # fmt: off # actual cache self.table_cache = Table('cache' , self.meta, Column('data', sqlalchemy.BLOB)) # temporary table, we use it to insert and then (atomically?) rename to the above table at the very end self.table_cache_tmp = Table('cache_tmp', self.meta, Column('data', sqlalchemy.BLOB)) # fmt: on def __enter__(self) -> 'SqliteBackend': # NOTE: deferred transaction self.transaction = self.connection.begin() # FIXME this is a bit crap.. is there a nicer way to use another ctx manager here? self.transaction.__enter__() return self def __exit__(self, *args) -> None: self.transaction.__exit__(*args) self.connection.close() self.engine.dispose() def get_old_hash(self) -> SourceHash | None: # first, try to do as much as possible read-only, benefiting from deferred transaction old_hashes: Sequence try: # not sure if there is a better way... cursor = self.connection.execute(self.table_hash.select()) except sqlalchemy.exc.OperationalError as e: # meh. not sure if this is a good way to handle this.. if 'no such table: hash' in str(e): old_hashes = [] else: raise e else: old_hashes = cursor.fetchall() assert len(old_hashes) <= 1, old_hashes # shouldn't happen old_hash: SourceHash | None if len(old_hashes) == 0: old_hash = None else: old_hash = old_hashes[0][0] # returns a tuple... 
        return old_hash

    def cached_blobs_total(self) -> int | None:
        # single-row COUNT(*) over the cache table
        [(total,)] = self.connection.execute(sqlalchemy.select(sqlalchemy.func.count()).select_from(self.table_cache))
        return total

    def cached_blobs(self) -> Iterator[bytes]:
        rows = self.connection.execute(self.table_cache.select())
        # by default, sqlalchemy wraps all results into Row object
        # this can cause quite a lot of overhead if you're reading many rows
        # it seems that in principle, sqlalchemy supports just returning bare underlying tuple from the dbapi
        # but from browsing the code it doesn't seem like this functionality exposed
        # if you're looking for cues, see
        # - ._source_supports_scalars
        # - ._generate_rows
        # - ._row_getter
        # by using this raw iterator we speed up reading the cache quite a bit
        # asked here https://github.com/sqlalchemy/sqlalchemy/discussions/10350
        raw_row_iterator = getattr(rows, '_raw_row_iterator', None)
        if raw_row_iterator is None:
            # fall back to the regular (slower) Row-wrapping iterator
            warnings.warn(
                "CursorResult._raw_row_iterator method isn't found. This could lead to degraded cache reading performance.",
                stacklevel=2,
            )
            row_iterator = rows
        else:
            row_iterator = raw_row_iterator()

        for (blob,) in row_iterator:
            yield blob

    def get_exclusive_write(self) -> bool:
        # NOTE on recursive calls
        # somewhat magically, they should work as expected with no extra database inserts?
# the top level call 'wins' the write transaction and once it's gathered all data, will write it # the 'intermediate' level calls fail to get it and will pass data through # the cached 'bottom' level is read only and will be yielded without a write transaction try: # first 'write' statement will upgrade transaction to write transaction which might fail due to concurrency # see https://www.sqlite.org/lang_transaction.html # NOTE: because of 'checkfirst=True', only the last .create will guarantee the transaction upgrade to write transaction self.table_hash.create(self.connection, checkfirst=True) # 'table' used to be old 'cache' table name, so we just delete it regardless # otherwise it might overinfalte the cache db with stale values self.connection.execute(text('DROP TABLE IF EXISTS `table`')) # NOTE: we have to use .drop and then .create (e.g. instead of some sort of replace) # since it's possible to have schema changes inbetween calls # checkfirst=True because it might be the first time we're using cache self.table_cache_tmp.drop(self.connection, checkfirst=True) self.table_cache_tmp.create(self.connection) except sqlalchemy.exc.OperationalError as e: if e.code == 'e3q8' and 'database is locked' in str(e): # someone else must be have won the write lock # not much we can do here # NOTE: important to close early, otherwise we might hold onto too many file descriptors during yielding # see test_recursive_deep # (normally connection is closed in SqliteBackend.__exit__) self.connection.close() # in this case all the callee can do is just to call the actual function return False else: raise e return True def flush_blobs(self, chunk: Sequence[bytes]) -> None: # uhh. this gives a huge speedup for inserting # since we don't have to create intermediate dictionaries # TODO move this to __init__? 
insert_into_table_cache_tmp_raw = str( self.table_cache_tmp.insert().compile(dialect=sqlite.dialect(paramstyle='qmark')) ) # I also tried setting paramstyle='qmark' in create_engine, but it seems to be ignored :( # idk what benefit sqlalchemy gives at this point, seems to just complicate things self.connection.exec_driver_sql(insert_into_table_cache_tmp_raw, [(c,) for c in chunk]) def finalize(self, new_hash: SourceHash) -> None: # delete hash first, so if we are interrupted somewhere, it mismatches next time and everything is recomputed self.connection.execute(self.table_hash.delete()) # checkfirst is necessary since it might not have existed in the first place # e.g. first time we use cache self.table_cache.drop(self.connection, checkfirst=True) # meh https://docs.sqlalchemy.org/en/14/faq/metadata_schema.html#does-sqlalchemy-support-alter-table-create-view-create-trigger-schema-upgrade-functionality # also seems like sqlalchemy doesn't have any primitives to escape table names.. sigh self.connection.execute(text(f"ALTER TABLE `{self.table_cache_tmp.name}` RENAME TO `{self.table_cache.name}`")) self.connection.execute(self.table_hash.insert().values([{'value': new_hash}])) ================================================ FILE: src/cachew/common.py ================================================ from dataclasses import dataclass # TODO better name to represent what it means? type SourceHash = str class CachewException(RuntimeError): pass @dataclass class TypeNotSupported(CachewException): type_: type reason: str def __str__(self) -> str: return f"{self.type_} isn't supported by cachew: {self.reason}. See https://github.com/karlicoss/cachew#features for the list of supported types." 
================================================ FILE: src/cachew/compat.py ================================================ import sys if sys.version_info[:2] >= (3, 13): from warnings import deprecated else: from typing_extensions import deprecated __all__ = ["deprecated"] ================================================ FILE: src/cachew/experimental.py ================================================ from typing import TYPE_CHECKING if not TYPE_CHECKING: from .compat import deprecated @deprecated("Exceptions are not an experimental feature anymore and enabled by default.") def enable_exceptions() -> None: pass @deprecated("Exceptions are not an experimental feature anymore and enabled by default.") def disable_exceptions() -> None: pass ================================================ FILE: src/cachew/extra.py ================================================ # todo Ideally, needs doublewraps as well? also typing helpers def mcachew(*args, **kwargs): """ Stands for 'Maybe cachew'. Defensive wrapper around @cachew to make it an optional dependency. """ try: import cachew except ModuleNotFoundError: import warnings warnings.warn( 'cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew', stacklevel=2, ) return lambda orig_func: orig_func else: return cachew.cachew(*args, **kwargs) from contextlib import contextmanager @contextmanager def disabled_cachew(): from . 
def get_union_args(cls) -> Optional[tuple[type]]:
    """Return the non-None member types of a typing.Union, or None when cls isn't a Union."""
    origin = getattr(cls, '__origin__', None)
    if origin != Union:
        return None
    NoneType = type(None)
    # Optional[X] is Union[X, None]; drop the NoneType member
    members = tuple(a for a in cls.__args__ if a is not NoneType)
    assert len(members) > 0
    return members  # ty: ignore[invalid-return-type]


def is_union(cls) -> bool:
    """True iff cls is a typing.Union (including Optional)."""
    return get_union_args(cls) is not None
# but practically, the difference seems to be pretty small, so perhaps fine for now impl = sqlalchemy.String cache_ok = True @property def python_type(self): return datetime def process_literal_param(self, value, dialect): raise NotImplementedError() # make pylint happy def process_bind_param(self, value: Optional[datetime], dialect) -> Optional[str]: # noqa: ARG002 if value is None: return None # ok, it's a bit hacky... attempt to preserve pytz infromation iso = value.isoformat() tz = getattr(value, 'tzinfo', None) if tz is None: return iso try: import pytz except ImportError: self.warn_pytz() return iso else: if isinstance(tz, pytz.BaseTzInfo): zone = tz.zone # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6 assert zone is not None, tz return iso + ' ' + zone else: return iso def process_result_value(self, value: Optional[str], dialect) -> Optional[datetime]: # noqa: ARG002 if value is None: return None spl = value.split(' ') dt = datetime.fromisoformat(spl[0]) if len(spl) <= 1: return dt zone = spl[1] # else attempt to decypher pytz tzinfo try: import pytz except ImportError: self.warn_pytz() return dt else: tz = pytz.timezone(zone) return dt.astimezone(tz) def warn_pytz(self) -> None: warnings.warn('install pytz for better timezone support while serializing with cachew', stacklevel=2) # a bit hacky, but works... class IsoDate(IsoDateTime): impl = sqlalchemy.String cache_ok = True @property def python_type(self): return date def process_literal_param(self, value, dialect): raise NotImplementedError() # make pylint happy def process_result_value(self, value: Optional[str], dialect) -> Optional[date]: # type: ignore[override] res = super().process_result_value(value, dialect) if res is None: return None return res.date() jtypes = (int, float, bool, type(None)) class ExceptionAdapter(sqlalchemy.TypeDecorator): ''' Enables support for caching Exceptions. 
def strip_optional(cls) -> tuple[type, bool]:
    """
    Unwrap Optional[X] into (X, True); any other type comes back unchanged as (cls, False).

    The doctest is written repr-free on purpose: comparing against class reprs
    (e.g. ``<class 'int'>``) is brittle across contexts, and the previous
    examples had their expected output mangled.

    >>> from typing import Optional, NamedTuple
    >>> strip_optional(Optional[int]) == (int, True)
    True
    >>> class X(NamedTuple):
    ...     x: int
    >>> strip_optional(X) == (X, False)
    True
    """
    is_opt: bool = False

    args = get_union_args(cls)
    if args is not None and len(args) == 1:
        # exactly one non-None member left -> this was Optional[X]
        cls = args[0]  # meh
        is_opt = True

    return (cls, is_opt)
if isinstance(tp, GA): return tp.__origin__ return tp NT = TypeVar('NT') # sadly, bound=NamedTuple is not working yet in mypy # https://github.com/python/mypy/issues/685 # also needs to support dataclasses? @dataclass class NTBinder(Generic[NT]): """ >>> class Job(NamedTuple): ... company: str ... title: Optional[str] >>> class Person(NamedTuple): ... name: str ... age: int ... job: Optional[Job] NTBinder is a helper class for inteacting with sqlite database. Hierarchy is flattened: >>> binder = NTBinder.make(Person) >>> [(c.name, type(c.type)) for c in binder.columns] ... # doctest: +NORMALIZE_WHITESPACE [('name', ), ('age', ), ('_job_is_null', ), ('job_company', ), ('job_title', )] >>> person = Person(name='alan', age=40, job=None) to_row converts object to a sql-friendly tuple. job=None, so we end up with True in _job_is_null field >>> tuple(binder.to_row(person)) ('alan', 40, True, None, None) from_row does reverse conversion >>> binder.from_row(('alan', 40, True, None, None)) Person(name='alan', age=40, job=None) >>> binder.from_row(('ann', 25, True, None, None, 'extra')) Traceback (most recent call last): ... cachew.common.CachewException: unconsumed items in iterator ['extra'] """ name: Optional[str] # None means toplevel type_: Types span: int # not sure if span should include optional col? primitive: bool optional: bool union: Optional[type] # helper, which isn't None if type is Union fields: Sequence[Any] # mypy can't handle cyclic definition at this point :( @staticmethod def make(tp: type[NT], name: Optional[str] = None) -> 'NTBinder[NT]': tp, optional = strip_optional(tp) # ty: ignore[invalid-assignment] union: Optional[type] fields: tuple[Any, ...] 
primitive: bool union_args = get_union_args(tp) if union_args is not None: CachewUnion = NamedTuple('_CachewUnionRepr', [(x.__name__, Optional[x]) for x in union_args]) # type: ignore[misc] union = CachewUnion primitive = False fields = (NTBinder.make(tp=CachewUnion, name='_cachew_union_repr'),) span = 1 else: union = None tp = strip_generic(tp) primitive = is_primitive(tp) if primitive: if name is None: name = '_cachew_primitive' # meh. presumably, top level if primitive: fields = () span = 1 else: annotations = typing.get_type_hints(tp) if annotations == {}: raise CachewException( f"{tp} (field '{name}'): doesn't look like a supported type to cache. See https://github.com/karlicoss/cachew#features for the list of supported types." ) fields = tuple(NTBinder.make(tp=ann, name=fname) for fname, ann in annotations.items()) span = sum(f.span for f in fields) + (1 if optional else 0) return NTBinder( name=name, type_=tp, # type: ignore[arg-type] span=span, primitive=primitive, optional=optional, union=union, fields=fields, ) @property def columns(self) -> list[Column]: return list(self.iter_columns()) # TODO not necessarily namedtuple? could be primitive type def to_row(self, obj: NT) -> tuple[Optional[Values], ...]: return tuple(self._to_row(obj)) def from_row(self, row: Iterable[Any]) -> NT: riter = iter(row) res = self._from_row(riter) remaining = list(islice(riter, 0, 1)) if len(remaining) != 0: raise CachewException(f'unconsumed items in iterator {remaining}') assert res is not None # nosec # help mypy; top level will not be None return res def _to_row(self, obj) -> Iterator[Optional[Values]]: if self.primitive: yield obj elif self.union is not None: CachewUnion = self.union (uf,) = self.fields # TODO assert only one of them matches?? 
union = CachewUnion(**{f.name: obj if isinstance(obj, f.type_) else None for f in uf.fields}) yield from uf._to_row(union) else: if self.optional: is_none = obj is None yield is_none else: is_none = False assert obj is not None # TODO hmm, that last assert is not very symmetric... if is_none: for _ in range(self.span - 1): yield None else: yield from chain.from_iterable(f._to_row(getattr(obj, f.name)) for f in self.fields) def _from_row(self, row_iter): if self.primitive: return next(row_iter) elif self.union is not None: CachewUnion = self.union # noqa: F841 (uf,) = self.fields # TODO assert only one of them is not None? union_params = [r for r in uf._from_row(row_iter) if r is not None] assert len(union_params) == 1, union_params return union_params[0] else: if self.optional: is_none = next(row_iter) else: is_none = False if is_none: for _ in range(self.span - 1): x = next(row_iter) assert x is None, x # huh. assert is kinda opposite of producing value return None else: return self.type_(*(f._from_row(row_iter) for f in self.fields)) # TODO not sure if we want to allow optionals on top level? def iter_columns(self) -> Iterator[Column]: used_names: set[str] = set() def col(name: str, tp) -> Column: while name in used_names: name = '_' + name used_names.add(name) return Column(name, tp) if self.primitive: if self.name is None: raise AssertionError yield col(self.name, PRIMITIVES[self.type_]) else: prefix = '' if self.name is None else self.name + '_' if self.optional: yield col(f'_{prefix}is_null', sqlalchemy.Boolean) for f in self.fields: for c in f.iter_columns(): yield col(f'{prefix}{c.name}', c.type) def __str__(self): lines = [' ' * level + str(x.name) + ('?' 
if x.optional else '') + f' ' for level, x in self.flatten()] return '\n'.join(lines) def __repr__(self): return str(self) def flatten(self, level=0): yield (level, self) for f in self.fields: yield from f.flatten(level=level + 1) def test_mypy_annotations() -> None: # mypy won't handle, so this has to be dynamic vs = [] for t in Types.__args__: # type: ignore[attr-defined] (arg,) = t.__args__ vs.append(arg) def types(ts): return sorted(ts, key=lambda t: str(t)) assert types(vs) == types(Values.__args__) # type: ignore[attr-defined] for p in PRIMITIVE_TYPES: assert p in Values.__args__ # type: ignore[attr-defined] @parametrize( ('tp', 'val'), [ (int, 22), (bool, False), (Optional[str], 'abacaba'), (Union[str, int], 1), ], ) def test_ntbinder_primitive(tp, val) -> None: b = NTBinder.make(tp, name='x') row = b.to_row(val) vv = b.from_row(list(row)) assert vv == val def test_unique_columns(tmp_path: Path) -> None: # noqa: ARG001 class Job(NamedTuple): company: str title: Optional[str] class Breaky(NamedTuple): job_title: int job: Optional[Job] assert [c.name for c in NTBinder.make(Breaky).columns] == [ 'job_title', '_job_is_null', 'job_company', '_job_title', ] ================================================ FILE: src/cachew/logging_helper.py ================================================ from __future__ import annotations import logging import os import warnings from functools import lru_cache from typing import TYPE_CHECKING def test() -> None: import sys from collections.abc import Callable M: Callable[[str], None] = lambda s: print(s, file=sys.stderr) ## prepare exception for later try: None.whatever # type: ignore[attr-defined] # noqa: B018 except Exception as e: ex = e ## M(" Logging module's defaults are not great:") l = logging.getLogger('default_logger') l.error( "For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level" ) M("\n The reason is that you need to remember to call basicConfig() first. 
Let's do it now:") logging.basicConfig() l.error( "OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number" ) M( "\n Also exception logging is kinda lame, doesn't print traceback by default unless you remember to pass exc_info:" ) l.exception(ex) # type: ignore[possibly-undefined] M( "\n\n With make_logger you get a reasonable logging format, colours (via colorlog library) and other neat things:" ) ll = make_logger('test') # No need for basicConfig! ll.info("default level is INFO") ll.debug("... so this shouldn't be displayed") ll.warning("warnings are easy to spot!") M("\n Exceptions print traceback by default now:") ll.exception(ex) M( "\n You can (and should) use it via regular logging.getLogger after that, e.g. let's set logging level to DEBUG now" ) logging.getLogger('test').setLevel(logging.DEBUG) ll.debug("... now debug messages are also displayed") DEFAULT_LEVEL = 'INFO' FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)-4d]{end} %(message)s' FORMAT_NOCOLOR = FORMAT.format(start='', end='') Level = int LevelIsh = Level | str | None def mklevel(level: LevelIsh) -> Level: if level is None: return logging.NOTSET if isinstance(level, int): return level return getattr(logging, level.upper()) def get_collapse_level() -> Level | None: # TODO not sure if should be specific to logger name? cl = os.environ.get('LOGGING_COLLAPSE', None) if cl is not None: return mklevel(cl) # legacy name, maybe deprecate? cl = os.environ.get('COLLAPSE_DEBUG_LOGS', None) if cl is not None: return logging.DEBUG return None def get_env_level(name: str) -> Level | None: PREFIX = 'LOGGING_LEVEL_' # e.g. 
LOGGING_LEVEL_my_hypothesis=debug # shell doesn't allow using dots in var names without escaping, so also support underscore syntax lvl = os.environ.get(PREFIX + name, None) or os.environ.get(PREFIX + name.replace('.', '_'), None) if lvl is not None: return mklevel(lvl) return None def setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> None: """ Wrapper to simplify logging setup. """ if isinstance(logger, str): logger = logging.getLogger(logger) if level is None: level = DEFAULT_LEVEL # env level always takes precedence env_level = get_env_level(logger.name) if env_level is not None: lvl = env_level else: lvl = mklevel(level) if logger.level == logging.NOTSET: # if it's already set, the user requested a different logging level, let's respect that logger.setLevel(lvl) _setup_handlers_and_formatters(name=logger.name) # cached since this should only be done once per logger instance @lru_cache(None) def _setup_handlers_and_formatters(name: str) -> None: logger = logging.getLogger(name) logger.addFilter(AddExceptionTraceback()) ch = logging.StreamHandler() collapse_level = get_collapse_level() ch = logging.StreamHandler() if collapse_level is None else CollapseLogsHandler(maxlevel=collapse_level) # default level for handler is NOTSET, which will make it process all messages # we rely on the logger to actually accept/reject log msgs logger.addHandler(ch) # this attribute is set to True by default, which causes log entries to be passed to root logger (e.g. if you call basicConfig beforehand) # even if log entry is handled by this logger ... not sure what's the point of this behaviour?? 
class AddExceptionTraceback(logging.Filter):
    """
    Filter that makes logger.error(exc) include the traceback automatically.

    By default logging only prints a traceback when exc_info is passed explicitly;
    this fills exc_info in whenever an exception object is logged at ERROR level
    and no exc_info was supplied. Records are never rejected (always returns True).
    """

    def filter(self, record: logging.LogRecord) -> bool:
        if record.levelname != 'ERROR':
            return True
        exc = record.msg
        if not isinstance(exc, BaseException):
            return True
        if record.exc_info not in (None, (None, None, None)):
            # caller already supplied exc_info -- leave it alone
            return True
        record.exc_info = (type(exc), exc, exc.__traceback__)
        return True
''' last: bool = False maxlevel: Level = logging.DEBUG # everything with less or equal level will be collapsed def __init__(self, *args, maxlevel: Level, **kwargs) -> None: super().__init__(*args, **kwargs) self.maxlevel = maxlevel def emit(self, record: logging.LogRecord) -> None: try: msg = self.format(record) cur = record.levelno <= self.maxlevel and '\n' not in msg if cur: if self.last: self.stream.write('\033[K' + '\r') # clear line + return carriage else: if self.last: self.stream.write('\n') # clean up after the last line self.last = cur columns, _ = os.get_terminal_size(0) # ugh. the columns thing is meh. dunno I guess ultimately need curses for that # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keywords (INFO/DEBUG/etc) self.stream.write(msg + ' ' * max(0, columns - len(msg)) + ('' if cur else '\n')) self.flush() except: self.handleError(record) def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger: logger = logging.getLogger(name) setup_logger(logger, level=level) return logger # ughh. hacky way to have a single enlighten instance per interpreter, so it can be shared between modules # not sure about this. I guess this should definitely be behind some flag # OK, when stdout is not a tty, enlighten doesn't log anything, good def get_enlighten(): # TODO could add env variable to disable enlighten for a module? 
class CachewMarshall[T](AbstractMarshall[T]):
    """
    Default marshall used by cachew: converts objects of type T to and from
    a json-friendly representation by delegating to a Schema built for T.
    """

    def __init__(self, Type_: type[T]) -> None:
        # schema is built once per marshall instance, so per-object
        # dump/load calls below stay cheap
        self.schema = build_schema(Type_)

    def dump(self, obj: T) -> Json:
        # serialize: T -> json-ish structure (tuples/dicts/primitives)
        return self.schema.dump(obj)

    def load(self, dct: Json) -> T:
        # deserialize: json-ish structure -> T
        return self.schema.load(dct)
@dataclass(slots=True) class Schema: type: Any @abstractmethod def dump(self, obj): raise NotImplementedError @abstractmethod def load(self, dct): raise NotImplementedError @dataclass(slots=True) class SPrimitive(Schema): def dump(self, obj): # NOTE: returning here directly (instead of calling identity lambda) gives about 20% speedup # I think custom types should have their own Schema subclass return obj # prim = primitives_to.get(self.type) # assert prim is not None # return prim(o) def load(self, dct): return dct # prim = primitives_from.get(self.type) # assert prim is not None # return prim(d) @dataclass(slots=True) class SDataclass(Schema): # using list of tuples instead of dict gives about 5% speedup fields: tuple[tuple[str, Schema], ...] def dump(self, obj): # TODO would be nice if we didn't create a dictionary here # considering it is going to be serialized to json anyway # maybe we need to yield json bits actually? return { # would be kinda nice if we didn't have to use getattr here # but I think for dataclass this is actually the fastest way # TODO for NamedTuples could just use them as tuples.. think about separating k: ks.dump(getattr(obj, k)) for k, ks in self.fields } def load(self, dct): # dict comprehension is meh, but not sure if there is a faster way? return self.type(**{ k: ks.load(dct[k]) for k, ks in self.fields }) # fmt: skip @dataclass(slots=True) class SUnion(Schema): # it's a bit faster to cache indices here, gives about 15% speedup args: tuple[tuple[int, Schema], ...] def dump(self, obj): if obj is None: # if it's a None, then doesn't really matter how to serialize and deserialize it return (0, None) # TODO could do a bit of magic here and remember the last index that worked? # that way if some objects dominate the Union, the first isinstance would always work for tidx, a in self.args: if isinstance(obj, a.type): # this takes quite a lot of time (sort of expected?) 
# using lists instead of dicts gives a bit of a speedup (about 15%) # so probably worth it even though a bit cryptic # also could add a tag or something? # NOTE: using tuple instead of list gives a tiiny speedup jj = a.dump(obj) return (tidx, jj) # { # '__union_index__': tidx, # '__value__': jj, # } raise RuntimeError(f"shouldn't happen: {self.args} {obj}") def load(self, dct): # tidx = d['__union_index__'] # s = self.args[tidx] # return s.load(d['__value__']) tidx, val = dct if val is None: # counterpart for None handling in .dump method return None _, s = self.args[tidx] return s.load(val) @dataclass(slots=True) class SList(Schema): arg: Schema def dump(self, obj): return tuple(self.arg.dump(i) for i in obj) def load(self, dct): return [self.arg.load(i) for i in dct] @dataclass(slots=True) class STuple(Schema): args: tuple[Schema, ...] def dump(self, obj): return tuple(a.dump(i) for a, i in zip(self.args, obj, strict=True)) def load(self, dct): return tuple(a.load(i) for a, i in zip(self.args, dct, strict=True)) @dataclass(slots=True) class SSequence(Schema): arg: Schema def dump(self, obj): return tuple(self.arg.dump(i) for i in obj) def load(self, dct): return tuple(self.arg.load(i) for i in dct) @dataclass(slots=True) class SDict(Schema): ft: SPrimitive tt: Schema def dump(self, obj): return { k: self.tt.dump(v) for k, v in obj.items() } # fmt: skip def load(self, dct): return { k: self.tt.load(v) for k, v in dct.items() } # fmt: skip # TODO unify with primitives? JTypes = {int, str, type(None), float, bool} def _exc_helper(args): for a in args: at = type(a) if at in JTypes: yield a elif issubclass(at, date): # TODO would be nice to restore datetime from cache too # maybe generally save exception as a union? or intact and let orjson save it? yield a.isoformat() else: yield str(a) # not much we can do.. 
@dataclass(slots=True) class SException(Schema): def dump(self, obj: Exception) -> Json: return tuple(_exc_helper(obj.args)) def load(self, dct: Json): return self.type(*dct) try: # defensive to avoid dependency on pytz when we switch to python >= 3.9 import pytz except ModuleNotFoundError: # dummy, this is only needed for isinstance check below class pytz_BaseTzInfo: zone: str def make_tz_pytz(zone: str): raise RuntimeError(f"Install pytz to deserialize {zone}") else: pytz_BaseTzInfo = pytz.BaseTzInfo # type: ignore[misc,assignment] make_tz_pytz = pytz.timezone # just ints to avoid inflating db size # for now, we try to preserve actual timezone object just in case since they do have somewhat incompatible apis _TZTAG_ZONEINFO = 1 _TZTAG_PYTZ = 2 @dataclass(slots=True) class SDatetime(Schema): def dump(self, obj: datetime) -> Json: iso = obj.isoformat() tz = obj.tzinfo if tz is None: return (iso, None, None) if isinstance(tz, ZoneInfo): return (iso, tz.key, _TZTAG_ZONEINFO) elif isinstance(tz, pytz_BaseTzInfo): zone = tz.zone # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6 assert zone is not None, (obj, tz) return (iso, zone, _TZTAG_PYTZ) else: return (iso, None, None) def load(self, dct: tuple): iso, zone, zone_tag = dct dt = datetime.fromisoformat(iso) if zone is None: return dt make_tz = ZoneInfo if zone_tag == _TZTAG_ZONEINFO else make_tz_pytz tz = make_tz(zone) return dt.astimezone(tz) @dataclass(slots=True) class SDate(Schema): def dump(self, obj: date) -> Json: return obj.isoformat() def load(self, dct: str): return date.fromisoformat(dct) PRIMITIVES = { # int and float are handled a bit differently to allow implicit casts # isinstance(.., Real) works both for int and for float # Real can't be serialized back, but if you look in SPrimitive, it leaves the values intact anyway # since the actual serialization of primitives is handled by orjson int: Real, float: Real, str: str, 
def build_schema(Type) -> Schema:
    """
    Recursively build a Schema for Type, dispatching on its (erased) origin.

    Handles: primitives/Any, Exception/datetime/date subclasses,
    dataclasses & NamedTuples, Unions, list, tuple/Sequence, dict.
    Raises TypeNotSupported for anything it can't serialize.
    """
    # just to avoid confusion in case of weirdness with stringish type annotations
    assert not isinstance(Type, str), Type

    Type = resolve_type_parameters(Type)

    ptype = PRIMITIVES.get(Type)
    if ptype is not None:
        return SPrimitive(type=ptype)

    origin = get_origin(Type)

    # origin is 'unsubscripted/erased' version of type
    # if origin is NOT None, it's some sort of generic type
    if origin is None:
        # plain (non-generic) class: check special-cased bases first
        if issubclass(Type, Exception):
            return SException(type=Type)
        if issubclass(Type, datetime):
            # NOTE: datetime before date -- datetime is a date subclass
            return SDatetime(type=Type)
        if issubclass(Type, date):
            return SDate(type=Type)
        if not (is_dataclass(Type) or is_namedtuple(Type)):
            raise TypeNotSupported(type_=Type, reason='unknown type')
        try:
            hints = get_type_hints(Type)
        except TypeError as te:
            # this can happen for instance on 3.9 if pipe syntax was used for Union types
            # would be nice to provide a friendlier error though
            raise TypeNotSupported(type_=Type, reason='failed to get type hints') from te
        fields = tuple((k, build_schema(t)) for k, t in hints.items())
        return SDataclass(
            type=Type,
            fields=fields,
        )

    args = get_args(Type)

    # both typing.Union and the X | Y form (types.UnionType)
    is_union = origin is Union or origin is types.UnionType
    if is_union:
        # We 'erasing' types (since generic types don't work with isinstance checks).
        # So we need to make sure the types are unique to make sure we can deserialise them.
        schemas = [build_schema(a) for a in args]
        # Real stands in for both int and float (implicit casts), so it's exempt
        union_types = [s.type for s in schemas if s.type is not Real]
        if len(set(union_types)) != len(union_types):
            raise TypeNotSupported(type_=Type, reason=f'runtime union arguments are not unique: {union_types}')
        return SUnion(
            type=origin,
            args=tuple(
                (tidx, s)
                for tidx, s in enumerate(schemas)
            ),
        )  # fmt: skip

    is_listish = origin is list
    if is_listish:
        (t,) = args
        return SList(
            type=origin,
            arg=build_schema(t),
        )

    # hmm check for is typing.Sequence doesn't pass for some reason
    # perhaps because it's a deprecated alias?
    is_tuplish = origin is tuple or origin is abc.Sequence
    if is_tuplish:
        if origin is tuple:
            # this is for Tuple[()], which is the way to represent empty tuple
            # before python 3.11, get_args for that gives ((),) instead of an empty tuple () as one might expect
            if args == ((),):
                args = ()
            return STuple(
                type=origin,
                args=tuple(build_schema(a) for a in args),
            )
        else:
            # abc.Sequence is homogeneous: a single element schema
            (t,) = args
            return SSequence(
                type=origin,
                arg=build_schema(t),
            )

    is_dictish = origin is dict
    if is_dictish:
        (ft, tt) = args
        fts = build_schema(ft)
        tts = build_schema(tt)
        # dict keys must serialize as json-native primitives
        assert isinstance(fts, SPrimitive)
        return SDict(
            type=origin,
            ft=fts,
            tt=tts,
        )

    raise RuntimeError(f"unsupported: {Type=} {origin=} {args=}")
# however if we define this inside the test function, it fails if from __future__ import annotations is present on the file.. type _IntType = int type _StrIntType = str | int ## # TODO customise with cattrs def test_serialize_and_deserialize() -> None: import pytest helper = _test_identity # primitives helper(1, int) helper('aaa', str) helper(None, type(None)) # TODO emit other value as none type? not sure what should happen # implicit casts, simple version helper(None, int) helper(None, str) helper(1, float) # implicit casts, inside other types # technically not type safe, but might happen in practice # doesn't matter how to deserialize None anyway so let's allow this helper(None, str | int) # old syntax helper(None, Union[str, int]) # noqa: UP007 # even though 1 is not isinstance(float), often it ends up as float in data # see https://github.com/karlicoss/cachew/issues/54 helper(1, float | str) helper(2, float | int) helper(2.0, float | int) helper((1, 2), tuple[int, float]) # optionals helper('aaa', str | None) helper(None, str | None) # old syntax helper('aaa', Optional[str]) # noqa: UP045 helper('aaa', Union[str, None]) # noqa: UP007 helper(None, Union[str, None]) # noqa: UP007 # lists/tuples/sequences # TODO test with from __future__ import annotations.. 
helper([1, 2, 3], list[int]) helper([1, 2, 3], Optional[List[int]]) # noqa: UP006,UP045 helper([1, 2, 3], Sequence[int], expected=(1, 2, 3)) helper((1, 2, 3), Sequence[int]) helper((1, 2, 3), tuple[int, int, int]) # old syntax helper([1, 2, 3], List[int]) # noqa: UP006 helper((1, 2, 3), Tuple[int, int, int]) # noqa: UP006 helper((1, 2, 3), Optional[tuple[int, int, int]]) # noqa: UP045 # dicts helper({'a': 'aa', 'b': 'bb'}, dict[str, str]) helper({'a': None, 'b': 'bb'}, dict[str, str | None]) helper({'a': 'aa', 'b': 'bb'}, dict[str, str]) # old syntax helper({'a': None, 'b': 'bb'}, Dict[str, Optional[str]]) # noqa: UP006,UP045 # unions helper('aaa', str | int) # old syntax helper(1, Union[str, int]) # noqa: UP007 # compounds of simple types helper(['1', 2, '3'], list[str | int]) # old syntax helper(['1', 2, '3'], list[Union[str, int]]) # noqa: UP007 # TODO need to add test for equivalent dataclasses @dataclass class Point: x: int y: int # dataclasses helper(Point(x=1, y=2), Point) # Namedtuple class NT(NamedTuple): first: str last: str helper(NT(first='aaa', last='bbb'), NT) @dataclass class WithJson: id: int raw_data: dict[str, Any] ## type aliases including new 3.12 type aliases # this works.. 
StrInt = str | int helper('aaa', StrInt) helper('aaa', _StrIntType) helper([1, 2, 3], list[_IntType]) @dataclass class TestTypeAlias: x: _IntType value: _StrIntType helper(TestTypeAlias(x=1, value='aaa'), TestTypeAlias) ## # json-ish stuff helper({}, dict[str, Any]) helper(WithJson(id=123, raw_data={'payload': 'whatever', 'tags': ['a', 'b', 'c']}), WithJson) helper([], list[Any]) # exceptions helper(RuntimeError('whatever!'), RuntimeError) # fmt: off helper([ RuntimeError('I', 'am', 'exception', 123), Point(x=1, y=2), Point(x=11, y=22), RuntimeError('more stuff'), RuntimeError(), ], list[RuntimeError | Point]) exc_with_datetime = Exception('I happenned on', datetime.fromisoformat('2021-04-03T10:11:12')) exc_with_datetime_exp = Exception('I happenned on', '2021-04-03T10:11:12') helper(exc_with_datetime, Exception, expected=exc_with_datetime_exp) # fmt: on # datetimes import pytz tz_london = pytz.timezone('Europe/London') dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S') dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S') dwinter_tz = tz_london.localize(dwinter) dsummer_tz = tz_london.localize(dsummer) dates_tz = [ dwinter_tz, dsummer_tz, ] tz_sydney = ZoneInfo('Australia/Sydney') ## these will have same local time (2025-04-06 02:01:00) in Sydney due to DST shift! 
## the second one will have fold=1 set to disambiguate utc_before_shift = datetime.fromisoformat('2025-04-05T15:01:00+00:00') utc_after__shift = datetime.fromisoformat('2025-04-05T16:01:00+00:00') ## sydney_before = utc_before_shift.astimezone(tz_sydney) sydney__after = utc_after__shift.astimezone(tz_sydney) dates_tz.extend([sydney_before, sydney__after]) dates = [ *dates_tz, dwinter, dsummer, dsummer.replace(tzinfo=UTC), ] for d in dates: _jj, dd = helper(d, datetime) assert str(d) == str(dd) # test that we preserve zone names if d in dates_tz: # this works both with pytz and zoneinfo without getting .zone or .key attributes assert str(d.tzinfo) == str(dd.tzinfo) assert helper(dsummer_tz, datetime)[0] == ('2020-08-03T01:02:03+01:00', 'Europe/London', _TZTAG_PYTZ) assert helper(dwinter, datetime)[0] == ('2020-02-03T01:02:03', None, None) assert helper(sydney_before, datetime)[0] == ('2025-04-06T02:01:00+11:00', 'Australia/Sydney', _TZTAG_ZONEINFO) assert helper(sydney__after, datetime)[0] == ('2025-04-06T02:01:00+10:00', 'Australia/Sydney', _TZTAG_ZONEINFO) assert helper(dwinter.date(), date)[0] == '2020-02-03' # unsupported types class NotSupported: pass with pytest.raises(RuntimeError, match=r".*NotSupported.* isn't supported by cachew"): helper([NotSupported()], list[NotSupported]) # edge cases helper((), tuple[()]) # unions of generic sequences and such # these don't work because the erased type of both is just 'list'.. 
# so there is no way to tell which one we need to construct :( with pytest.raises(TypeNotSupported, match=r".*runtime union arguments are not unique"): helper([1, 2, 3], list[int] | list[Exception]) with pytest.raises(TypeNotSupported, match=r".*runtime union arguments are not unique"): helper([1, 2, 3], list[Exception] | list[int]) ================================================ FILE: src/cachew/marshall/common.py ================================================ from abc import abstractmethod from typing import Any type Json = dict[str, Any] | tuple[Any, ...] | str | float | int | bool | None class AbstractMarshall[T]: @abstractmethod def dump(self, obj: T) -> Json: raise NotImplementedError @abstractmethod def load(self, dct: Json) -> T: raise NotImplementedError ================================================ FILE: src/cachew/py.typed ================================================ ================================================ FILE: src/cachew/pytest.py ================================================ """ Helpers to prevent depending on pytest in runtime """ import sys import typing under_pytest = 'pytest' in sys.modules if typing.TYPE_CHECKING or under_pytest: import pytest parametrize = pytest.mark.parametrize else: def parametrize(*_args, **_kwargs): def wrapper(f): return f return wrapper ================================================ FILE: src/cachew/tests/marshall.py ================================================ # ruff: noqa: ARG001 # ruff thinks pytest fixtures are unused arguments import shutil import sqlite3 import sys from dataclasses import dataclass from datetime import UTC, datetime from pathlib import Path from typing import Any, Literal import orjson import pytest from ..marshall.cachew import CachewMarshall from ..marshall.common import Json from .utils import ( gc_control, # noqa: F401 profile, running_on_ci, timer, ) Impl = Literal[ 'cachew', # our custom deserialization 'cattrs', 'legacy', # our legacy deserialization ] # don't 
include legacy by default, it's only here just for the sake of comparing once before switch Impls: list[Impl] = ['cachew', 'cattrs'] def do_test(*, test_name: str, Type, factory, count: int, impl: Impl = 'cachew') -> None: if count > 100 and running_on_ci: pytest.skip("test too heavy for CI, only meant to run manually") to_json: Any from_json: Any if impl == 'cachew': marshall = CachewMarshall(Type_=Type) to_json = marshall.dump from_json = marshall.load elif impl == 'legacy': from ..legacy import NTBinder # NOTE: legacy binder emits a tuple which can be inserted directly into the database # so 'json dump' and 'json load' should really be disregarded for this flavor # if you're comparing with implementation, you should compare # legacy serializing as the sum of serializing + json dump # that said, this way legacy will have a bit of an advantage since custom types (e.g. datetime) # would normally be handled by sqlalchemy instead binder = NTBinder.make(Type) to_json = binder.to_row from_json = binder.from_row elif impl == 'cattrs': from cattrs import Converter converter = Converter() from typing import get_args # TODO use later # from typing import Union, get_origin # import types # def is_union(type_) -> bool: # origin = get_origin(type_) # return origin is Union or origin is types.UnionType def union_structure_hook_factory(_): def union_hook(data, type_): args = get_args(type_) if data is None: # we don't try to coerce None into anything return None for t in args: try: res = converter.structure(data, t) except Exception: continue else: return res raise ValueError(f"Could not cast {data} to {type_}") return union_hook # borrowed from https://github.com/python-attrs/cattrs/issues/423 # uhh, this doesn't really work straightaway... 
# likely need to combine what cattr does with configure_tagged_union # converter.register_structure_hook_factory(is_union, union_structure_hook_factory) # configure_tagged_union( # union=Type, # converter=converter, # ) # NOTE: this seems to give a bit of speedup... maybe raise an issue or something? # fmt: off unstruct_func = converter._unstructure_func.dispatch(Type) # type: ignore[call-arg, misc] # about 20% speedup struct_func = converter._structure_func .dispatch(Type) # type: ignore[call-arg, misc] # TODO speedup # fmt: on to_json = unstruct_func # todo would be nice to use partial? but how do we bind a positional arg? from_json = lambda x: struct_func(x, Type) else: raise RuntimeError(impl) print(file=sys.stderr) # kinda annoying, pytest starts printing on the same line as test name with profile(test_name + ':baseline'), timer(f'building {count} objects of type {Type}'): objects = list(factory(count=count)) jsons: list[Json] = [None for _ in range(count)] with profile(test_name + ':serialize'), timer(f'serializing {count} objects of type {Type}'): for i in range(count): jsons[i] = to_json(objects[i]) # ty: ignore[invalid-assignment] strs: list[bytes] = [None for _ in range(count)] # type: ignore[misc] with profile(test_name + ':json_dump'), timer(f'json dump {count} objects of type {Type}'): for i in range(count): # TODO any orjson options to speed up? 
strs[i] = orjson.dumps(jsons[i]) db = Path('/tmp/cachew_test/db.sqlite') if db.parent.exists(): shutil.rmtree(db.parent) db.parent.mkdir() with profile(test_name + ':sqlite_dump'), timer(f'sqlite dump {count} objects of type {Type}'): with sqlite3.connect(db) as conn: conn.execute('CREATE TABLE data (value BLOB)') conn.executemany('INSERT INTO data (value) VALUES (?)', [(s,) for s in strs]) conn.close() strs2: list[bytes] = [None for _ in range(count)] # type: ignore[misc] with profile(test_name + ':sqlite_load'), timer(f'sqlite load {count} objects of type {Type}'): with sqlite3.connect(db) as conn: i = 0 for (value,) in conn.execute('SELECT value FROM data'): strs2[i] = value i += 1 conn.close() cache = db.parent / 'cache.jsonl' with profile(test_name + ':jsonl_dump'), timer(f'jsonl dump {count} objects of type {Type}'): with cache.open('wb') as fw: for s in strs: fw.write(s + b'\n') strs3: list[bytes] = [None for _ in range(count)] # type: ignore[misc] with profile(test_name + ':jsonl_load'), timer(f'jsonl load {count} objects of type {Type}'): i = 0 with cache.open('rb') as fr: for l in fr: l = l.rstrip(b'\n') strs3[i] = l i += 1 assert strs2[:100] + strs2[-100:] == strs3[:100] + strs3[-100:] # just in case jsons2: list[Json] = [None for _ in range(count)] with profile(test_name + ':json_load'), timer(f'json load {count} objects of type {Type}'): for i in range(count): # TODO any orjson options to speed up? 
jsons2[i] = orjson.loads(strs2[i]) objects2 = [None for _ in range(count)] with profile(test_name + ':deserialize'), timer(f'deserializing {count} objects of type {Type}'): for i in range(count): objects2[i] = from_json(jsons2[i]) # ty: ignore[invalid-argument-type] assert objects[:100] + objects[-100:] == objects2[:100] + objects2[-100:] @dataclass class Name: first: str last: str @pytest.mark.parametrize('impl', Impls) @pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000]) @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off']) def test_union_str_dataclass(impl: Impl, count: int, gc_control, request) -> None: # NOTE: previously was union_str_namedtuple, but adapted to work with cattrs for now # perf difference between datacalss/namedtuple here seems negligible so old benchmark results should apply if impl == 'cattrs': pytest.skip('TODO need to adjust the handling of Union types..') def factory(count: int): objects: list[str | Name] = [] for i in range(count): if i % 2 == 0: objects.append(str(i)) else: objects.append(Name(first=f'first {i}', last=f'last {i}')) return objects do_test(test_name=request.node.name, Type=str | Name, factory=factory, count=count, impl=impl) # OK, performance with calling this manually (not via pytest) is the same # do_test_union_str_dataclass(count=1_000_000, test_name='adhoc') @pytest.mark.parametrize('impl', Impls) @pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000]) @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off']) def test_datetimes(impl: Impl, count: int, gc_control, request) -> None: if impl == 'cattrs': pytest.skip('TODO support datetime with pytz for cattrs') import pytz def factory(*, count: int): tzs = [ pytz.timezone('Europe/Berlin'), UTC, pytz.timezone('America/New_York'), ] start = datetime.fromisoformat('1990-01-01T00:00:00') end = datetime.fromisoformat('2030-01-01T00:00:00') step = (end - start) / count for i in range(count): dt = start + step * i tz = tzs[i % 
len(tzs)] yield dt.replace(tzinfo=tz) do_test(test_name=request.node.name, Type=datetime, factory=factory, count=count, impl=impl) @pytest.mark.parametrize('impl', Impls) @pytest.mark.parametrize('count', [99, 1_000_000]) @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off']) def test_nested_dataclass(impl: Impl, count: int, gc_control, request) -> None: # NOTE: was previously named test_many_from_cachew @dataclass class UUU: xx: int yy: int @dataclass class TE2: value: int uuu: UUU value2: int def factory(*, count: int): for i in range(count): yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i) do_test(test_name=request.node.name, Type=TE2, factory=factory, count=count, impl=impl) # TODO next test should probs be runtimeerror? ================================================ FILE: src/cachew/tests/test_cachew.py ================================================ # ruff: noqa: ARG001 # ruff thinks pytest fixtures are unused arguments import hashlib import inspect import platform import string import sys import time import timeit from collections.abc import Iterable, Iterator, Sequence from concurrent.futures import ProcessPoolExecutor from contextlib import nullcontext from dataclasses import asdict, dataclass from datetime import UTC, date, datetime from itertools import chain, islice from pathlib import Path from random import Random from subprocess import check_call, check_output, run from time import sleep from typing import ( Any, NamedTuple, cast, ) import patchy import pytest from more_itertools import ilen, last, one, unique_everseen from .. 
import ( Backend, CachewException, cachew, callable_name, get_logger, settings, ) from .utils import ( gc_control, # noqa: F401 running_on_ci, ) logger = get_logger() @pytest.fixture(autouse=True) def set_default_cachew_dir(tmp_path: Path): tpath = tmp_path / 'cachew_default' settings.DEFAULT_CACHEW_DIR = tpath @pytest.fixture(autouse=True) def throw_on_errors(): # NOTE: in tests we always throw on errors, it's a more reasonable default for testing. # we still check defensive behaviour in test_defensive settings.THROW_ON_ERROR = True # TODO restore it? @pytest.fixture(autouse=True, params=['sqlite', 'file']) def set_backend(restore_settings, request): backend = request.param settings.DEFAULT_BACKEND = backend # TODO restore it?? @pytest.fixture def restore_settings(): orig = {k: v for k, v in settings.__dict__.items() if not k.startswith('__')} try: yield finally: for k, v in orig.items(): setattr(settings, k, v) class UUU(NamedTuple): xx: int yy: int def test_simple() -> None: # just make sure all the high level cachew stuff is working @cachew def fun() -> Iterable[UUU]: yield from [] list(fun()) def test_string_annotation_old() -> None: """ For some reason collections.abc.Iterable doesn't seem to work here on python <= 3.11 , it only sees 'UUU' as a string Keeping this just as a demonstration, probably not worth trying to support as it's fairly esoteric combo. 
""" from typing import Iterable as typing_Iterable # noqa: UP035 @cachew def fun() -> typing_Iterable['UUU']: yield from [] # should properly infer UUU type list(fun()) def test_string_annotation_new() -> None: @cachew def fun() -> Iterable['UUU']: yield from [] # should properly infer UUU type list(fun()) def test_custom_hash(tmp_path: Path) -> None: """ Demo of using argument's modification time to determine if underlying data changed """ src = tmp_path / 'source' src.write_text('0') entities = [ UUU(xx=1, yy=1), UUU(xx=2, yy=2), UUU(xx=3, yy=3), ] calls = 0 def get_path_version(path: Path): ns = path.stat().st_mtime_ns # hmm, this might be unreliable, sometimes mtime doesn't change even after modifications? # I suppose it takes some time for them to sync or something... # so let's compute md5 or something in addition.. md5 = hashlib.md5(path.read_bytes()).digest() return str((ns, md5)) @cachew( cache_path=tmp_path, depends_on=get_path_version, # when path is updated, underlying cache would be discarded ) def data(path: Path) -> Iterable[UUU]: nonlocal calls calls += 1 count = int(path.read_text()) return entities[:count] ldata = lambda: list(data(path=src)) assert len(ldata()) == 0 assert len(ldata()) == 0 assert len(ldata()) == 0 assert calls == 1 src.write_text('1') assert ldata() == entities[:1] assert ldata() == entities[:1] assert calls == 2 src.write_text('3') assert ldata() == entities assert ldata() == entities assert calls == 3 def test_caching(tmp_path: Path) -> None: @cachew(tmp_path) def data() -> Iterator[UUU]: time.sleep(1) for i in range(5): yield UUU(xx=i, yy=i) time.sleep(1) # https://stackoverflow.com/a/40385994/706389 template = """ def inner(_it, _timer{init}): {setup} _t0 = _timer() for _i in _it: retval = {stmt} _t1 = _timer() return _t1 - _t0, retval """ timeit.template = template # type: ignore[attr-defined] timer = timeit.Timer(lambda: len(list(data()))) t, cnt = cast(tuple[float, int], timer.timeit(number=1)) assert cnt == 5 assert t > 
5.0, 'should take at least 5 seconds' t, cnt = cast(tuple[float, int], timer.timeit(number=1)) assert cnt == 5 assert t < 2.0, 'should be pretty much instantaneous' def test_error(tmp_path: Path) -> None: ''' Test behaviour when the first time cache is initialized it ends up with an error ''' cache_file = tmp_path / 'cache' assert not cache_file.exists(), cache_file # just precondition should_raise = True @cachew(cache_file, force_file=True) def fun() -> Iterator[str]: yield 'string1' if should_raise: raise RuntimeError('oops') yield 'string2' with pytest.raises(RuntimeError, match='oops'): list(fun()) # vvv this would be nice but might be tricky because of the way sqlite works (i.e. wal mode creates a file) # assert not cache_file.exists(), cache_file # perhaps doesn't hurt either way as long this vvv works properly # shouldn't cache anything and crach again with pytest.raises(RuntimeError, match='oops'): list(fun()) should_raise = False assert list(fun()) == ['string1', 'string2'] def test_cache_path(tmp_path: Path) -> None: ''' Tests various ways of specifying cache path ''' calls = 0 def orig() -> Iterable[int]: nonlocal calls yield 1 yield 2 calls += 1 fun = cachew(tmp_path / 'non_existent_dir' / 'cache_dir')(orig) assert list(fun()) == [1, 2] assert calls == 1 assert list(fun()) == [1, 2] assert calls == 1 # dir by default cdir = tmp_path / 'non_existent_dir' / 'cache_dir' assert cdir.is_dir() cfile = one(cdir.glob('*')) assert cfile.name.startswith('cachew.tests.test_cachew:test_cache_path.') # treat None as "don't cache" fun = cachew(cache_path=None)(orig) assert list(fun()) == [1, 2] assert calls == 2 assert list(fun()) == [1, 2] assert calls == 3 f = tmp_path / 'a_file' f.touch() fun = cachew(cache_path=f)(orig) assert list(fun()) == [1, 2] assert calls == 4 assert list(fun()) == [1, 2] assert calls == 4 fun = cachew(tmp_path / 'name', force_file=True)(orig) assert list(fun()) == [1, 2] assert calls == 5 assert list(fun()) == [1, 2] assert calls == 5 # if 
passed force_file, also treat as file assert (tmp_path / 'name').is_file() # treat None as "don't cache" ('factory') # hmm not sure why mypy complains here.. might better if we get to use ParamSpec? fun = cachew(cache_path=lambda *args: None)(orig) # type: ignore[arg-type] # noqa: ARG005 assert list(fun()) == [1, 2] assert calls == 6 assert list(fun()) == [1, 2] assert calls == 7 # TODO this won't work at the moment # f.write_text('garbage') # not sure... on the one hand could just delete the garbage file and overwrite with db # on the other hand, wouldn't want to delete some user file by accident class UGood(NamedTuple): x: int class UBad: pass def test_unsupported_class(tmp_path: Path) -> None: with pytest.raises(CachewException, match=r'.*failed to infer cache type.*'): @cachew(cache_path=tmp_path) def fun() -> list[UBad]: return [UBad()] with pytest.raises(CachewException, match=r".*can't infer type from.*"): @cachew(cache_path=tmp_path) def fun2() -> Iterable[UGood | UBad]: yield UGood(x=1) yield UBad() yield UGood(x=2) class TE2(NamedTuple): value: int uuu: UUU value2: int # you can run one specific test (e.g. to profile) by passing it as -k to pytest # e.g. -k 'test_many[500000-False]' @pytest.mark.parametrize('count', [99, 500_000, 1_000_000]) @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off']) def test_many(count: int, tmp_path: Path, gc_control) -> None: if count > 99 and running_on_ci: pytest.skip("test would be too slow on CI, only meant to run manually") # should be a parametrized test perhaps src = tmp_path / 'source' src.touch() cache_path = tmp_path / 'test_many' @cachew(cache_path=cache_path, force_file=True) def iter_data() -> Iterator[TE2]: for i in range(count): # TODO also profile datetimes? 
yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i) a = time.time() assert ilen(iter_data()) == count # initial b = time.time() print(f'test_many: initial write to cache took {b - a:.1f}s', file=sys.stderr) print(f'test_many: cache size is {cache_path.stat().st_size / 10**6}Mb', file=sys.stderr) a = time.time() assert ilen(iter_data()) == count # hitting cache b = time.time() print(f'test_many: reading from cache took {b - a:.1f}s', file=sys.stderr) assert last(iter_data()) == TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1) # serializing to db # in-memory: 16 seconds # without transaction: 22secs # without transaction and size 100 chunks -- some crazy amount of time, as expected # with transaction: # about 17 secs to write 1M entries (just None) # chunking by 20K doesn't seem to help # chunking by 100 also gives same perf # with to_row binding: 21 secs for dummy NamedTuple with None inside, 22 for less trivial class # deserializing from db: # initially, took 20 secs to load 1M entries (TE2) # 9 secs currently # 6 secs if we instantiate namedtuple directly via indices # 3.5 secs if we just return None from row class BB(NamedTuple): xx: int yy: int class AA(NamedTuple): value: int b: BB | None value2: int def test_return_type_inference(tmp_path: Path) -> None: """ Tests that return type (BB) is inferred from the type annotation """ @cachew(tmp_path) def data() -> Iterator[BB]: yield BB(xx=1, yy=2) yield BB(xx=3, yy=4) assert len(list(data())) == 2 assert len(list(data())) == 2 def test_return_type_mismatch(tmp_path: Path) -> None: # even though user got invalid type annotation here, they specified correct type, and it's the one that should be used @cachew(tmp_path, cls=AA) def data2() -> list[BB]: return [ # ty: ignore[invalid-return-type] AA(value=1, b=None, value2=123), # type: ignore[list-item] ] # TODO hmm, this is kinda a downside that it always returns # could preserve the original return type, but too much trouble for now assert 
list(data2()) == [AA(value=1, b=None, value2=123)] # type: ignore[comparison-overlap] def test_return_type_none(tmp_path: Path) -> None: with pytest.raises(CachewException): @cachew(tmp_path) def data(): return [] def test_callable_cache_path(tmp_path: Path) -> None: """ Cache path can be function dependent on wrapped function's arguments """ called: set[str] = set() @cachew(cache_path=lambda kind: tmp_path / f'{kind}.cache') def get_data(kind: str) -> Iterator[BB]: assert kind not in called called.add(kind) if kind == 'first': yield BB(xx=1, yy=1) else: yield BB(xx=2, yy=2) # fmt: off assert list(get_data('first')) == [BB(xx=1, yy=1)] assert list(get_data('second')) == [BB(xx=2, yy=2)] assert list(get_data('first')) == [BB(xx=1, yy=1)] assert list(get_data('second')) == [BB(xx=2, yy=2)] # fmt: on def test_nested(tmp_path: Path) -> None: d1 = AA( value=1, b=BB(xx=2, yy=3), value2=4, ) d2 = AA( value=3, b=None, value2=5, ) def data(): yield d1 yield d2 @cachew(cache_path=tmp_path, cls=AA) def get_data(): yield from data() assert list(get_data()) == [d1, d2] assert list(get_data()) == [d1, d2] class BBv2(NamedTuple): xx: int yy: int zz: float def test_schema_change(tmp_path: Path) -> None: """ Should discard cache on schema change (BB to BBv2) in this example """ b = BB(xx=2, yy=3) @cachew(cache_path=tmp_path, cls=BB) def get_data(): return [b] assert list(get_data()) == [b] # TODO make type part of key? 
b2 = BBv2(xx=3, yy=4, zz=5.0) @cachew(cache_path=tmp_path, cls=BBv2) def get_data_v2(): return [b2] assert list(get_data_v2()) == [b2] def test_transaction(tmp_path: Path) -> None: """ Should keep old cache and not leave it in some broken state in case of errors """ # logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO) class TestError(Exception): pass @cachew(cache_path=tmp_path, cls=BB, chunk_by=1) def get_data(version: int): for i in range(3): yield BB(xx=2, yy=i) if version == 2: raise TestError exp = [BB(xx=2, yy=0), BB(xx=2, yy=1), BB(xx=2, yy=2)] assert list(get_data(1)) == exp assert list(get_data(1)) == exp # TODO test that hash is unchanged? with pytest.raises(TestError): list(get_data(2)) assert list(get_data(1)) == exp class Job(NamedTuple): company: str title: str | None def test_optional(tmp_path: Path) -> None: """ Tests support for typing.Optional """ @cachew(tmp_path) def data() -> Iterator[Job]: # fmt: off yield Job('google' , title='engineed') yield Job('selfemployed', title=None) # fmt: on list(data()) # trigger cachew # fmt: off assert list(data()) == [ Job('google' , title='engineed'), Job('selfemployed', title=None), ] # fmt: on # TODO add test for optional for misleading type annotation class Person(NamedTuple): name: str secondname: str age: int job: Job | None def make_people_data(count: int) -> Iterator[Person]: g = Random(124) chars = string.ascii_uppercase + string.ascii_lowercase randstr = lambda len_: ''.join(g.choices(chars, k=len_)) for _ in range(count): has_job = g.choice([True, False]) maybe_job: Job | None = None if has_job: maybe_job = Job(company=randstr(12), title=randstr(8)) yield Person( name=randstr(5), secondname=randstr(10), age=g.randint(20, 60), job=maybe_job, ) def test_stats(tmp_path: Path) -> None: cache_file = tmp_path / 'cache' # 4 + things are string lengths one = (4 + 5) + (4 + 10) + 4 + (4 + 12 + 4 + 8) N = 10000 @cachew(cache_path=cache_file, cls=Person) def get_people_data() -> Iterator[Person]: 
yield from make_people_data(count=N) list(get_people_data()) print( f"Cache db size for {N} entries: estimated size {one * N // 1024} Kb, actual size {cache_file.stat().st_size // 1024} Kb;" ) @dataclass class Test: field: int def test_dataclass(tmp_path: Path) -> None: @cachew(tmp_path) def get_dataclasses() -> Iterator[Test]: yield from [Test(field=i) for i in range(5)] assert list(get_dataclasses()) == [Test(field=i) for i in range(5)] assert list(get_dataclasses()) == [Test(field=i) for i in range(5)] def test_inner_class(tmp_path: Path) -> None: # NOTE: this doesn't work at the moment if from __future__ import annotations is used in client code (e.g. on top of this test) # see test_future_annotations for more info @dataclass class InnerDataclass: field: int @cachew(tmp_path) def fun() -> Iterator[InnerDataclass]: yield from [] # should manage to infer type and not crash at least list(fun()) list(fun()) @dataclass class Dates: d1: datetime d2: datetime d3: datetime d4: datetime d5: datetime def test_dates(tmp_path: Path) -> None: from zoneinfo import ZoneInfo tz = ZoneInfo('Europe/London') dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S') dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S') x = Dates( d1=dwinter.replace(tzinfo=tz), d2=dsummer.replace(tzinfo=tz), d3=dwinter, d4=dsummer, d5=dsummer.replace(tzinfo=UTC), ) @cachew(tmp_path) def fun() -> Iterable[Dates]: yield x assert one(fun()) == x assert one(fun()) == x # make sure the actuall tzinfo is preserved... 
otherwise we might end up with raw offsets and lose some info r = one(fun()) assert str(r.d1.tzinfo) == str(x.d1.tzinfo) assert str(r.d2.tzinfo) == str(x.d2.tzinfo) assert r.d3.tzname() is None assert r.d4.tzname() is None assert r.d5.tzinfo is UTC # fmt: off @dataclass class AllTypes: a_str : str an_int : int a_float : float a_bool : bool a_dt : datetime a_date : date a_dict : dict[str, Any] a_list : list[Any] a_tuple : tuple[float, str] an_exc : Exception an_opt : str | None # fmt: on # TODO support vararg tuples? def test_types(tmp_path: Path) -> None: import pytz tz = pytz.timezone('Europe/Berlin') # fmt: off obj = AllTypes( a_str = 'abac', an_int = 1123, a_float = 3.131, a_bool = True, a_dt = datetime.now(tz=tz), a_date = datetime.now().replace(year=2000).date(), a_dict = {'a': True, 'x': {'whatever': 3.14}}, a_list = ['aba', 123, None], a_tuple = (1.23, '3.2.1'), an_exc = RuntimeError('error!', 123), an_opt = 'hello', ) # fmt: on @cachew(tmp_path) def get() -> Iterator[AllTypes]: yield obj def helper(t: AllTypes): # Exceptions can't be directly compared.. so this kinda helps d = asdict(t) d['an_exc'] = d['an_exc'].args return d assert helper(one(get())) == helper(obj) assert helper(one(get())) == helper(obj) # TODO if I do perf tests, look at this https://docs.sqlalchemy.org/en/13/_modules/examples/performance/large_resultsets.html # TODO should be possible to iterate anonymous tuples too? or just sequences of primitive types? 
def test_primitive(tmp_path: Path) -> None:
    """Primitive (non-container) yield types such as str should be cacheable as-is."""

    @cachew(tmp_path)
    def fun() -> Iterator[str]:
        yield 'aba'
        yield 'caba'

    # first call populates the cache, second call must read the same data back from it
    assert list(fun()) == ['aba', 'caba']
    assert list(fun()) == ['aba', 'caba']


def test_single_value(tmp_path: Path) -> None:
    """Non-iterator return values (single int/str/namedtuple/None) should also be cacheable."""

    @cachew(tmp_path)
    def fun_int() -> int:
        return 123

    assert fun_int() == 123
    assert fun_int() == 123

    # cls=('single', ...) is the explicit way to declare a single-value cache
    # when the return annotation is missing
    @cachew(tmp_path, cls=('single', str))
    def fun_str():
        return 'whatever'

    assert fun_str() == 'whatever'
    assert fun_str() == 'whatever'

    @cachew(tmp_path)
    def fun_opt_namedtuple(none: bool) -> UUU | None:  # noqa: FBT001
        if none:
            return None
        else:
            return UUU(xx=1, yy=2)

    # both 'value present' and 'value is None' must round-trip through the cache
    assert fun_opt_namedtuple(none=False) == UUU(xx=1, yy=2)
    assert fun_opt_namedtuple(none=False) == UUU(xx=1, yy=2)
    assert fun_opt_namedtuple(none=True) is None
    assert fun_opt_namedtuple(none=True) is None


class O(NamedTuple):
    # minimal cacheable payload used by the default-arguments tests below
    x: int


class _HackHash:
    """Mutable int wrapper; tests mutate .x in place to force cache hash invalidation."""

    def __init__(self, x: int) -> None:
        self.x = x

    def __repr__(self):
        # repr feeds into cachew's default dependency key, so it must reflect current .x
        return repr(self.x)


def test_default_arguments(tmp_path: Path) -> None:
    """Default parameter values should participate in the cache key (via depends_on or the default key)."""
    hh = _HackHash(1)
    calls = 0

    def orig(a: int, param: _HackHash = hh) -> Iterator[O]:
        yield O(hh.x)
        nonlocal calls
        calls += 1

    def depends_on(a: int, param: _HackHash) -> str:
        # hmm. in principle this should be str according to typing
        # in practice though we always convert hash to str, so maybe type should be changed to Any?
        return (a, param.x)  # type: ignore[return-value]

    fun = cachew(tmp_path, depends_on=depends_on)(orig)

    list(fun(123))
    assert list(fun(123)) == [O(1)]
    assert calls == 1

    # now, change hash. That should cause the composite hash to invalidate and recompute
    hh.x = 2
    assert list(fun(123)) == [O(2)]
    assert calls == 2

    # should be ok with explicitly passing
    assert list(fun(123, param=_HackHash(2))) == [O(2)]
    assert calls == 2

    # we don't have to handle the default param in the default hash key
    fun = cachew(tmp_path)(fun)
    assert list(fun(456)) == [O(2)]
    assert calls == 3
    assert list(fun(456)) == [O(2)]
    assert calls == 3

    # changing the default should trigger the default (i.e. kwargs) key function to invalidate the cache
    hh.x = 3
    assert list(fun(456)) == [O(3)]
    assert calls == 4

    # you don't have to pass the default parameter explicitly
    fun = cachew(tmp_path, depends_on=lambda a: a)(orig)
    assert list(fun(456)) == [O(3)]
    assert calls == 5

    # but watch out if you forget to handle it!
    hh.x = 4
    assert list(fun(456)) == [O(3)]
    assert calls == 5


class U(NamedTuple):
    # union-typed field: exercises serialization of str | O inside a namedtuple
    x: str | O


def test_union(tmp_path: Path) -> None:
    """Union-typed namedtuple fields should round-trip through the cache."""

    @cachew(tmp_path)
    def fun() -> Iterator[U]:
        yield U('hi')
        yield U(O(123))

    list(fun())
    assert list(fun()) == [U('hi'), U(O(123))]


# NOTE: empty dataclass doesn't have __annotations__ ??? not sure if need to handle it...
@dataclass
class DD:
    x: int


def test_union_with_dataclass(tmp_path: Path) -> None:
    """Top-level union return types (primitive | dataclass) should round-trip through the cache."""

    @cachew(tmp_path)
    def fun() -> Iterator[int | DD]:
        yield 123
        yield DD(456)

    assert list(fun()) == [123, DD(456)]


# ugh. we need to pass backend here explicitly since it might not get picked up from the fixture
# that sets it in settings. due to multiprocess stuff
def _concurrent_helper(cache_path: Path, count: int, backend: Backend, sleep_s: float = 0.1):
    # helper run in worker processes by the concurrency tests below;
    # sleeps between yields to widen the window for races
    @cachew(cache_path, backend=backend)
    def test(count: int) -> Iterator[int]:
        for i in range(count):
            print(f"{count}: GENERATING {i}")
            sleep(sleep_s)
            yield i * i

    return list(test(count=count))


@pytest.fixture
def fuzz_cachew_impl():
    """
    Insert random sleeps in cachew_impl to increase likelihood of concurrency issues
    """
    from .. import cachew_wrapper

    # NOTE(review): the patch body below was reconstructed from a whitespace-mangled
    # source; the unified-diff context indentation should be confirmed against the
    # actual cachew_wrapper source before relying on it
    patch = '''\
@@ -189,6 +189,11 @@
     old_hash = backend.get_old_hash()
     logger.debug(f'old hash: {old_hash}')

+    from random import random
+    rs = random() * 2
+    print("sleeping for: ", rs)
+    from time import sleep; sleep(rs)
+
     if new_hash == old_hash:
         logger.debug('hash matched: loading from cache')
         yield from cached_items()
'''
    patchy.patch(cachew_wrapper, patch)
    yield
    # restore the unpatched implementation for subsequent tests
    patchy.unpatch(cachew_wrapper, patch)


# TODO fuzz when they start so they enter transaction at different times?
# TODO how to run it enough times on CI and increase likelihood of failing?
# for now, stress testing manually: # while PYTHONPATH=src pytest -s cachew -k concurrent_writes ; do sleep 0.5; done @pytest.mark.xfail(condition=platform.system() == 'Darwin', reason='seems like file writes might not be atomic on osx?') def test_concurrent_writes(tmp_path: Path, fuzz_cachew_impl) -> None: cache_path = tmp_path / 'cache.sqlite' # warm up to create the database # FIXME ok, that will be fixed separately with atomic move I suppose _concurrent_helper(cache_path, 1, settings.DEFAULT_BACKEND) processes = 5 with ProcessPoolExecutor() as pool: futures = [ pool.submit(_concurrent_helper, cache_path, count, settings.DEFAULT_BACKEND) for count in range(processes) ] for count, f in enumerate(futures): assert f.result() == [i * i for i in range(count)] # TODO ugh. need to keep two processes around to test for yield holding transaction lock def test_concurrent_reads(tmp_path: Path, fuzz_cachew_impl): cache_path = tmp_path / 'cache.sqlite' count = 10 # warm up _concurrent_helper(cache_path, count, settings.DEFAULT_BACKEND, sleep_s=0) processes = 4 start = time.time() with ProcessPoolExecutor() as pool: futures = [ pool.submit(_concurrent_helper, cache_path, count, settings.DEFAULT_BACKEND, 1) for _ in range(processes) ] for f in futures: print(f.result()) end = time.time() taken = end - start # should be pretty instantaneous # if it takes more, most likely means that helper was called again assert taken < 5 def test_mcachew(tmp_path: Path): # TODO how to test for defensive behaviour? 
from cachew.extra import mcachew # TODO check throw on error @mcachew(cache_path=tmp_path / 'cache') def func() -> Iterator[str]: yield 'one' yield 'two' assert list(func()) == ['one', 'two'] assert list(func()) == ['one', 'two'] def test_defensive(restore_settings) -> None: ''' Make sure that cachew doesn't crash on misconfiguration ''' def orig() -> Iterator[int]: yield 123 def orig2(): yield "x" yield 123 fun = cachew(bad_arg=123)(orig) # type: ignore[call-overload] assert list(fun()) == [123] assert list(fun()) == [123] for throw in [True, False]: ctx = pytest.raises(Exception) if throw else nullcontext() settings.THROW_ON_ERROR = throw with ctx: fun = cachew(cache_path=lambda: 1 + 'bad_path_provider')(orig) # type: ignore[arg-type,misc,operator] assert list(fun()) == [123] assert list(fun()) == [123] fun = cachew(cache_path=lambda p: '/tmp/' + str(p))(orig) assert list(fun()) == [123] assert list(fun()) == [123] fun = cachew(orig2) assert list(fun()) == ['x', 123] assert list(fun()) == ['x', 123] settings.DEFAULT_CACHEW_DIR = '/dev/nonexistent' fun = cachew(orig) assert list(fun()) == [123] assert list(fun()) == [123] @pytest.mark.parametrize('throw', [False, True]) def test_bad_annotation(*, tmp_path: Path, throw: bool) -> None: """ this will work in runtime without cachew if from __future__ import annotations is used so should work with cachew decorator as well """ src = tmp_path / 'src.py' src.write_text( f''' from __future__ import annotations from cachew import settings, cachew settings.THROW_ON_ERROR = {throw} @cachew def fun() -> BadType: print("called!") return 0 fun() '''.lstrip() ) ctx = pytest.raises(Exception) if throw else nullcontext() with ctx: assert check_output([sys.executable, src], text=True).strip() == "called!" 
def test_recursive_simple(tmp_path: Path) -> None:
    """
    A cachew-decorated function may call itself; verifies call counts as the
    cache progressively covers deeper inputs, and that stack depth stays flat.
    """
    d0 = 0
    d1 = 1000
    calls = 0

    @cachew(tmp_path)
    def factorials(n: int) -> Iterable[int]:
        nonlocal calls, d0, d1
        calls += 1

        if n == 0:
            d0 = len(inspect.stack(0))
        if n == 1:
            d1 = len(inspect.stack(0))

        if n == 0:
            yield 1
            return
        prev = factorials(n - 1)
        last = 1
        # TODO potentially quadratic? measure perf perhaps?
        for x in prev:
            yield x
            last = x
        yield last * n

    assert calls == 0
    assert list(factorials(3)) == [1, 1, 2, 6]
    # make sure the recursion isn't eating too much stack
    # ideally would have 1? not sure if possible without some insane hacking?
    # todo maybe check stack frame size as well?
    assert abs(d0 - d1) <= 2
    assert calls == 4
    assert list(factorials(3)) == [1, 1, 2, 6]
    assert calls == 4
    assert list(factorials(5)) == [1, 1, 2, 6, 24, 120]
    assert calls == 6
    assert list(factorials(3)) == [1, 1, 2, 6]
    assert calls == 10


def test_recursive_deep(tmp_path: Path) -> None:
    """Deep recursion through cachew should not exhaust the (raised) recursion limit."""

    @cachew(tmp_path)
    def numbers(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
            return
        yield from numbers(n - 1)
        yield n

    @cachew(cache_path=None)
    def numbers_cache_disabled(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
            return
        yield from numbers(n - 1)
        yield n

    rlimit = sys.getrecursionlimit()
    # NOTE in reality it has to do with the number of file descriptors (ulimit -Sn, e.g. 1024?)
    # but it seems that during the error unrolling, pytest or something else actually hits the recursion limit somehow
    # pytest ends up with an internal error in such case... which is good enough as long as tests are concerned I guess.
    sys.setrecursionlimit(2 * 800 + 100)
    try:
        # at the moment each recursive call takes two frames (one for the original call, one for cachew_wrapper)
        # + allow 100 calls for random constant overhead like pytest etc
        list(numbers(800))
        list(numbers(800))

        list(numbers_cache_disabled(800))
        list(numbers_cache_disabled(800))
    finally:
        sys.setrecursionlimit(rlimit)


def test_recursive_error(tmp_path: Path) -> None:
    """A RecursionError mid-write must not corrupt the cache for later calls."""

    @cachew(tmp_path)
    def rec(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
            return
        yield from rec(n - 1)
        yield n

    rlimit = sys.getrecursionlimit()
    try:
        sys.setrecursionlimit(50)
        list(rec(100))
        raise AssertionError('Expecting recursion error')
    except RecursionError:
        pass
    finally:
        sys.setrecursionlimit(rlimit)

    # todo not sure if cache file should exist??
    # either way, at least check that the db is not completely messed up
    assert len(list(rec(100))) == 101


def test_exceptions(tmp_path: Path) -> None:
    """Exception values can be cached; args survive but the concrete type does not."""

    class X(NamedTuple):
        a: int

    d = datetime.strptime('20200102 03:04:05', '%Y%m%d %H:%M:%S')

    @cachew(tmp_path)
    def fun() -> Iterator[Exception]:
        yield RuntimeError('whatever', 123, d, X(a=123))

    list(fun())
    [e] = fun()
    # not sure if there is anything that can be done to preserve type information?
    assert type(e) is Exception
    # non-primitive args come back stringified (datetime as isoformat, namedtuple via repr)
    assert e.args == ('whatever', 123, '2020-01-02T03:04:05', 'X(a=123)')


# see https://beepb00p.xyz/mypy-error-handling.html#kiss
def test_result(tmp_path: Path) -> None:
    """'Result type' style (Exception | value) round-trips through the cache."""

    @cachew(tmp_path)
    def fun() -> Iterator[Exception | int]:
        yield 1
        yield RuntimeError("sad!")
        yield 123

    list(fun())
    [v1, ve, v123] = fun()
    assert v1 == 1
    assert v123 == 123
    assert isinstance(ve, Exception)
    assert ve.args == ('sad!',)


def test_version_change(tmp_path: Path) -> None:
    """Bumping CACHEW_VERSION must invalidate existing caches (and again on revert)."""
    calls = 0

    @cachew(tmp_path, logger=logger)
    def fun() -> Iterator[str]:
        nonlocal calls
        calls += 1
        yield from ['a', 'b', 'c']

    list(fun())
    list(fun())
    assert calls == 1

    # todo ugh. not sure how to do this as a relative import??
    import cachew as cachew_module

    old_version = cachew_module.CACHEW_VERSION
    try:
        cachew_module.CACHEW_VERSION = old_version + '_whatever'
        # should invalidate cachew now
        list(fun())
        assert calls == 2
        list(fun())
        assert calls == 2
    finally:
        cachew_module.CACHEW_VERSION = old_version

    # and now again, back to the old version
    list(fun())
    assert calls == 3
    list(fun())
    assert calls == 3


def dump_old_cache(tmp_path: Path) -> None:
    # call this if you want to get an sql script for version upgrade tests..
    oc = tmp_path / 'old_cache.sqlite'

    @cachew(oc)
    def fun() -> Iterator[int]:
        yield from [1, 2, 3]

    list(fun())
    assert oc.exists(), oc

    sql = check_output(['sqlite3', oc, '.dump']).decode('utf8')
    print(sql, file=sys.stderr)


def test_old_cache_v0_6_3(tmp_path: Path) -> None:
    """Opening a cache produced by cachew 0.6.3 must not crash (schema is simply stale)."""
    if settings.DEFAULT_BACKEND != 'sqlite':
        pytest.skip('this test only makes sense for sqlite backend')

    # dump captured via dump_old_cache() on the old version
    sql = '''
PRAGMA foreign_keys=OFF;
BEGIN TRANSACTION;
CREATE TABLE hash (
    value VARCHAR
);
INSERT INTO hash VALUES('cachew: 1, schema: {''_'': }, hash: ()');
CREATE TABLE IF NOT EXISTS "table" (
    _cachew_primitive INTEGER
);
INSERT INTO "table" VALUES(1);
INSERT INTO "table" VALUES(2);
INSERT INTO "table" VALUES(3);
COMMIT;
'''
    db = tmp_path / 'cache.sqlite'
    check_call(['sqlite3', db, sql])

    @cachew(db)
    def fun() -> Iterator[int]:
        yield from [1, 2, 3]

    # this tests that it doesn't crash
    # for actual version upgrade test see test_version_change
    assert list(fun()) == [1, 2, 3]


def test_disabled(tmp_path: Path) -> None:
    """disabled_cachew() context manager bypasses the cache entirely."""
    calls = 0

    @cachew(tmp_path)
    def fun() -> Iterator[int]:
        yield 1
        yield 2
        nonlocal calls
        calls += 1

    assert list(fun()) == [1, 2]
    assert list(fun()) == [1, 2]
    assert calls == 1

    from cachew.extra import disabled_cachew

    with disabled_cachew():
        # every call recomputes while disabled
        assert list(fun()) == [1, 2]
        assert calls == 2
        assert list(fun()) == [1, 2]
        assert calls == 3


def test_early_exit_simple(tmp_path: Path) -> None:
    # cachew works on iterators and we'd prefer not to cache if the iterator hasn't been exhausted
    calls_f = 0
    @cachew(tmp_path)
    def f() -> Iterator[int]:
        yield from range(20)
        nonlocal calls_f
        calls_f += 1

    calls_g = 0

    @cachew(tmp_path)
    def g() -> Iterator[int]:
        yield from f()
        nonlocal calls_g
        calls_g += 1

    # only consume 10/20 items
    assert len(list(islice(g(), 0, 10))) == 10
    # precondition
    assert calls_f == 0  # f hasn't been fully exhausted
    assert calls_g == 0  # g hasn't been fully exhausted

    # todo not sure if need to check that db is empty?
    assert len(list(g())) == 20
    assert calls_f == 1
    assert calls_g == 1

    # should be cached now
    assert len(list(g())) == 20
    assert calls_f == 1
    assert calls_g == 1


# see https://github.com/sqlalchemy/sqlalchemy/issues/5522#issuecomment-705156746
def test_early_exit_shutdown(tmp_path: Path) -> None:
    """
    Abandoning a partially-consumed cachew generator at interpreter shutdown
    must not print a traceback to stderr (regression for the sqlalchemy issue above).
    """
    # don't ask... otherwise the exception doesn't appear :shrug:
    import_hack = '''
from sqlalchemy import Column

import re
re.hack = lambda: None
'''
    Path(tmp_path / 'import_hack.py').write_text(import_hack)

    prog = f'''
import sys
sys.path.insert(0, '')

import import_hack

import cachew
cachew.settings.THROW_ON_ERROR = True # todo check with both?

@cachew.cachew('{tmp_path}', cls=int)
def fun():
    yield 0

g = fun()
e = next(g)
print("FINISHED")
'''
    r = run([sys.executable, '-c', prog], cwd=tmp_path, capture_output=True, check=True)
    assert r.stdout.strip() == b'FINISHED'
    assert b'Traceback' not in r.stderr


# tests both modes side by side to demonstrate the difference
@pytest.mark.parametrize('use_synthetic', ['False', 'True'])
def test_synthetic_keyset(*, tmp_path: Path, use_synthetic: bool) -> None:
    """
    With synthetic_key, a growing set of keys only recomputes the new suffix;
    without it, any change to the key set recomputes everything.
    """
    # just to keep track of which data we had to compute from scratch
    _recomputed: list[str] = []

    # assume key i is responsible for numbers i and i-1
    # in reality this could be some slow function we'd like to avoid calling if its results is already cached
    # e.g. the key would typically be a filename (e.g. isoformat timestamp)
    # and the returned values could be the results of an export over the month prior to the timestamp, or something like that
    # see https://beepb00p.xyz/exports.html#synthetic for more on the motivation
    def compute(key: str) -> Iterator[str]:
        _recomputed.append(key)
        n = int(key)
        yield str(n - 1)
        yield str(n)

    # fmt: off
    # should result in 01 + 12 + 45 == 01245
    keys125    = ['1', '2', '5'              ]
    # should result in 01 + 12 + 45 + 56 + 67 == 0124567
    keys12567  = ['1', '2', '5', '6', '7'    ]
    # should result in 01 + 12 + 45 + 56 + 78 + 89 == 012456789
    keys125689 = ['1', '2', '5', '6', '8', '9']
    # should result in 45 + 56 + 78 + 89 == 456789
    keys5689   = [          '5', '6', '8', '9']
    # fmt: on

    def recomputed() -> list[str]:
        # drain the recomputation log for the next assertion
        r = list(_recomputed)
        _recomputed.clear()
        return r

    ## 'cachew_cached' will just be [] if synthetic key is not used, so no impact on data
    @cachew(tmp_path, synthetic_key=('keys' if use_synthetic else None))
    def fun_aux(keys: Sequence[str], *, cachew_cached: Iterable[str] = []) -> Iterator[str]:
        yield from unique_everseen(
            chain(
                cachew_cached,
                *(compute(key) for key in keys),
            )
        )

    def fun(keys: Sequence[str]) -> set[str]:
        return set(fun_aux(keys=keys))
    ##

    # preserve formatting of string arguments -- it makes it easier to read the test
    # fmt: off
    assert fun(keys125) == set('01' '12' '45')
    assert recomputed() == keys125
    assert fun(keys125) == set('01' '12' '45')
    assert recomputed() == []  # should be cached

    assert fun(keys12567) == set('01' '12' '45' '56' '67')
    if use_synthetic:
        # 1, 2 and 5 should be already cached from the previous call
        assert recomputed() == ['6', '7']
    else:
        # but without synthetic key this would cause everything to recompute
        assert recomputed() == keys12567
    assert fun(keys12567) == set('01' '12' '45' '56' '67')
    assert recomputed() == []  # should be cached

    assert fun(keys125689) == set('01' '12' '45' '56' '78' '89')
    if use_synthetic:
        # similarly, 1 2 5 6 7 are cached from the previous call
        assert recomputed() == ['8', '9']
    else:
        # and we need to call against all keys otherwise
        assert recomputed() == keys125689
    assert fun(keys125689) == set('01' '12' '45' '56' '78' '89')
    assert recomputed() == []  # should be cached

    assert fun(keys5689) == set('45' '56' '78' '89')
    # now the prefix has changed, so if we returned cached items it might return too much
    # so have to recompute everything
    assert recomputed() == keys5689
    assert fun(keys5689) == set('45' '56' '78' '89')
    assert recomputed() == []  # should be cached
    # fmt: on

    # TODO maybe call combined function? so it could return total result and last cached?
    # TODO another option is:
    # the function yields all cached stuff first
    # then the user yields stuff from new
    # and then external function does merging
    # TODO test with kwargs hash?...
    # TODO try without and with simultaneously?
    # TODO check what happens when errors happen?
    # FIXME check what happens if we switch between modes? (synthetic/non-synthetic)
    # FIXME make sure this thing works if len(keys) > chunk size?
    # TODO check what happens when we forget to set 'cachew_cached' argument
    # TODO check what happens when keys are not str but e.g.
# (continuation of the comment cut off on the previous line): ... e.g. Path
def test_db_path_matches_fun_name(tmp_path: Path) -> None:
    """The cache file name is derived from the decorated function's name."""

    @cachew(tmp_path)
    def fun_single() -> int:
        return 123

    @cachew(tmp_path)
    def fun_multiple() -> Iterable[int]:
        return [123]

    # write to cache
    fun_single()
    list(fun_multiple())

    assert (tmp_path / callable_name(fun_single)).exists()
    assert (tmp_path / callable_name(fun_multiple)).exists()


def test_type_alias_type_1(tmp_path: Path) -> None:
    """PEP 695 'type X = ...' alias used as the item type."""
    type Int = int

    @cachew(tmp_path)
    def fun() -> Iterator[Int]:
        yield 123

    assert list(fun()) == [123]
    assert list(fun()) == [123]


def test_type_alias_type_2(tmp_path: Path) -> None:
    """PEP 695 alias covering the whole Iterator return type."""
    type IteratorInt = Iterator[int]

    @cachew(tmp_path)
    def fun() -> IteratorInt:
        yield 123

    assert list(fun()) == [123]
    assert list(fun()) == [123]


def test_type_alias_generic(tmp_path: Path) -> None:
    """Generic PEP 695 alias (Res[T]) specialised before use."""
    type Res[T] = T | Exception
    type IntRes = Res[int]

    @cachew(tmp_path)
    def fun() -> Iterator[IntRes]:
        yield 123

    assert list(fun()) == [123]
    assert list(fun()) == [123]


================================================
FILE: src/cachew/tests/test_future_annotations.py
================================================
from __future__ import annotations

import os
import sys
import textwrap
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path
from subprocess import check_output
from typing import Any

import pytest
from more_itertools import one

from .. import cachew

type _Str = str  # deliberate, to test 3.12 'type ... = ...'
# (continuation of the comment cut off on the previous line): ... type definitions

# fmt: off
@dataclass
class NewStyleTypes1:
    # exercises builtin-generic annotations (PEP 585) under postponed evaluation
    a_str   : str
    a_dict  : dict[str, Any]
    a_list  : list[Any]
    a_tuple : tuple[float, _Str]
# fmt: on


def test_types1(tmp_path: Path) -> None:
    """Round-trip a dataclass using new-style builtin generic annotations."""
    obj = NewStyleTypes1(
        a_str   = 'abac',
        a_dict  = {'a': True, 'x': {'whatever': 3.14}},
        a_list  = ['aba', 123, None],
        a_tuple = (1.23, '3.2.1'),
    )  # fmt: skip

    @cachew(tmp_path)
    def get() -> Iterator[NewStyleTypes1]:
        yield obj

    assert one(get()) == obj
    assert one(get()) == obj


# fmt: off
@dataclass
class NewStyleTypes2:
    # exercises PEP 604 'X | Y' union annotations
    an_opt  : str | None
    a_union : _Str | int
# fmt: on


def test_types2(tmp_path: Path) -> None:
    """Round-trip a dataclass using new-style union annotations."""
    obj = NewStyleTypes2(
        an_opt  = 'hello',
        a_union = 999,
    )  # fmt: skip

    @cachew(tmp_path)
    def get() -> Iterator[NewStyleTypes2]:
        yield obj

    assert one(get()) == obj
    assert one(get()) == obj


@pytest.mark.parametrize('use_future_annotations', [False, True])
@pytest.mark.parametrize('local', [False, True])
@pytest.mark.parametrize('throw', [False, True])
def test_future_annotations(
    *,
    use_future_annotations: bool,
    local: bool,
    throw: bool,
    tmp_path: Path,
) -> None:
    """
    Checks handling of postponed evaluation of annotations (from __future__ import annotations)
    """
    # NOTE: to avoid weird interactions with existing interpreter in which pytest is running
    # , we compose a program and running in python directly instead
    # (also not sure if it's even possible to tweak postponed annotations without doing that)

    if use_future_annotations and local and throw:
        # when annotation is local (like inner class), then they end up as strings
        # so we can't eval it as we don't have access to a class defined inside function
        # keeping this test just to keep track of whether this is fixed at some point
        # possibly relevant:
        # - https://peps.python.org/pep-0563/#keeping-the-ability-to-use-function-local-state-when-defining-annotations
        pytest.skip("local aliases/classses don't work with from __future__ import annotations")

    _PREAMBLE = f'''
from pathlib import Path
import tempfile

from cachew import cachew, settings
settings.THROW_ON_ERROR = {throw}

temp_dir = tempfile.TemporaryDirectory()
td = Path(temp_dir.name)
'''

    _TEST = '''
type Identity[T] = T
I = int
type S = Identity[str]

@cachew(td)
def fun() -> list[I | S]:
    print("called")
    return [1, "2"]

assert list(fun()) == [1, "2"]
assert list(fun()) == [1, "2"]
'''

    if use_future_annotations:
        code = '''
from __future__ import annotations
'''
    else:
        code = ''

    code += _PREAMBLE

    if local:
        # indent the test body so it nests under def test() -- 4-space prefix
        code += f'''
def test() -> None:
{textwrap.indent(_TEST, prefix="    ")}
test()
'''
    else:
        code += _TEST

    run_py = tmp_path / 'run.py'
    run_py.write_text(code)

    cache_dir = tmp_path / 'cache'
    cache_dir.mkdir()

    res = check_output(
        [sys.executable, run_py],
        # point TMPDIR at our dir so the subprocess's TemporaryDirectory lands there
        env={'TMPDIR': str(cache_dir), **os.environ},
        text=True,
    )
    called = int(res.count('called'))
    if use_future_annotations and local and not throw:
        # cachew fails to set up, so no caching but at least it works otherwise
        assert called == 2
    else:
        assert called == 1


================================================
FILE: src/cachew/tests/test_resolve_type_parameters.py
================================================
from ..utils import resolve_type_parameters


def test_simple_generic_alias() -> None:
    # if you define types ad-hoc, they resolve to GenericAlias, not TypeAliasType
    assert resolve_type_parameters(int) == int  # noqa: E721
    assert resolve_type_parameters(list[bool]) == list[bool]
    assert resolve_type_parameters(dict[str, list[float]]) == dict[str, list[float]]


def test_simple_type_keyword() -> None:
    type Int = int
    assert resolve_type_parameters(Int) == int  # noqa: E721
    assert resolve_type_parameters(list[Int]) == list[int]
    assert resolve_type_parameters(dict[str, list[Int]]) == dict[str, list[int]]


def test_generic_collections() -> None:
    type ListInt = list[int]
    assert resolve_type_parameters(ListInt) == list[int]
    assert resolve_type_parameters(dict[str, ListInt]) == dict[str, list[int]]

    type TupleInt = tuple[int, bool]
    assert resolve_type_parameters(TupleInt) == tuple[int, bool]

    type TupleIntStr = tuple[TupleInt, str]
    assert resolve_type_parameters(TupleIntStr) == tuple[tuple[int, bool], str]

    type SetStr = set[str]
    assert resolve_type_parameters(SetStr) == set[str]

    type DictAlias[K, V] = dict[K, V]
    assert resolve_type_parameters(DictAlias[str, int]) == dict[str, int]
    assert resolve_type_parameters(DictAlias[int, list[str]]) == dict[int, list[str]]

    type ComplexDict = dict[str, tuple[ListInt, SetStr]]
    assert resolve_type_parameters(ComplexDict) == dict[str, tuple[list[int], set[str]]]


def test_generic_type_keyword() -> None:
    type Id[T] = T
    type IdInt = Id[int]
    assert resolve_type_parameters(IdInt) == int  # noqa: E721
    assert resolve_type_parameters(list[IdInt]) == list[int]

    # check multiple uses of type params
    type Pair[T] = tuple[T, T]
    type PairInt = Pair[int]
    assert resolve_type_parameters(PairInt) == tuple[int, int]
    assert resolve_type_parameters(Pair[str]) == tuple[str, str]
    assert resolve_type_parameters(list[Pair[int]]) == list[tuple[int, int]]

    # check if type params aren't used
    type NotUsing1[T, V] = int
    type NotUsing2[V, W] = NotUsing1[bool, float]
    type ListInt1 = list[NotUsing2[list, str]]
    assert resolve_type_parameters(ListInt1) == list[int]

    # Test generic alias with alias as parameter
    type Container[T] = list[T]
    type Int = int
    assert resolve_type_parameters(Container[Int]) == list[int]


def test_chaining() -> None:
    type Int = int
    type Int2 = Int
    type Int3 = Int2
    assert resolve_type_parameters(Int3) == int  # noqa: E721

    type ListInt3 = list[Int3]
    assert resolve_type_parameters(ListInt3) == list[int]

    type Box[T] = list[T]
    type DoubleBox[T] = Box[Box[T]]
    type DoubleBoxFloat = DoubleBox[float]
    assert resolve_type_parameters(DoubleBoxFloat) == list[list[float]]


def test_optional_and_union() -> None:
    type Int = int
    type MaybeInt = int | None
    assert resolve_type_parameters(MaybeInt) == (int | None)
    assert resolve_type_parameters(list[MaybeInt]) == list[int | None]

    type Str = str  # FIXME extract outside?
    type StrOrInt = Str | Int
    assert resolve_type_parameters(StrOrInt) == (str | int)

    type UnionWithAlias = int | Str
    assert resolve_type_parameters(UnionWithAlias) == (int | str)

    # Test union in generic contexts
    type OptionalList[T] = list[T] | None
    assert resolve_type_parameters(OptionalList[int]) == (list[int] | None)
    assert resolve_type_parameters(OptionalList[str]) == (list[str] | None)

    # Test nested unions with aliases
    type Bool = bool
    type StrOrIntOrBool = StrOrInt | Bool
    assert resolve_type_parameters(StrOrIntOrBool) == (int | str | bool)

    # Test union with complex aliased types
    type ListInt = list[int]
    type DictStrInt = dict[str, int]
    type ComplexUnion = ListInt | DictStrInt | None
    assert resolve_type_parameters(ComplexUnion) == (list[int] | dict[str, int] | None)


def test_old_aliases() -> None:
    """
    Old style typing.* aliases get 'normalised' by typing.get_origin call.
    This shouldn't really be a problem, so just highlighting it here.
    """
    from typing import Dict, List, Optional  # noqa: UP035

    type OptionalInt = Optional[int]  # noqa: UP045
    assert resolve_type_parameters(OptionalInt) == int | None

    type ListInt = List[int]  # noqa: UP006
    assert resolve_type_parameters(ListInt) == list[int]

    type DictIntStr = Dict[int, str]  # noqa: UP006
    assert resolve_type_parameters(DictIntStr) == dict[int, str]


def test_old_union() -> None:
    """Old typing.Union syntax with a type parameter substituted in."""
    from typing import Union

    type IntUnion[T] = Union[int, T, bool]  # noqa: UP007
    assert resolve_type_parameters(IntUnion[str]) == (int | str | bool)


def test_typevar() -> None:
    """Classic (pre-695) TypeVar substitution through subscripted generics."""
    from typing import TypeVar

    X = TypeVar('X')

    ListX = list[X]
    type ListInt = ListX[int]
    assert resolve_type_parameters(ListInt) == list[int]

    SetX = set[X]
    SetFloat = SetX[float]
    assert resolve_type_parameters(SetFloat) == set[float]


def test_misc() -> None:
    """
    Miscellaneous more complex tests.
    """
    # Test union inside list/dict
    type MaybeStr = str | None
    assert resolve_type_parameters(list[MaybeStr]) == list[str | None]
    assert resolve_type_parameters(dict[str, MaybeStr]) == dict[str, str | None]

    # Test union with nested generic aliases
    type Container[T] = list[T]
    type OptionalContainer[T] = Container[T] | None
    assert resolve_type_parameters(OptionalContainer[int]) == (list[int] | None)

    # Test union with multiple aliased generics
    type ListAlias[T] = list[T]
    type SetAlias[T] = set[T]
    type CollectionUnion[T] = ListAlias[T] | SetAlias[T]
    assert resolve_type_parameters(CollectionUnion[str]) == (list[str] | set[str])

    # Test union in tuple
    type IntOrStr = int | str
    assert resolve_type_parameters(tuple[IntOrStr, bool]) == tuple[int | str, bool]

    # Test deeply nested union with aliases
    type Middle = list[IntOrStr]
    type Outer = Middle | None
    assert resolve_type_parameters(Outer) == (list[int | str] | None)

    # Test union with chained aliases
    type Level1 = int
    type Level2 = Level1
    type Level3 = Level2
    type UnionChained = Level3 | str | None
    assert resolve_type_parameters(UnionChained) == (int | str | None)

    # Test union with generic that resolves to union
    type MaybeList[T] = list[T] | None
    type NestedMaybe = MaybeList[int | str]
    assert resolve_type_parameters(NestedMaybe) == (list[int | str] | None)

    # Test union with aliased union
    type NumberOrStr = int | float | str
    type ExtendedUnion = NumberOrStr | bool
    assert resolve_type_parameters(ExtendedUnion) == (int | float | str | bool)

    # Test union in dict values and keys
    type FlexibleKey = str | int
    type FlexibleValue = list[int] | dict[str, str] | None
    assert (
        resolve_type_parameters(dict[FlexibleKey, FlexibleValue])
        == dict[str | int, list[int] | dict[str, str] | None]
    )

    # Test union with same type repeated (Python may or may not normalize this)
    type RepeatUnion = int | int | str  # noqa: PYI016
    # Python's union implementation may deduplicate, so we accept both
    assert resolve_type_parameters(RepeatUnion) == (int | str) or resolve_type_parameters(RepeatUnion) == (int | int | str)  # fmt: skip

    # Test union with TypeAliasType in multiple positions
    type AliasA = list[int]
    type AliasB = dict[str, int]
    type AliasC = set[str]
    type MultiAliasUnion = AliasA | AliasB | AliasC
    assert resolve_type_parameters(MultiAliasUnion) == (list[int] | dict[str, int] | set[str])

    # Test generic union with substitution
    type Result[T, E] = T | E
    assert resolve_type_parameters(Result[int, str]) == (int | str)
    assert resolve_type_parameters(Result[list[int], dict[str, str]]) == (list[int] | dict[str, str])

    # Test union with None (Optional pattern) in various positions
    type OptionalInt = int | None
    type ListOfOptional = list[OptionalInt]
    assert resolve_type_parameters(ListOfOptional) == list[int | None]

    # Test union with multiple levels of aliased unions
    type UnionA = int | str
    type UnionB = bool | float
    type CombinedUnion = UnionA | UnionB
    assert resolve_type_parameters(CombinedUnion) == (int | str | bool | float)

    # Test union as generic parameter with nested aliases
    type NestedAlias = list[int]
    type UnionParam[T] = dict[str, T | None]
    assert resolve_type_parameters(UnionParam[NestedAlias]) == dict[str, list[int] | None]

    # Test complex scenario: generic alias that returns a union, used in another union
    type ComplexUnion[T] = MaybeList[T] | dict[str, T]
    assert resolve_type_parameters(ComplexUnion[int]) == (list[int] | None | dict[str, int])

    # Test union in tuple with multiple aliased elements
    type AliasInt = int
    type AliasStr = str
    type TupleWithUnions = tuple[AliasInt | None, list[AliasStr | bool]]
    assert resolve_type_parameters(TupleWithUnions) == tuple[int | None, list[str | bool]]

    # Test three-way union with all aliased types
    type TypeA = list[int]
    type TypeB = dict[str, str]
    type TypeC = set[bool]
    type ThreeWayUnion = TypeA | TypeB | TypeC
    assert resolve_type_parameters(ThreeWayUnion) == (list[int] | dict[str, str] | set[bool])

    # Test union where members themselves contain unions
    type InnerUnion1 = int | str
    type InnerUnion2 = bool | float
    type OuterUnion = list[InnerUnion1] | dict[str, InnerUnion2]
    assert resolve_type_parameters(OuterUnion) == (list[int | str] | dict[str, bool | float])

    # Test generic union with nested type aliases in parameters
    type Box[T] = list[T]
    type OptionBox[T] = Box[T] | None
    assert resolve_type_parameters(OptionBox[int | str]) == (list[int | str] | None)

    # Test union with mix of generic and non-generic aliases
    type SimpleAlias = int
    type GenericAlias[T] = list[T]
    type MixedUnion[T] = SimpleAlias | GenericAlias[T]
    assert resolve_type_parameters(MixedUnion[str]) == (int | list[str])

    # Test generic alias that returns the parameter unchanged
    type Same[T] = T
    assert resolve_type_parameters(Same[int]) == int  # noqa: E721
    assert resolve_type_parameters(Same[list[str]]) == list[str]
    assert resolve_type_parameters(Same[Same[int]]) == int  # noqa: E721

    # Test deeply nested generics
    type Deep = dict[str, list[tuple[int, set[str]]]]
    assert resolve_type_parameters(Deep) == dict[str, list[tuple[int, set[str]]]]

    # Test union in complex nested structure
    type Data[T] = dict[str, list[T] | None]
    assert resolve_type_parameters(Data[int | str]) == dict[str, list[int | str] | None]

    # Test alias in tuple with mixed types
    type Mixed = tuple[int, list[str], dict[str, int]]
    assert resolve_type_parameters(Mixed) == tuple[int, list[str], dict[str, int]]


================================================
FILE: src/cachew/tests/utils.py
================================================
import gc
import os
import sys
from contextlib import contextmanager
from pathlib import Path

import pytest

# directory where pyinstrument html reports are written
PROFILES = Path(__file__).absolute().parent / 'profiles'


@contextmanager
def profile(name: str):
    """
    Optionally profile the wrapped code with pyinstrument;
    a no-op unless the PYINSTRUMENT env variable is set.
    """
    # ugh. seems like pyinstrument slows down code quite a bit?
    if os.environ.get('PYINSTRUMENT') is None:
        yield
        return

    from pyinstrument import Profiler

    with Profiler() as profiler:
        yield

    PROFILES.mkdir(exist_ok=True)
    results_file = PROFILES / f"{name}.html"

    print("results for " + name, file=sys.stderr)
    profiler.print()

    results_file.write_text(profiler.output_html())


def timer(name: str):
    """Return a codetiming.Timer context manager labelled with *name*."""
    from codetiming import Timer

    return Timer(name=name, text=name + ': ' + '{:.2f}s')


@pytest.fixture
def gc_control(*, gc_on: bool):
    """Fixture that disables garbage collection for the test when gc_on is False."""
    if gc_on:
        # no need to do anything, should be on by default
        yield
        return

    gc.disable()
    try:
        yield
    finally:
        gc.enable()


running_on_ci = 'CI' in os.environ


================================================
FILE: src/cachew/utils.py
================================================
from collections.abc import Mapping
from types import UnionType
from typing import TypeAliasType, TypeVar, get_args, get_origin


# https://stackoverflow.com/a/2166841/706389
def is_namedtuple(t) -> bool:
    """
    Heuristically detect a NamedTuple *class*: sole base must be tuple
    and _fields must be a tuple of plain strs (str subclasses rejected on purpose).
    """
    b = getattr(t, '__bases__', None)
    if b is None:
        return False
    if len(b) != 1 or b[0] is not tuple:
        return False
    f = getattr(t, '_fields', None)
    if not isinstance(f, tuple):
        return False
    return all(type(n) == str for n in f)  # noqa: E721


def resolve_type_parameters(t) -> type:
    """Recursively expand PEP 695 type aliases and TypeVars in *t* to a concrete type."""
    return _resolve_type_parameters_aux(t, typevar_to_type={})


def _resolve_type_parameters_aux(t, *, typevar_to_type: Mapping[TypeVar, type]) -> type:
    # Substitute a bound TypeVar; raises KeyError for unbound ones
    if isinstance(t, TypeVar):
        return typevar_to_type[t]

    # This is the 'left hand side' case, i.e. in type ... =
    if isinstance(t, TypeAliasType):
        return _resolve_type_parameters_aux(t.__value__, typevar_to_type=typevar_to_type)

    # note: args is never none
    raw_args = get_args(t)
    resolved_args = tuple(_resolve_type_parameters_aux(arg, typevar_to_type=typevar_to_type) for arg in raw_args)

    # UnionType: resolve each member of the union
    if isinstance(t, UnionType):
        # Reconstruct the union with resolved args
        result = resolved_args[0]
        for arg in resolved_args[1:]:
            result = result | arg  # type: ignore[assignment]
        return result

    origin = get_origin(t)
    # Must be a non-generic type
    if origin is None:
        return t

    # This is the 'right hand side', e.g. '... = Id[int]' matches this
    if isinstance(origin, TypeAliasType):
        type_params = origin.__type_params__
        # extend the substitution map with this alias's own parameters
        new_typevar_to_type: Mapping[TypeVar, type] = {
            **typevar_to_type,
            **dict(zip(type_params, resolved_args, strict=True)),  # type: ignore[arg-type]
        }
        return _resolve_type_parameters_aux(origin.__value__, typevar_to_type=new_typevar_to_type)

    # Just a regular generic type
    return origin[resolved_args]


================================================
FILE: tox.ini
================================================
[tox]
minversion = 4
# relies on the correct version of Python installed
# (we rely on CI for the test matrix)
envlist = ruff,tests,mypy,ty
# https://github.com/tox-dev/tox/issues/20#issuecomment-247788333
# hack to prevent .tox from crapping to the project directory
toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox

[testenv]
# TODO how to get package name from setuptools?
package_name = "cachew"

pass_env =
    # useful for tests to know they are running under ci
    CI
    CI_*
    # respect user's cache dirs to prevent tox from crapping into project dir
    PYTHONPYCACHEPREFIX
    MYPY_CACHE_DIR
    RUFF_CACHE_DIR
set_env =
    # do not add current working directory to pythonpath
    # generally this is more robust and safer, prevents weird issues later on
    PYTHONSAFEPATH=1

runner = uv-venv-lock-runner
uv_sync_locked = false

[testenv:ruff]
skip_install = true
dependency_groups = testing
commands =
    {envpython} -m ruff check \
        {posargs}

[testenv:tests]
dependency_groups = testing
commands =
    # posargs allow test filtering, e.g. tox ... -- -k test_name
    {envpython} -m pytest \
        --pyargs {[testenv]package_name} \
        {posargs}

[testenv:mypy]
dependency_groups = typecheck
commands =
    {envpython} -m mypy --no-install-types \
        -p {[testenv]package_name} \
        # txt/html reports are kept locally for inspection
        --txt-report  .coverage.mypy \
        --html-report .coverage.mypy \
        # this is for github actions to upload to codecov.io
        # sadly xml coverage crashes on windows... so we need to disable it
        {env:CI_MYPY_COVERAGE} \
        {posargs}

[testenv:ty]
dependency_groups = typecheck
commands =
    {envpython} -m ty \
        check \
        {posargs}


================================================
FILE: ty.toml
================================================
[src]
exclude = [
    "doc/test_serialization.py",
]