Repository: karlicoss/cachew
Branch: master
Commit: 7e785aac758f
Files: 46
Total size: 251.7 KB
Directory structure:
gitextract_k61syhvn/
├── .ci/
│ ├── release
│ └── run
├── .gitattributes
├── .github/
│ └── workflows/
│ └── main.yml
├── .gitignore
├── .idea/
│ └── dictionaries/
│ └── karlicos.xml
├── LICENSE.txt
├── README.ipynb
├── README.md
├── benchmarks/
│ ├── 20230912-comparison-with-legacy.org
│ ├── 20230912.org
│ └── 20230917.org
├── doc/
│ ├── cachew_disable.md
│ ├── serialization.org
│ └── test_serialization.py
├── generate-readme
├── github-issues.org
├── misc/
│ ├── profile.py
│ └── test_redis/
│ ├── docker-compose.yml
│ └── test.py
├── mypy.ini
├── pyproject.toml
├── pytest.ini
├── ruff.toml
├── src/
│ └── cachew/
│ ├── __init__.py
│ ├── backend/
│ │ ├── common.py
│ │ ├── file.py
│ │ └── sqlite.py
│ ├── common.py
│ ├── compat.py
│ ├── experimental.py
│ ├── extra.py
│ ├── legacy.py
│ ├── logging_helper.py
│ ├── marshall/
│ │ ├── cachew.py
│ │ └── common.py
│ ├── py.typed
│ ├── pytest.py
│ ├── tests/
│ │ ├── marshall.py
│ │ ├── test_cachew.py
│ │ ├── test_future_annotations.py
│ │ ├── test_resolve_type_parameters.py
│ │ └── utils.py
│ └── utils.py
├── tox.ini
└── ty.toml
================================================
FILE CONTENTS
================================================
================================================
FILE: .ci/release
================================================
#!/usr/bin/env python3
'''
Deploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]].
- running manually
You'll need =UV_PUBLISH_TOKEN= env variable
- running on Github Actions
Instead of env variable, relies on configuring github as Trusted publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi
It's running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]].
Packages are deployed on:
- every master commit, onto test pypi
- every new tag, onto production pypi
'''
# Name of the env variable that must hold the PyPi auth token when running outside CI
UV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN'
import argparse
import os
from pathlib import Path
from subprocess import check_call
# Github Actions (and most CI providers) set the CI env variable for all jobs
is_ci = os.environ.get('CI') is not None
def main() -> None:
    """
    Build the package with uv and upload it to PyPi.

    Pass --use-test-pypi to upload to https://test.pypi.org instead of production PyPi.
    Outside CI, requires the UV_PUBLISH_TOKEN env variable; on CI, trusted publishing is used.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--use-test-pypi', action='store_true')
    args = p.parse_args()

    # uv publishes to production pypi by default; only need to override the url for test pypi
    publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else []

    # run from the repo root regardless of where the script was invoked from
    root = Path(__file__).absolute().parent.parent
    os.chdir(root)  # just in case

    check_call(['uv', 'build', '--clear'])

    if not is_ci:
        # CI relies on trusted publishers so doesn't need env variable
        # NOTE: explicit check instead of assert -- asserts are stripped under `python -O`
        if UV_PUBLISH_TOKEN not in os.environ:
            raise SystemExit(f'no {UV_PUBLISH_TOKEN} passed')

    check_call(['uv', 'publish', *publish_url])


if __name__ == '__main__':
    main()
================================================
FILE: .ci/run
================================================
#!/bin/bash
# CI entry point: runs the test suite via tox (through uv), handling OS-specific quirks.
set -eu
cd "$(dirname "$0")"
cd .. # git root
if ! command -v sudo; then
    # CI or Docker sometimes doesn't have it, so useful to have a dummy
    function sudo {
        "$@"
    }
fi
# --parallel-live to show outputs while it's running
tox_cmd='run-parallel --parallel-live'
if [ -n "${CI-}" ]; then
    # install OS specific stuff here
    # $OSTYPE is a bash builtin identifying the platform
    case "$OSTYPE" in
    darwin*)
        # macos
        :
        ;;
    cygwin* | msys* | win*)
        # windows
        # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that
        # so fall back to sequential runs there
        tox_cmd='run'
        ;;
    *)
        # must be linux?
        :
        ;;
    esac
fi
# NOTE: expects uv installed
# tox-uv makes tox use uv for creating environments/installing deps
uv tool run --with tox-uv tox $tox_cmd "$@"
================================================
FILE: .gitattributes
================================================
*.ipynb filter=nbstripout
*.ipynb diff=ipynb
================================================
FILE: .github/workflows/main.yml
================================================
# see https://github.com/karlicoss/pymplate for up-to-date reference
name: CI
on:
push:
branches: '*'
tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi
# Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug:
# Needed to trigger on others' PRs.
# Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them".
pull_request:
# Needed to trigger workflows manually.
workflow_dispatch:
inputs:
debug_enabled:
type: boolean
description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
required: false
default: false
schedule:
- cron: '31 18 * * 5' # run every Friday
jobs:
build:
strategy:
fail-fast: false
matrix:
platform: [ubuntu-latest, macos-latest] # windows-latest
python-version: ['3.12', '3.13', '3.14']
# vvv just an example of excluding stuff from matrix
# exclude: [{platform: macos-latest, python-version: '3.6'}]
runs-on: ${{ matrix.platform }}
# useful for 'optional' pipelines
# continue-on-error: ${{ matrix.platform == 'windows-latest' }}
steps:
# ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
- run: echo "$HOME/.local/bin" >> $GITHUB_PATH
- uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0 # nicer to have all git history when debugging/for tests
- uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- uses: astral-sh/setup-uv@v7
with:
enable-cache: false # we don't have lock files, so can't use them as cache key
- uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}
# explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd...
- run: bash .ci/run
env:
# only compute lxml coverage on ubuntu; it crashes on windows
CI_MYPY_COVERAGE: ${{ matrix.platform == 'ubuntu-latest' && '--cobertura-xml-report .coverage.mypy' || '' }}
- if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms
uses: codecov/codecov-action@v5
with:
fail_ci_if_error: true # default false
token: ${{ secrets.CODECOV_TOKEN }}
flags: mypy-${{ matrix.python-version }}
files: .coverage.mypy/cobertura.xml
pypi:
# Do not run it for PRs/cron schedule etc.
# NOTE: release tags are guarded by on: push: tags on the top.
if: github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') || (github.event.ref == format('refs/heads/{0}', github.event.repository.master_branch)))
# Ugh, I tried using matrix or something to explicitly generate only test pypi or prod pypi pipelines.
# But github actions is so shit, it's impossible to do any logic at all, e.g. doesn't support conditional matrix, if/else statements for variables etc.
needs: [build] # add all other jobs here
runs-on: ubuntu-latest
permissions:
# necessary for Trusted Publishing
id-token: write
steps:
# ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
- run: echo "$HOME/.local/bin" >> $GITHUB_PATH
- uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0 # pull all commits to correctly infer vcs version
- uses: actions/setup-python@v6
with:
python-version: '3.12'
- uses: astral-sh/setup-uv@v7
with:
enable-cache: false # we don't have lock files, so can't use them as cache key
- name: 'release to test pypi'
# always deploy merged master to test pypi
if: github.event.ref == format('refs/heads/{0}', github.event.repository.master_branch)
run: .ci/release --use-test-pypi
- name: 'release to prod pypi'
# always deploy tags to release pypi
if: startsWith(github.event.ref, 'refs/tags/')
run: .ci/release
================================================
FILE: .gitignore
================================================
# Created by https://www.gitignore.io/api/python,emacs
# Edit at https://www.gitignore.io/?templates=python,emacs
### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# End of https://www.gitignore.io/api/python,emacs
untracked/
================================================
FILE: .idea/dictionaries/karlicos.xml
================================================
cachew
dataclassish
pylint
typecheck
================================================
FILE: LICENSE.txt
================================================
The MIT License (MIT)
Copyright (c) 2019 Dima Gerasimov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"from pathlib import Path\n",
"\n",
"import jedi # ty: ignore[unresolved-import]\n",
"\n",
"\n",
"def git_root() -> Path:\n",
" import subprocess\n",
"\n",
" path_s = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], text=True).strip()\n",
" path = Path(path_s)\n",
" assert path.is_absolute(), path # just in case\n",
" return path\n",
"\n",
"\n",
"src_dir = git_root() / 'src'\n",
"assert src_dir.exists(), src_dir # seems like jedi is pretty quiet about missing dirs..\n",
"\n",
"project = jedi.Project(src_dir)\n",
"\n",
"\n",
"def _find(name: str):\n",
" # ugh. sometimes it returns exact dupes for no apparent reason??\n",
" completions = set(project.search(name, all_scopes=True))\n",
" assert len(completions) == 1, f\"Expected one completion for {name}, got {completions}\"\n",
" [c] = completions\n",
" [c] = c.goto() # todo what is this for?\n",
" return c\n",
"\n",
"\n",
"def rlink(name: str) -> str:\n",
" c = _find(name)\n",
" if c.module_path is None:\n",
" # TODO ugh raise an issue on tracker or something??\n",
" # seems to only happen for namespace packages..\n",
" assert c.description.startswith('namespace '), c\n",
" res = name.replace('.', '/')\n",
" assert (src_dir / res).exists(), res\n",
" return f'src/{res}'\n",
" else:\n",
" rpath = Path(c.module_path).relative_to(src_dir)\n",
" return f'src/{rpath}#L{c.line}'\n",
"\n",
"\n",
"# TODO ugh.. annoying, seems like Jedi can't get the functions source?\n",
"# maybe because it's doing partial parsing or something?\n",
"# there is c._get_module_context().code_lines, but it returns all lines in a source file??\n",
"def getsource(symbol: str) -> str:\n",
" c = _find(symbol)\n",
" p = Path(c.module_path)\n",
" # TODO check that it's a function?\n",
" function_name = symbol.split('.')[-1]\n",
" assert p.exists(), p\n",
" src = p.read_text()\n",
" src_lines = src.splitlines(keepends=True)\n",
" for x in ast.walk(ast.parse(src)):\n",
" if isinstance(x, ast.FunctionDef) and x.name == function_name:\n",
" break\n",
" else:\n",
" raise RuntimeError(f'Function not found: {symbol}')\n",
"\n",
" # ugh lineno is 1-indexed, and seems like a closed interval?\n",
" return ''.join(src_lines[x.lineno - 1 : x.end_lineno])\n",
"\n",
"\n",
"def getdoc(symbol: str) -> str:\n",
" c = _find(symbol)\n",
" doc = c.docstring()\n",
" assert doc is not None, symbol\n",
" return doc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO just get rid of this in favor of native markdown + rlink?\n",
"def flink(title: str, name: str | None = None) -> str:\n",
" if name is None:\n",
" name = title.replace('`', '') # meh\n",
" if name.startswith('tests'):\n",
" name = name.replace('tests', 'cachew.tests.test_cachew')\n",
" # FIXME just replace in code..\n",
"\n",
" return f\"[{title}]({rlink(name)})\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Markdown as md # ty: ignore[unresolved-import]\n",
"\n",
"dmd = lambda x: display(md(x.strip())) # ty: ignore[unresolved-reference]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"dmd('''\n",
"\n",
"''')"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# What is Cachew?\n",
"TLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.\n",
"Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.\n",
"\n",
"In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.\n",
"\n",
"A simple type is defined as:\n",
"\n",
"- primitive: `str`/`int`/`float`/`bool`\n",
"- JSON-like types (`dict`/`list`/`tuple`)\n",
"- `datetime`\n",
"- `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )\n",
"- [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)\n",
"- [dataclasses](https://docs.python.org/3/library/dataclasses.html)\n",
"\n",
"\n",
"That allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing.\n",
"Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.\n",
"\n",
"## Motivation\n",
"\n",
"I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting few bits I'm interested at. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.\n",
"\n",
"Conventional way of dealing with it is serializing results along with some sort of hash (e.g. md5) of input files,\n",
"comparing on the next run and returning cached data if nothing changed.\n",
"\n",
"Simple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task.\n",
"\n",
"\n",
"# Examples\n",
"## Processing Wikipedia\n",
"Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive.\n",
"Parsing it (`extract_links` function) takes hours, however, as long as the archive is same you will always get same results. So it would be nice to be able to cache the results somehow.\n",
"\n",
"\n",
"With this library you can achieve it through single `@cachew` decorator."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# FIXME hmm seems like this doesn't work if there are type annotations on cachew_impl? odd\n",
"# likely this? https://github.com/davidhalter/jedi/issues/2025\n",
"doc = getdoc('cachew_impl').split('Usage example:')[-1].lstrip()\n",
"dmd(f\"\"\"```python\n",
"{doc}\n",
"```\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.\n",
"\n",
"When you use newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Incremental data exports\n",
"This is my most common usecase of cachew, which I'll illustrate with example.\n",
"\n",
"I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.\n",
"Data is synchronized via bluetooth in the sqlite database, which is easy to access. However sensor has limited memory (e.g. 1000 latest measurements).\n",
"That means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.:\n",
"\n",
" ...\n",
" 20190715100026.db\n",
" 20190716100138.db\n",
" 20190717101651.db\n",
" 20190718100118.db\n",
" 20190719100701.db\n",
" ...\n",
"\n",
"To access **all** of historic temperature data, I have two options:\n",
"\n",
"- Go through all the data chunks every time I want to access them and 'merge' into a unified stream of measurements, e.g. something like:\n",
" \n",
" def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n",
" for chunk in chunks:\n",
" # read measurements from 'chunk' and yield unseen ones\n",
"\n",
" This is very **easy, but slow** and you waste CPU for no reason every time you need data.\n",
"\n",
"- Keep a 'master' database and write code to merge chunks in it.\n",
"\n",
" This is very **efficient, but tedious**:\n",
" \n",
" - requires serializing/deserializing data -- boilerplate\n",
" - requires manually managing sqlite database -- error prone, hard to get right every time\n",
" - requires careful scheduling, ideally you want to access new data without having to refresh cache\n",
"\n",
" \n",
"Cachew gives the best of two worlds and makes it both **easy and efficient**. The only thing you have to do is to decorate your function:\n",
"\n",
" @cachew \n",
" def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n",
" # ...\n",
" \n",
"- as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast\n",
"- you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. you got new data)\n",
"\n",
" All the complexity of handling database is hidden in `cachew` implementation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"link = rlink('composite_hash')\n",
"\n",
"dmd(f'''\n",
"# How it works\n",
"\n",
"- first your objects get {flink('converted', 'cachew.marshall.cachew.CachewMarshall')} into a simpler JSON-like representation\n",
"- after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).\n",
"\n",
"When the function is called, cachew [computes the hash of your function's arguments ]({link})\n",
"and compares it against the previously stored hash value.\n",
"\n",
"- If they match, it would deserialize and yield whatever is stored in the cache database\n",
"- If the hash mismatches, the original function is called and new data is stored along with the new hash\n",
"''')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"dmd('# Features')\n",
"types = [f'`{t}`' for t in ['str', 'int', 'float', 'bool', 'datetime', 'date', 'Exception']]\n",
"dmd(f\"\"\"\n",
"* automatic schema inference: {flink('1', 'tests.test_return_type_inference')}, {flink('2', 'tests.test_return_type_mismatch')}\n",
"* supported types:\n",
"\n",
" * primitive: {', '.join(types)}\n",
"\n",
" See {flink('tests.test_types')}, {flink('tests.test_primitive')}, {flink('tests.test_dates')}, {flink('tests.test_exceptions')}\n",
" * {flink('@dataclass and NamedTuple', 'tests.test_dataclass')}\n",
" * {flink('Optional', 'tests.test_optional')} types\n",
" * {flink('Union', 'tests.test_union')} types\n",
" * {flink('nested datatypes', 'tests.test_nested')}\n",
"\n",
"* detects {flink('datatype schema changes', 'tests.test_schema_change')} and discards old data automatically\n",
"\"\"\")\n",
"# * custom hash function TODO example with mtime?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Performance\n",
"Updating cache takes certain overhead, but that would depend on how complicated your datatype in the first place, so I'd suggest measuring if you're not sure.\n",
"\n",
"When reading from cache, all that happens is reading blobs from sqlite/decoding as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.\n",
"\n",
"It would almost certainly make your program faster if your computations take more than several seconds.\n",
"\n",
"You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"dmd(f\"\"\"\n",
"# Using\n",
"See {flink('docstring', 'cachew_impl')} for up-to-date documentation on parameters and return types.\n",
"You can also use {flink('extensive unit tests', 'tests')} as a reference.\n",
"\n",
"Some useful (but optional) arguments of `@cachew` decorator:\n",
"\n",
"* `cache_path` can be a directory, or a callable that {flink('returns a path', 'tests.test_callable_cache_path')} and depends on function's arguments.\n",
"\n",
" By default, `settings.DEFAULT_CACHEW_DIR` is used.\n",
"\n",
"* `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.\n",
"\n",
" By default it just uses string representation of the arguments, you can also specify a custom callable.\n",
"\n",
" For instance, it can be used to {flink('discard cache', 'tests.test_custom_hash')} if the input file was modified.\n",
"\n",
"* `cls` is the type that would be serialized.\n",
"\n",
" By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Installing\n",
"Package is available on [pypi](https://pypi.org/project/cachew/).\n",
"\n",
" pip3 install --user cachew\n",
" \n",
"## Developing\n",
"I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI."
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Implementation\n",
"\n",
"* why NamedTuples and dataclasses?\n",
" \n",
" `NamedTuple` and `dataclass` provide a very straightforward and self documenting way to represent data in Python.\n",
" Very compact syntax makes it extremely convenient even for one-off means of communicating between couple of functions.\n",
" \n",
" If you want to find out more why you should use more dataclasses in your code I suggest these links:\n",
" \n",
" - [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)\n",
" - [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)\n",
" \n",
"* why not `pandas.DataFrame`?\n",
"\n",
" DataFrames are great and can be serialised to csv or pickled.\n",
" They are good to have as one of the ways you can interface with your data, however hardly convenient to think about it abstractly due to their dynamic nature.\n",
" They also can't be nested.\n",
"\n",
"* why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?\n",
" \n",
" ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat an overkill for such a specific purpose.\n",
"\n",
" * E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires you using custom sqlalchemy specific types and inheriting a base class.\n",
" Also it doesn't support nested types.\n",
" \n",
"* why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?\n",
"\n",
" Pickling is kinda heavyweight for a plain data class, and it's slower than just using JSON. Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface.\n",
"\n",
" Marshmallow is a common way to map data into db-friendly format, but it requires explicit schema which is an overhead when you have it already in the form of type annotations. I've looked at existing projects to utilize type annotations, but didn't find them covering all I wanted:\n",
" \n",
" * https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api\n",
" * https://pypi.org/project/marshmallow-dataclass\n",
" \n",
" I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).\n",
" So far looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite.\n",
"\n",
"* why `sqlite` database for storage?\n",
"\n",
" It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.\n",
"\n",
" There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cache(backend='file')`, or via `settings.DEFAULT_BACKEND`.\n",
" It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.\n",
" \n",
" It would also be interesting to experiment with in-RAM storages.\n",
"\n",
" I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tips and tricks\n",
"## Optional dependency\n",
"You can benefit from `cachew` even if you don't want to bloat your app's dependencies. Just use the following snippet:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dmd(f\"\"\"```python\n",
"{getsource('cachew.extra.mcachew')}\n",
"```\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.\n",
"\n",
"## Settings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dmd(f'''\n",
"{flink('cachew.settings')} exposes some parameters that allow you to control `cachew` behaviour:\n",
"- `ENABLE`: set to `False` if you want to disable caching without removing the decorators (useful for testing and debugging).\n",
" You can also use {flink('cachew.extra.disabled_cachew')} context manager to do it temporarily.\n",
"- `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the \"user cache directory\" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).\n",
"- `THROW_ON_ERROR`: by default, cachew is defensive and simply attempts to call the original function on caching issues.\n",
" Set to `True` to catch errors earlier.\n",
"- `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).\n",
"\n",
"''')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Updating this readme\n",
"This is a literate readme, implemented as a Jupyter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use [generate-readme](generate-readme) script."
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "cachew",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
},
"name": "README.ipynb"
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: README.md
================================================
# What is Cachew?
TLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.
Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.
In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.
A simple type is defined as:
- primitive: `str`/`int`/`float`/`bool`
- JSON-like types (`dict`/`list`/`tuple`)
- `datetime`
- `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )
- [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)
- [dataclasses](https://docs.python.org/3/library/dataclasses.html)
That allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing.
Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.
## Motivation
I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting few bits I'm interested at. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.
Conventional way of dealing with it is serializing results along with some sort of hash (e.g. md5) of input files,
comparing on the next run and returning cached data if nothing changed.
Simple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task.
# Examples
## Processing Wikipedia
Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive.
Parsing it (`extract_links` function) takes hours, however, as long as the archive is the same you will always get the same results. So it would be nice to be able to cache the results somehow.
With this library you can achieve it through a single `@cachew` decorator.
```python
>>> from typing import NamedTuple, Iterator
>>> class Link(NamedTuple):
... url : str
... text: str
...
>>> @cachew
... def extract_links(archive_path: str) -> Iterator[Link]:
... for i in range(5):
... # simulate slow IO
... # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours
... import time; time.sleep(1)
... yield Link(url=f'http://link{i}.org', text=f'text {i}')
...
>>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run
[Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')]
>>> from timeit import Timer
>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1)
... # second run is cached, so should take less time
>>> print(f"call took {int(res)} seconds")
call took 0 seconds
>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1)
... # now file has changed, so the cache will be discarded
>>> print(f"call took {int(res)} seconds")
call took 5 seconds
```
When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.
When you use newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately.
## Incremental data exports
This is my most common usecase of cachew, which I'll illustrate with example.
I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.
Data is synchronized via bluetooth in the sqlite database, which is easy to access. However sensor has limited memory (e.g. 1000 latest measurements).
That means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.:
...
20190715100026.db
20190716100138.db
20190717101651.db
20190718100118.db
20190719100701.db
...
To access **all** of historic temperature data, I have two options:
- Go through all the data chunks every time I want to access them and 'merge' into a unified stream of measurements, e.g. something like:
def measurements(chunks: List[Path]) -> Iterator[Measurement]:
for chunk in chunks:
# read measurements from 'chunk' and yield unseen ones
This is very **easy, but slow** and you waste CPU for no reason every time you need data.
- Keep a 'master' database and write code to merge chunks in it.
This is very **efficient, but tedious**:
- requires serializing/deserializing data -- boilerplate
- requires manually managing sqlite database -- error prone, hard to get right every time
- requires careful scheduling, ideally you want to access new data without having to refresh cache
Cachew gives the best of two worlds and makes it both **easy and efficient**. The only thing you have to do is to decorate your function:
@cachew
def measurements(chunks: List[Path]) -> Iterator[Measurement]:
# ...
- as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast
- you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. you got new data)
All the complexity of handling database is hidden in `cachew` implementation.
# How it works
- first your objects get [converted](src/cachew/marshall/cachew.py#L29) into a simpler JSON-like representation
- after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).
When the function is called, cachew [computes the hash of your function's arguments ](src/cachew/__init__.py#L580)
and compares it against the previously stored hash value.
- If they match, it would deserialize and yield whatever is stored in the cache database
- If the hash mismatches, the original function is called and new data is stored along with the new hash
# Features
* automatic schema inference: [1](src/cachew/tests/test_cachew.py#L381), [2](src/cachew/tests/test_cachew.py#L395)
* supported types:
* primitive: `str`, `int`, `float`, `bool`, `datetime`, `date`, `Exception`
See [tests.test_types](src/cachew/tests/test_cachew.py#L682), [tests.test_primitive](src/cachew/tests/test_cachew.py#L720), [tests.test_dates](src/cachew/tests/test_cachew.py#L632), [tests.test_exceptions](src/cachew/tests/test_cachew.py#L1124)
* [@dataclass and NamedTuple](src/cachew/tests/test_cachew.py#L597)
* [Optional](src/cachew/tests/test_cachew.py#L524) types
* [Union](src/cachew/tests/test_cachew.py#L827) types
* [nested datatypes](src/cachew/tests/test_cachew.py#L440)
* detects [datatype schema changes](src/cachew/tests/test_cachew.py#L470) and discards old data automatically
# Performance
Updating cache takes certain overhead, but that would depend on how complicated your datatype in the first place, so I'd suggest measuring if you're not sure.
During reading cache all that happens is reading blobs from sqlite/decoding as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.
It would almost certainly make your program faster if your computations take more than several seconds.
You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py).
# Using
See [docstring](src/cachew/__init__.py#L279) for up-to-date documentation on parameters and return types.
You can also use [extensive unit tests](src/cachew/tests/test_cachew.py#L1) as a reference.
Some useful (but optional) arguments of `@cachew` decorator:
* `cache_path` can be a directory, or a callable that [returns a path](src/cachew/tests/test_cachew.py#L417) and depends on function's arguments.
By default, `settings.DEFAULT_CACHEW_DIR` is used.
* `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.
By default it just uses string representation of the arguments, you can also specify a custom callable.
For instance, it can be used to [discard cache](src/cachew/tests/test_cachew.py#L115) if the input file was modified.
* `cls` is the type that would be serialized.
By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.
# Installing
Package is available on [pypi](https://pypi.org/project/cachew/).
pip3 install --user cachew
## Developing
I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI.
# Implementation
* why NamedTuples and dataclasses?
`NamedTuple` and `dataclass` provide a very straightforward and self documenting way to represent data in Python.
Very compact syntax makes it extremely convenient even for one-off means of communicating between couple of functions.
If you want to find out more why you should use more dataclasses in your code I suggest these links:
- [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)
- [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)
* why not `pandas.DataFrame`?
DataFrames are great and can be serialised to csv or pickled.
They are good to have as one of the ways you can interface with your data, however hardly convenient to think about it abstractly due to their dynamic nature.
They also can't be nested.
* why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?
ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat an overkill for such a specific purpose.
* E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires you using custom sqlalchemy specific types and inheriting a base class.
Also it doesn't support nested types.
* why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?
  Pickling is kinda heavyweight for a plain data class, it's slower than just using JSON. Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface.
Marshmallow is a common way to map data into db-friendly format, but it requires explicit schema which is an overhead when you have it already in the form of type annotations. I've looked at existing projects to utilize type annotations, but didn't find them covering all I wanted:
* https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api
* https://pypi.org/project/marshmallow-dataclass
I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).
So far looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite.
* why `sqlite` database for storage?
It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.
There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cache(backend='file')`, or via `settings.DEFAULT_BACKEND`.
It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.
It would also be interesting to experiment with in-RAM storages.
I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.
# Tips and tricks
## Optional dependency
You can benefit from `cachew` even if you don't want to bloat your app's dependencies. Just use the following snippet:
```python
def mcachew(*args, **kwargs):
"""
Stands for 'Maybe cachew'.
Defensive wrapper around @cachew to make it an optional dependency.
"""
try:
import cachew
except ModuleNotFoundError:
import warnings
warnings.warn(
'cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew',
stacklevel=2,
)
return lambda orig_func: orig_func
else:
return cachew.cachew(*args, **kwargs)
```
Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.
## Settings
[cachew.settings](src/cachew/__init__.py#L55) exposes some parameters that allow you to control `cachew` behaviour:
- `ENABLE`: set to `False` if you want to disable caching without removing the decorators (useful for testing and debugging).
You can also use [cachew.extra.disabled_cachew](src/cachew/extra.py#L25) context manager to do it temporarily.
- `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the "user cache directory" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).
- `THROW_ON_ERROR`: by default, cachew is defensive and simply attempts to call the original function on caching issues.
Set to `True` to catch errors earlier.
- `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).
## Updating this readme
This is a literate readme, implemented as a Jupyter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use [generate-readme](generate-readme) script.
================================================
FILE: benchmarks/20230912-comparison-with-legacy.org
================================================
Running on @karlicoss desktop PC, =python3.10=.
This is basically to justify switching to the new serialization method
- old way, =legacy= used to 'flatten' the type into an sqlite row
- new way, =cachew=, just dumps it as a dict, then to bytes via =orjson= and stores in a single sqlite column
The numbers between legacy and cachew can't be directly compared though.
Legacy =serializing= step emits a tuple, which can be inserted directly into the database.
So to compare it with the new way, we need to compare with the sum of =serializing= + =json dump=.
That said this won't be exact comparison either, since legacy binder relied on sqlalchemy to dump custom types to sqlite types (e.g. =datetime= or =Exception=). So legacy will have a slight advantage this way, but it's fine.
So we can see that for:
- =test_union_str_dataclass=
- new implementation: =0.53 + 0.45s= to serialize; =0.29 + 0.48= to deserialize
- old implementation: =2.38s= to serialize; =1.92= to deserialize
- =test_nested_dataclass=
- new implementation: =1.05 + 0.26s= to serialize; =0.50 + 1.42= to deserialize
- old implementation: =1.92s= to serialize; =1.88= to deserialize
For both tests, serialization is quite a bit faster with the new implementation.
On the second test, they are on par for deserialization, but as I mention these numbers are in favor of the legacy implementation.
In addition, keeping everything in one column unlocks some other optimizations which wouldn't be possible with multiple columns.
#+begin_example
$ pytest --pyargs cachew.tests.marshall -k 'gc_off and 1000000 and not cattrs' -s
=========================================================== test session starts ============================================================
platform linux -- Python 3.10.12, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /code/cachew_jsonpickle
configfile: pytest.ini
plugins: anyio-3.6.2
collected 100 items / 95 deselected / 5 selected
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.34s
serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.53s
json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s
sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.08s
sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s
jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.18s
jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.13s
json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s
deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.48s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-legacy]
building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.35s
serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 2.38s
json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.22s
sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.06s
sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s
jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s
jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s
json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.23s
deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.92s
PASSED
src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-cachew]
building 1000000 objects of type .TE2'>: 0.58s
serializing 1000000 objects of type .TE2'>: 1.05s
json dump 1000000 objects of type .TE2'>: 0.26s
sqlite dump 1000000 objects of type .TE2'>: 1.03s
sqlite load 1000000 objects of type .TE2'>: 0.30s
jsonl dump 1000000 objects of type .TE2'>: 0.14s
jsonl load 1000000 objects of type .TE2'>: 0.14s
json load 1000000 objects of type .TE2'>: 0.50s
deserializing 1000000 objects of type .TE2'>: 1.42s
PASSED
src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-legacy]
building 1000000 objects of type .TE2'>: 0.56s
serializing 1000000 objects of type .TE2'>: 1.92s
json dump 1000000 objects of type .TE2'>: 0.21s
sqlite dump 1000000 objects of type .TE2'>: 0.99s
sqlite load 1000000 objects of type .TE2'>: 0.29s
jsonl dump 1000000 objects of type .TE2'>: 0.12s
jsonl load 1000000 objects of type .TE2'>: 0.12s
json load 1000000 objects of type .TE2'>: 0.24s
deserializing 1000000 objects of type .TE2'>: 1.88s
PASSED
#+end_example
================================================
FILE: benchmarks/20230912.org
================================================
Running on @karlicoss desktop PC, =python3.10=
- serializing/deserializing here refers to converting object to json-ish python dictionary (not actual json string!)
- json dump/json load refers to converting the dict above to a json string and back
- sqlite dump/jsonl dump refers to saving/loading these strings to a persistent storage
#+begin_example
$ pytest --pyargs --ignore-glob '*test_cachew*' -k marshall -s
=========================================================== test session starts ============================================================
platform linux -- Python 3.10.6, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3
cachedir: .pytest_cache
configfile: pytest.ini
plugins: anyio-3.6.2
collected 37 items / 8 deselected / 29 selected
src/cachew/marshall/cachew.py::test_serialize_and_deserialize PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
building 1000000 objects of type str | cachew.tests.marshall.Name: 0.60s
serializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.85s
json dump 1000000 objects of type str | cachew.tests.marshall.Name: 0.46s
sqlite dump 1000000 objects of type str | cachew.tests.marshall.Name: 1.11s
sqlite load 1000000 objects of type str | cachew.tests.marshall.Name: 0.31s
jsonl dump 1000000 objects of type str | cachew.tests.marshall.Name: 0.13s
jsonl load 1000000 objects of type str | cachew.tests.marshall.Name: 0.13s
json load 1000000 objects of type str | cachew.tests.marshall.Name: 1.04s
deserializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.86s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...)
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
building 5000000 objects of type str | cachew.tests.marshall.Name: 3.00s
serializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.38s
json dump 5000000 objects of type str | cachew.tests.marshall.Name: 2.14s
sqlite dump 5000000 objects of type str | cachew.tests.marshall.Name: 5.43s
sqlite load 5000000 objects of type str | cachew.tests.marshall.Name: 1.47s
jsonl dump 5000000 objects of type str | cachew.tests.marshall.Name: 0.62s
jsonl load 5000000 objects of type str | cachew.tests.marshall.Name: 0.64s
json load 5000000 objects of type str | cachew.tests.marshall.Name: 4.74s
deserializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.06s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...)
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...)
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
building 5000000 objects of type str | cachew.tests.marshall.Name: 1.77s
serializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.59s
json dump 5000000 objects of type str | cachew.tests.marshall.Name: 1.22s
sqlite dump 5000000 objects of type str | cachew.tests.marshall.Name: 5.28s
sqlite load 5000000 objects of type str | cachew.tests.marshall.Name: 1.58s
jsonl dump 5000000 objects of type str | cachew.tests.marshall.Name: 0.64s
jsonl load 5000000 objects of type str | cachew.tests.marshall.Name: 0.66s
json load 5000000 objects of type str | cachew.tests.marshall.Name: 1.53s
deserializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.60s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...)
src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
building 1000000 objects of type : 1.05s
serializing 1000000 objects of type : 1.28s
json dump 1000000 objects of type : 0.22s
sqlite dump 1000000 objects of type : 1.14s
sqlite load 1000000 objects of type : 0.30s
jsonl dump 1000000 objects of type : 0.14s
jsonl load 1000000 objects of type : 0.14s
json load 1000000 objects of type : 0.70s
deserializing 1000000 objects of type : 2.20s
PASSED
src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
building 5000000 objects of type : 5.08s
serializing 5000000 objects of type : 6.35s
json dump 5000000 objects of type : 1.13s
sqlite dump 5000000 objects of type : 5.58s
sqlite load 5000000 objects of type : 1.47s
jsonl dump 5000000 objects of type : 0.69s
jsonl load 5000000 objects of type : 0.70s
json load 5000000 objects of type : 6.85s
deserializing 5000000 objects of type : 11.10s
PASSED
src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
building 1000000 objects of type : 1.37s
serializing 1000000 objects of type : 1.25s
json dump 1000000 objects of type : 0.24s
sqlite dump 1000000 objects of type : 1.12s
sqlite load 1000000 objects of type : 0.29s
jsonl dump 1000000 objects of type : 0.14s
jsonl load 1000000 objects of type : 0.14s
json load 1000000 objects of type : 0.24s
deserializing 1000000 objects of type : 2.17s
PASSED
src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
building 5000000 objects of type : 5.10s
serializing 5000000 objects of type : 6.22s
json dump 5000000 objects of type : 1.17s
sqlite dump 5000000 objects of type : 5.43s
sqlite load 5000000 objects of type : 1.54s
jsonl dump 5000000 objects of type : 0.70s
jsonl load 5000000 objects of type : 0.71s
json load 5000000 objects of type : 1.22s
deserializing 5000000 objects of type : 10.97s
PASSED
src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
building 1000000 objects of type .TE2'>: 1.64s
serializing 1000000 objects of type .TE2'>: 1.43s
json dump 1000000 objects of type .TE2'>: 0.30s
sqlite dump 1000000 objects of type .TE2'>: 1.16s
sqlite load 1000000 objects of type .TE2'>: 0.30s
jsonl dump 1000000 objects of type .TE2'>: 0.15s
jsonl load 1000000 objects of type .TE2'>: 0.15s
json load 1000000 objects of type .TE2'>: 1.02s
deserializing 1000000 objects of type .TE2'>: 2.78s
PASSED
src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
building 1000000 objects of type .TE2'>: 1.88s
serializing 1000000 objects of type .TE2'>: 0.80s
json dump 1000000 objects of type .TE2'>: 0.31s
sqlite dump 1000000 objects of type .TE2'>: 1.39s
sqlite load 1000000 objects of type .TE2'>: 0.31s
jsonl dump 1000000 objects of type .TE2'>: 0.15s
jsonl load 1000000 objects of type .TE2'>: 0.15s
json load 1000000 objects of type .TE2'>: 1.03s
deserializing 1000000 objects of type .TE2'>: 2.61s
PASSED
src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
building 1000000 objects of type .TE2'>: 0.57s
serializing 1000000 objects of type .TE2'>: 1.08s
json dump 1000000 objects of type .TE2'>: 0.29s
sqlite dump 1000000 objects of type .TE2'>: 1.09s
sqlite load 1000000 objects of type .TE2'>: 0.30s
jsonl dump 1000000 objects of type .TE2'>: 0.15s
jsonl load 1000000 objects of type .TE2'>: 0.15s
json load 1000000 objects of type .TE2'>: 0.50s
deserializing 1000000 objects of type .TE2'>: 1.43s
PASSED
src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
building 1000000 objects of type .TE2'>: 0.57s
serializing 1000000 objects of type .TE2'>: 0.39s
json dump 1000000 objects of type .TE2'>: 0.29s
sqlite dump 1000000 objects of type .TE2'>: 1.16s
sqlite load 1000000 objects of type .TE2'>: 0.32s
jsonl dump 1000000 objects of type .TE2'>: 0.16s
jsonl load 1000000 objects of type .TE2'>: 0.15s
json load 1000000 objects of type .TE2'>: 0.50s
deserializing 1000000 objects of type .TE2'>: 1.29s
PASSED
============================================================ slowest durations =============================================================
44.87s call src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
38.76s call src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
28.65s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
20.05s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
9.82s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
9.51s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
8.37s call src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
8.20s call src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
6.45s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
5.93s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
5.78s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
3.98s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
0.01s call src/cachew/marshall/cachew.py::test_serialize_and_deserialize
(68 durations < 0.005s hidden. Use -vv to show these durations.)
========================================================= short test summary info ==========================================================
SKIPPED [6] src/cachew/tests/marshall.py:171: TODO need to adjust the handling of Union types..
SKIPPED [4] src/cachew/tests/marshall.py:194: TODO support datetime with pytz for cattrs
PASSED src/cachew/marshall/cachew.py::test_serialize_and_deserialize
PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
#+end_example
================================================
FILE: benchmarks/20230917.org
================================================
Running on @karlicoss desktop PC, =python3.10=
Just a comparison of =sqlite= and =file= backends.
#+begin_example
$ pytest --pyargs -k 'test_many and gc_off and 3000000' -s
src/cachew/tests/test_cachew.py::test_many[sqlite-gc_off-3000000] [INFO 2023-09-17 02:02:09,946 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many..iter_data: wrote 3000000 objects to cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many)
test_many: initial write to cache took 13.6s
test_many: cache size is 229.220352Mb
[INFO 2023-09-17 02:02:10,780 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many..iter_data: loading 3000000 objects from cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many)
test_many: reading from cache took 7.0s
PASSED
src/cachew/tests/test_cachew.py::test_many[file-gc_off-3000000] [INFO 2023-09-17 02:02:23,944 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many..iter_data: wrote 3000000 objects to cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many)
test_many: initial write to cache took 6.1s
test_many: cache size is 202.555667Mb
[INFO 2023-09-17 02:02:23,945 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many..iter_data: loading objects from cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many)
test_many: reading from cache took 5.4s
#+end_example
================================================
FILE: doc/cachew_disable.md
================================================
Can put this in the README.md once its been tested a bit
### Disable through Environment Variables
To disable a `cachew` function in some module, you can use the `CACHEW_DISABLE` environment variable. This is a colon-delimited (like a `$PATH`) list of modules to disable. It disables modules given some name recursively, and supports [unix-style globs](https://docs.python.org/3/library/fnmatch.html)
For example, say you were using [HPI](https://github.com/karlicoss/HPI) which internally uses a snippet like `mcachew` above. You may want to enable `cachew` for _most_ modules, but disable them for specific ones. For example take:
```
my/browser
├── active_browser.py
├── all.py
├── common.py
└── export.py
my/reddit
├── __init__.py
├── all.py
├── common.py
├── pushshift.py
└── rexport.py
```
To disable `cachew` in all of these files: `export CACHEW_DISABLE=my.browser:my.reddit` (disables for all submodules)
To disable just for a particular module: `export CACHEW_DISABLE='my.browser.export'`
Similarly to `$PATH` manipulations, you can do this in your shell configuration incrementally:
```
CACHEW_DISABLE='my.reddit.rexport'
if some condition...; then
CACHEW_DISABLE="my.browser.export:$CACHEW_DISABLE"
fi
export CACHEW_DISABLE
```
You can also use globs, e.g. `CACHEW_DISABLE='my.*.gdpr'`
To disable `cachew` everywhere, you could set `export CACHEW_DISABLE='*'`
================================================
FILE: doc/serialization.org
================================================
Cachew works kinda like =functools.lru_cache=, but it also works in-between program runs.
For that, it needs to somehow persist the objects on the disk (unlike =lru_cache= which just keeps references to the objects already in process memory).
While persisting objects to the cache, essentially cachew needs to map them into simpler types, i.e. ones you can keep in a database like strings/ints/binary blobs.
At the moment (as of =v0.13.0=), we use sqlite as the cache store, with =sqlalchemy= as the interface to interact with it.
The way cachew works now is, to save the object in cache:
- first it's "flattened out" to conform to the database row model, so individual fields (including recursive fields) become database columns
- python types are mapped into sqlalchemy types, with extra =sqlalchemy.TypeDecorator= instances to support custom types like =datetime= or =Exception=
You can find a more detailed example [[https://github.com/karlicoss/cachew/blob/175afade0a417bfd533ced174365d246b8a7dabc/src/cachew/__init__.py#L319-L353][here]].
A big problem is that in general it's not really possible to serialize, and especially to deserialize back an arbitrary object in Python, unless you resort to binary serialization like =pickle= (which is very slow and comes with its own hosts of issues).
However in cachew we require the user to supply the *type signature* for the functions that are cached, so we can benefit from it for serializing and deserializing.
Few years ago, when I implemented =cachew= at first, there weren't really many options for serialization driven by type signatures, so I implemented the custom code I mentioned above to support that. In 2023, however, more and more libraries are benefiting from type signatures, in particular for serializing stuff.
So I decided to give it another go, in hope of using some mature library, simplifying cachew's code, and possibly getting a performance boost.
It's possible that I missed some documentation so if you think the problems I am describing can actually be worked around, please don't hesitate to let me know.
* Comparison
In cachew the very minimum we're aiming to support are:
- all json-ish types, e.g. =int=/=str=/=dict=/=list= etc
- =dataclass= and =NamedTuple=
- =Optional= and =Union=
- custom types, e.g. =datetime=, =Exception= (e.g. at least preserve exception message)
See [[file:test_serialization.py]] for more specific examples and supporting evidence for my summary here.
** [[https://docs.python.org/3.10/library/pickle.html][pickle]]
Builtin pickle module can handle any objects, without even needing type annotations.
However, it's [[https://www.benfrederickson.com/dont-pickle-your-data/][famously very slow]], so I even didn't consider using it.
It's also not secure in general, although in our case we control the objects we save/load from cache, so it's not a big issue.
** [[https://github.com/jsonpickle/jsonpickle#readme][jsonpickle]]
Jsonpickle -- similar to pickle, can handle any types.
I [[https://github.com/karlicoss/cachew/commit/048df33e65560205d63845f022b027a27719ff48][gave it a go]] just in case, and it's an order of magnitude slower than custom serialization code I already had, which is a no-go.
** [[https://github.com/lidatong/dataclasses-json/#readme][dataclasses-json]]
# TODO link to code
- CON: requires annotating all dataclasses involved with =@dataclass_json=, recursively.
This is a blocker from using it in =cachew=.
- CON: requires the type to be a =@dataclass= to annotate
So if you have something simpler you'll have to wrap it into a dummy dataclass or something.
- PRO: supports =Union= correctly
** [[https://github.com/marshmallow-code/marshmallow][marshmallow]]
By default marshmallow doesn't support dataclasses or unions, but there are some extra packages
- for dataclasses https://github.com/lovasoa/marshmallow_dataclass
- PRO: doesn't require modifying the original class, handles recursion out of the box
- CON: doesn't handle =Union= correctly
This is a blocker for cachew.
In addition it has a custom implementation of Union handling (rather than e.g. relying on =python-marshmallow-union=).
- https://github.com/adamboche/python-marshmallow-union
I didn't even get to try it since if dataclasses don't work marshmallow is a no-go for me.
Plus for some reason =marshmallow_dataclass= has a custom Union handling implementation which is different from this one, so it's going to be a huge mess.
** [[https://github.com/pydantic/pydantic#readme][pydantic]]
- PRO: if you use =TypeAdapter=, you can serialize/deserialize arbitrary types without decorating/inheriting from =BaseModel=
- CON: doesn't handle =Union= correctly
Again, this is a bit blocker. I've created an issue on pydantic bug tracker here: https://github.com/pydantic/pydantic/issues/7391
Kind of sad, because otherwise pydantic seemed promising!
** [[https://github.com/python-attrs/cattrs#features][cattrs]]
- PRO: doesn't require modifying the classes you serialise
- PRO: rich feature set, clearly aiming to comply with standard python's typing annotations
- CON: there is an issue with handling =NamedTuple=
It isn't converted to a dictionary like =dataclass= does, [[https://github.com/python-attrs/cattrs/issues/425][likely a bug]]?
- =Union= types are supported, but require some extra configuration
Unions work, but you have to 'register' them first.
A bit annoying that this is necessary even for simple unions like =int | str=, although [[https://github.com/python-attrs/cattrs/issues/423][possible]] to workaround.
The plus side is that cattr has a builtin utility for Union type discrimination.
I guess for my application I could traverse the type and register all necessary Unions with =cattrs=?
# TODO create an issue to support opting in everywhere by default?
Since the above seems quite good, I did a quick cachew hack on [[https://github.com/karlicoss/cachew/tree/cattrs][cattrs branch]] to try and use it.
The pipeline is the following:
- serialize type to a dictionary with primitive types via =cattrs=
- serialize dictionary to a byte string via =orjson=
- persist the byte string as an sqlite database row
(for deserializing we just do the same in reverse)
You can find the results [[https://github.com/karlicoss/cachew/commit/82691b10cd1d4ced4862dff21cf038fb83f9525c][here]] -- cattrs proved to be quite a huge speedup over my custom serialization code!
It needs a bit more work and evaluation for use in =cachew=, however it's super promising!
# TODO https://catt.rs/en/stable/preconf.html#orjson
Some interesting reading about cattrs:
- https://threeofwands.com/why-cattrs-is-so-fast/#v2-the-genconverter
- https://threeofwands.com/why-i-use-attrs-instead-of-pydantic
* Verdict
The biggest shared issues are that most of these libraries:
- require modifying the original class definitions, either by inheriting or decorating
- don't handle =Union= at all or don't handle it correctly (usually relying on the structural equivalence rather than actual types)
So for most of them, I didn't even get to trying to support custom types and measuring performance with =cachew=.
Of all of them only =cattrs= stood out: it takes builtin python typing and performance very seriously, and is very configurable.
So if you need no bullshit serialization in python, I can definitely recommend it.
I might switch to it in [[https://github.com/karlicoss/promnesia][promnesia]] (where we have full control over the type we serialize in the database), and could potentially be used in HPI for [[https://github.com/karlicoss/HPI/blob/master/my/core/serialize.py][my.core.serialize]].
================================================
FILE: doc/test_serialization.py
================================================
#!/usr/bin/env python3
from dataclasses import dataclass
from typing import NamedTuple, Union
def test_dataclasses_json():
    # Reproduction script: documents dataclasses-json quirks found during cachew's
    # serialization library survey (see doc/serialization.org).
    # pip install dataclasses-json
    from dataclasses_json import dataclass_json

    @dataclass
    class Inner:
        value: int

    @dataclass
    class Outer:
        inner: Inner

    ### issue 1: requires @dataclass_json annotation on all involved dataclasses
    obj = Outer(inner=Inner(value=123))  # noqa: F841
    # we don't control the types that are passed to us, so we can't use the @dataclass_json
    # but we can just call the decorator directly
    # HOWEVER: this modifies the original class, Outer!!
    OuterJson = dataclass_json(Outer)  # noqa: F841
    # it adds 'from_dict', 'from_json', 'schema', 'to_dict', 'to_json' attributes to it
    # now if you try
    # print(OuterJson.schema().dump(obj))
    # you get a warning that it wants you to add annotations to Inner classes too.
    # this isn't really an option for us.
    ###

    ### issue 2: can't dump anything unless the top level type is a dataclass?
    ### could wrap into a dummy dataclass or something, but is wasteful in terms of performance
    ###

    ### nice thing: correctly serializes Union types, even if they share the same attributes
    @dataclass_json
    @dataclass
    class City:
        name: str

    @dataclass_json
    @dataclass
    class Country:
        name: str

    @dataclass_json
    @dataclass
    class WithUnion:
        union: Union[City, Country]  # noqa: UP007

    objs = [
        WithUnion(union=City(name='London')),
        WithUnion(union=Country(name='UK')),
    ]
    schema = WithUnion.schema()
    json = schema.dumps(objs, many=True)
    objs2 = schema.loads(json, many=True)
    print("objects ", objs)
    print("json ", json)
    # NOTE: it dumps [{"union": {"name": "London", "__type": "City"}}, {"union": {"name": "UK", "__type": "Country"}}]
    # so types are correctly distinguished
    print("restored ", objs2)
    assert objs == objs2, (objs, objs2)
    ###
def test_marshmallow_dataclass():
    # Reproduction script: documents marshmallow-dataclass quirks found during
    # cachew's serialization library survey (see doc/serialization.org).
    # pip3 install --user marshmallow-dataclass[union]
    import marshmallow_dataclass

    ### issue 1: the top level type has to be a dataclass?
    ### although possible that we could use regular marshmallow for that instead
    ###

    ### issue 2: doesn't handle unions correctly
    @dataclass
    class City:
        name: str

    @dataclass
    class Country:
        name: str

    @dataclass
    class WithUnion:
        union: Union[City, Country]  # noqa: UP007

    objs = [
        WithUnion(union=City(name="London")),
        WithUnion(union=Country(name="UK")),
    ]
    # NOTE: good, doesn't require adding annotations on the original classes
    schema = marshmallow_dataclass.class_schema(WithUnion)()
    json = schema.dumps(objs, many=True)
    objs2 = schema.loads(json, many=True)
    print("objects ", objs)
    print("json ", json)
    # NOTE: it dumps [{"union": {"value": 123}}, {"union": {"value": 123}}]
    # so it doesn't distinguish based on types => won't deserialize correctly
    print("restored ", objs2)
    # assert objs == objs2, (objs, objs2)
    # ^ this assert fails!
    ###
def test_pydantic():
    # Reproduction script: documents pydantic TypeAdapter quirks found during
    # cachew's serialization library survey (see doc/serialization.org).
    from pydantic import TypeAdapter

    ### issue: doesn't handle Unions correctly
    @dataclass
    class City:
        name: str

    @dataclass
    class Country:
        name: str

    @dataclass
    class WithUnion:
        union: Union[City, Country]  # noqa: UP007

    objs = [
        WithUnion(union=City(name="London")),
        WithUnion(union=Country(name="UK")),
    ]
    # NOTE: nice, doesn't require annotating the original classes with anything
    Schema = TypeAdapter(list[WithUnion])
    json = Schema.dump_python(
        objs,
        # round_trip: Whether to output the serialized data in a way that is compatible with deserialization
        # not sure, doesn't seem to impact anything..
        round_trip=True,
    )
    objs2 = Schema.validate_python(json)
    print("objects ", objs)
    print("json ", json)
    print("restored ", objs2)
    # assert objs == objs2, (objs, objs2)
    # ^ this assert fails!
    # created an issue https://github.com/pydantic/pydantic/issues/7391
    ###
def test_cattrs():
    # Reproduction script: documents cattrs behaviour found during cachew's
    # serialization library survey (see doc/serialization.org).
    from cattrs import Converter
    from cattrs.strategies import configure_tagged_union

    converter = Converter()

    ### issue: NamedTuples aren't unstructured? asked here https://github.com/python-attrs/cattrs/issues/425
    class X(NamedTuple):
        value: int

    d = converter.unstructure(X(value=123), X)  # noqa: F841
    # NOTE: this assert doesn't pass!
    # assert isinstance(d, dict)
    ###

    ### good: handles Union correctly (although some extra configuring required)
    @dataclass
    class City:
        name: str

    @dataclass
    class Country:
        name: str

    @dataclass
    class WithUnion:
        union: Union[City, Country]  # noqa: UP007

    objs = [
        WithUnion(union=City(name="London")),
        WithUnion(union=Country(name="UK")),
    ]
    configure_tagged_union(
        union=City | Country,
        converter=converter,
    )
    # NOTE: nice -- doesn't require decorating original classes
    json = converter.unstructure(objs, list[WithUnion])
    assert isinstance(json, list)
    objs2 = converter.structure(json, list[WithUnion])
    print("objects ", objs)
    # NOTE: dumps it as [{'union': {'name': 'London', '_type': 'City'}}, {'union': {'name': 'UK', '_type': 'Country'}}]
    print("json ", json)
    print("restored ", objs2)
    assert objs == objs2, (objs, objs2)
    ###

    ### issue: unions of simple types aren't supported?
    # see https://github.com/python-attrs/cattrs/issues/423
    mixed: list[int | str] = [
        123,
        'Jakarta',
    ]
    json = converter.unstructure(mixed, list[int | str])
    # NOTE: this fails
    # mixed2 = converter.structure(json , list[int | str])
    ###
# run all the experiments when executed as a script
test_dataclasses_json()
test_marshmallow_dataclass()
test_pydantic()
test_cattrs()
================================================
FILE: generate-readme
================================================
#!/bin/bash
# Regenerate README.md from README.ipynb (executes the notebook, exports to markdown).
set -eu
cd "$(dirname "$0")"

# --no-input seems to work well
# but if need more targeted approach, apparently can mark certain cells with tag and use '--TagRemovePreprocessor.remove_cell_tags={"noexport"}' ?
exec uvx --with jupyter --from jupyter-core jupyter nbconvert --execute --to markdown --no-input README.ipynb

# TODO run it on CI to make sure it renders and up to date?
================================================
FILE: github-issues.org
================================================
#+todo: OPEN | CLOSED
* Issues of cachew
:PROPERTIES:
:since:
:url: https://api.github.com/repos/karlicoss/cachew
:END:
** OPEN keep hash along each cached entity instead of separate table?
:PROPERTIES:
:tags: ("prio-B")
:id: 15
:date-modification: 2020-01-08T22:26:04+0000
:date-creation: 2020-01-08T22:26:04+0000
:author: "karlicoss"
:END:
: At the moment there are two separate tables: one for latest hash value, another for cached entities.
: It might be simpler and safer to keep a single table, with hash along with each cached entity.
:
** OPEN support multiple cached values?
:PROPERTIES:
:tags: ("prio-B")
:id: 14
:date-modification: 2020-01-08T22:26:03+0000
:date-creation: 2020-01-08T22:26:02+0000
:author: "karlicoss"
:END:
: At the moment it's LRU(1) cache, in some use cases it makes sense to cache more values though
:
** OPEN support pathlib.Path
:PROPERTIES:
:tags: ("prio-C")
:id: 13
:date-modification: 2020-01-08T22:26:02+0000
:date-creation: 2020-01-08T22:26:01+0000
:author: "karlicoss"
:END:
: Path is a trivial wrapper around str. I guess generally think of a good way to allow adhoc mapping of simple types.
: Perhaps current Exception makes sense.
:
** OPEN support defensive behaviour
:PROPERTIES:
:tags: ("prio-C")
:id: 12
:date-modification: 2020-01-08T22:26:01+0000
:date-creation: 2020-01-08T22:26:00+0000
:author: "karlicoss"
:END:
: E.g. if we can't serialize for some reason, bail the database but at least yield values anyway
:
** OPEN Add Redis support
:PROPERTIES:
:id: 9
:date-modification: 2020-01-06T00:48:59+0000
:date-creation: 2020-01-06T00:48:59+0000
:author: "softinio"
:END:
: Add Redis support as an alternative to sqlite
:
: This would be a great feature as it will make this solution easier to use in an enterprise production environment as getting a redis instance shared amongst multiple instances of your app is very easy and cost effective to use.
:
** OPEN better pytz support?
:PROPERTIES:
:tags: ("prio-C")
:id: 6
:date-modification: 2020-01-05T13:34:51+0000
:date-creation: 2020-01-05T13:33:25+0000
:author: "karlicoss"
:END:
** CLOSED Optional feature: Exception support
:PROPERTIES:
:id: 11
:date-modification: 2020-01-08T21:56:56+0000
:date-creation: 2020-01-08T21:34:03+0000
:author: "karlicoss"
:END:
** CLOSED Add doc on defensive/optional usage
:PROPERTIES:
:id: 10
:date-modification: 2020-01-06T23:48:54+0000
:date-creation: 2020-01-06T23:47:39+0000
:author: "karlicoss"
:END:
** CLOSED Safer concurrent writes handling
:PROPERTIES:
:id: 8
:date-modification: 2020-01-05T22:32:13+0000
:date-creation: 2020-01-05T22:08:24+0000
:author: "karlicoss"
:END:
** CLOSED Update readme
:PROPERTIES:
:id: 7
:date-modification: 2020-01-05T15:29:37+0000
:date-creation: 2020-01-05T15:24:38+0000
:author: "karlicoss"
:END:
** CLOSED support for dataclasses
:PROPERTIES:
:id: 1
:date-modification: 2020-01-05T13:34:50+0000
:date-creation: 2019-07-30T21:45:30+0100
:author: "karlicoss"
:END:
** CLOSED Fix Json support for python3.6
:PROPERTIES:
:id: 2
:date-modification: 2020-01-05T13:33:28+0000
:date-creation: 2019-12-08T12:21:58+0000
:author: "karlicoss"
:END:
** CLOSED Fix bug when default argument is explicitly specified
:PROPERTIES:
:id: 3
:date-modification: 2020-01-05T13:33:27+0000
:date-creation: 2019-12-08T17:56:51+0000
:author: "karlicoss"
:END:
** CLOSED Union types
:PROPERTIES:
:id: 4
:date-modification: 2020-01-05T13:33:27+0000
:date-creation: 2019-12-19T23:32:55+0000
:author: "karlicoss"
:END:
** CLOSED support top level primitive types
:PROPERTIES:
:id: 5
:date-modification: 2020-01-05T13:33:26+0000
:date-creation: 2019-12-20T00:09:00+0000
:author: "karlicoss"
:END:
================================================
FILE: misc/profile.py
================================================
#!/usr/bin/env python3
import sqlite3
from collections.abc import Iterator
from pathlib import Path
import sqlalchemy
from codetiming import Timer
from more_itertools import ilen
from cachew import cachew
# todo not sure it really helps much?
import gc  # isort: skip

# disable garbage collection for the whole run to reduce timing noise in the benchmarks
gc.disable()
def timer(name: str) -> Timer:
    """Build a context-manager timer that prints '<name>: <elapsed>s' on exit."""
    template = name + ': ' + '{:.2f}s'
    return Timer(name=name, text=template)
def test_ints() -> None:
    # Benchmark: cachew over a stream of ints, compared against reading the same
    # sqlite cache file directly (raw sqlite3 driver and sqlalchemy Core).
    N = 5_000_000

    base = Path('/tmp/cachew_profiling/')
    # shutil.rmtree(base)
    base.mkdir(exist_ok=True, parents=True)
    cache_path = base / 'ints'

    def fun_nocachew(n) -> Iterator[int]:
        yield from range(n)

    @cachew(cache_path=cache_path, force_file=True)
    def fun(n) -> Iterator[int]:
        yield from range(n)

    # with timer('no caching'):
    #     ilen(fun_nocachew(N))
    # with timer('initial call'):
    #     ilen(fun(N))
    # NOTE(review): assumes the cache was populated by a previous run (the
    # 'initial call' block above is commented out) -- confirm before profiling
    assert cache_path.exists()  # just in case

    # baseline 1: iterate all rows via the raw sqlite3 driver
    with timer('reading directly via sqlite'):
        total = 0
        with sqlite3.connect(cache_path) as conn:
            for (_x,) in conn.execute('SELECT * FROM cache'):
                total += 1
        assert total == N  # just in case

    # baseline 2: same rows, but through sqlalchemy Core
    with timer('reading directly via sqlalchemy'):
        total = 0
        engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}')

        from sqlalchemy import Column, MetaData, Table

        meta = MetaData()
        table_cache = Table('cache', meta, Column('_cachew_primitive', sqlalchemy.Integer))
        with engine.connect() as conn:
            with timer('sqlalchemy querying'):
                rows = conn.execute(table_cache.select())
                for (_x,) in rows:
                    total += 1
        engine.dispose()
        assert total == N  # just in case

    cache_size_mb = cache_path.stat().st_size / 10**6
    print(f'cache size: {cache_size_mb:.1f} Mb')

    # the measurement we actually care about: cachew reading from a warm cache
    with timer('subsequent call'):
        ilen(fun(N))


test_ints()
================================================
FILE: misc/test_redis/docker-compose.yml
================================================
services:
redis:
image: "redis:alpine"
# restart: always
command:
- "sh"
- "-euc"
- |
exec redis-server
# - |
# echo "requirepass '$$REDIS_PASSWORD'" > /etc/redis.conf
# exec redis-server /etc/redis.conf
# environment:
# REDIS_PASSWORD: "password"
ports:
- 6379:6379
volumes:
- "redis-cachew:/data:rw"
volumes:
redis-cachew:
================================================
FILE: misc/test_redis/test.py
================================================
#!/usr/bin/env python3
from time import time
import redis # ty: ignore[unresolved-import]
from loguru import logger # ty: ignore[unresolved-import]
from more_itertools import ilen
# connect to a local redis instance (see docker-compose.yml in this directory)
r = redis.Redis(host='localhost', port=6379, db=0)

# number of objects to write/read in the benchmark
N = 1_000_000
def items():
    """Yield the benchmark payload: N integers rendered as strings."""
    for i in range(N):
        yield str(i)
# redis key of the list tracking everything written by the benchmark
TAG = 'keys'
def reset():
    # drop the tracking list so each benchmark run starts from scratch
    r.delete(TAG)
def write():
    """Store each item as its own hash entry, recording the keys in a list."""
    for idx, payload in enumerate(items()):
        entry_key = f'obj:{idx}'
        r.hset(entry_key, 'data', payload)
        r.lpush(TAG, entry_key)
def read():
    """Fetch every tracked entry back one by one and count them."""
    tracked = r.lrange(TAG, 0, -1)
    fetched = (r.hget(k, 'data') for k in tracked)
    print('total', ilen(fetched))
# TODO could use lmove for atomic operations?
def write2():
    """Simpler variant: push the raw payloads straight onto a single redis list."""
    for payload in items():
        r.lpush(TAG, payload)
def read2():
    """Fetch the entire list with one lrange round-trip and count it."""
    everything = r.lrange(TAG, 0, -1)
    print('total', ilen(everything))
# benchmark driver: time the write phase and the read phase separately
reset()

a = time()
write2()
b = time()
logger.info(f'writing took {b - a:.1f}s')

a = time()
read2()
b = time()
logger.info(f'reading took {b - a:.1f}s')
# with read()/write()
# 100000 strings:
# 2023-09-09 01:50:23.498 | INFO | __main__::37 - writing took 13.1s
# 2023-09-09 01:50:30.052 | INFO | __main__::42 - reading took 6.6s
# hmm kinda slow..
# with read2/write2, writing about 7secs, and reading is instantaneous??
# for 1M objects, writing took 60 secs, and reading 0.2s?
# lol could be promising...
# I guess it's not iterative, but could retrieve items in batches?
================================================
FILE: mypy.ini
================================================
[mypy]
pretty = True
show_error_context = True
show_column_numbers = True
show_error_end = True
check_untyped_defs = True
# see https://mypy.readthedocs.io/en/stable/error_code_list2.html
warn_redundant_casts = True
strict_equality = True
warn_unused_ignores = True
enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable
# an example of suppressing
# [mypy-my.config.repos.pdfannots.pdfannots]
# ignore_errors = True
================================================
FILE: pyproject.toml
================================================
# see https://github.com/karlicoss/pymplate for up-to-date reference
[project]
dynamic = ["version"] # version is managed by build backend
name = "cachew"
dependencies = [
"platformdirs", # default cache dir
"sqlalchemy>=1.0", # cache DB interaction
"orjson", # fast json serialization
"typing-extensions",# for depreceated decorator
]
requires-python = ">=3.12"
## these need to be set if you're planning to upload to pypi
# description = "TODO"
license = {file = "LICENSE.txt"}
authors = [
{name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
]
maintainers = [
{name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
]
# keywords = []
# # see: http://pypi.python.org/pypi?%3Aaction=list_classifiers
# classifiers = [
# ]
[project.urls]
Homepage = "https://github.com/karlicoss/cachew"
##
[project.optional-dependencies]
optional = [
"colorlog",
]
[dependency-groups]
# TODO: not sure, on the one hand could just use 'standard' dev dependency group
# On the other hand, it's a bit annoying that it's always included by default?
# To make sure it's not included, need to use `uv run --exact --no-default-groups ...`
testing = [
"pytest>=9", # need version 9 for proper namespace package support
"ruff",
"pytz",
"more-itertools",
"patchy", # for injecting sleeps and testing concurrent behaviour
"enlighten", # used in logging helper, but not really required
"cattrs", # benchmarking alternative marshalling implementation
"pyinstrument", # for profiling from within tests
"codetiming", # Timer context manager
]
typecheck = [
{ include-group = "testing" },
"mypy",
"lxml", # for mypy html coverage
"ty>=0.0.3",
"types-pytz", # optional runtime only dependency
"cachew[optional]",
]
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"
# unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894
[tool.hatch.build.targets.wheel]
packages = ["src/cachew"]
[tool.hatch.version]
source = "vcs"
[tool.hatch.version.raw-options]
version_scheme = "python-simplified-semver"
local_scheme = "dirty-tag"
================================================
FILE: pytest.ini
================================================
[pytest]
# discover files that don't follow test_ naming. Useful to keep tests along with the source code
python_files = *.py
# this is necessary for --pyargs to discover implicit namespace packages correctly
consider_namespace_packages = true
# see https://docs.pytest.org/en/stable/reference/reference.html#confval-strict
# disable for now -- some macos tests ('file backend') are flaky
# strict = true
addopts =
# prevent pytest cache from being created... it craps into project dir and I never use it anyway
-p no:cacheprovider
# -rap to print tests summary even when they are successful
-rap
--verbose
# otherwise it won't discover doctests
--doctest-modules
# show all test durations (unless they are too short)
--durations=0
================================================
FILE: ruff.toml
================================================
line-length = 120 # impacts import sorting
lint.extend-select = [
"ALL",
]
lint.ignore = [
"D", # annoying nags about docstrings
"N", # pep naming
"TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING
"S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks
"DTZ", # datetimes checks -- complaining about missing tz and mostly false positives
"FIX", # complains about fixmes/todos -- annoying
"TD", # complains about todo formatting -- too annoying
"ANN", # missing type annotations? seems way to strict though
"EM" , # suggests assigning all exception messages into a variable first... pretty annoying
### too opinionated style checks
"E501", # too long lines
"E731", # assigning lambda instead of using def
"E741", # Ambiguous variable name: `l`
"E742", # Ambiguous class name: `O
"E401", # Multiple imports on one line
"F403", # import *` used; unable to detect undefined names
###
###
"E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing..
"F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew)
## might be nice .. but later and I don't wanna make it strict
"E402", # Module level import not at top of file
### these are just nitpicky, we usually know better
"PLR0911", # too many return statements
"PLR0912", # too many branches
"PLR0913", # too many function arguments
"PLR0915", # too many statements
"PLR1714", # consider merging multiple comparisons
"PLR2044", # line with empty comment
"PLR5501", # use elif instead of else if
"PLR2004", # magic value in comparison -- super annoying in tests
###
"PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check
"B009", # calling gettattr with constant attribute -- this is useful to convince mypy
"B010", # same as above, but setattr
"B017", # pytest.raises(Exception)
"B023", # seems to result in false positives?
# complains about useless pass, but has sort of a false positive if the function has a docstring?
# this is common for click entrypoints (e.g. in __main__), so disable
"PIE790",
# a bit too annoying, offers to convert for loops to list comprehension
# , which may hurt readability
"PERF401",
# suggests no using exception in for loops
# we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost"
"PERF203",
"RET504", # unnecessary assignment before returning -- that can be useful for readability
"RET505", # unnecessary else after return -- can hurt readability
"PLW0603", # global variable update.. we usually know why we are doing this
"PLW2901", # for loop variable overwritten, usually this is intentional
"PT011", # pytest raises is too broad
"COM812", # trailing comma missing -- mostly just being annoying with long multiline strings
"TRY003", # suggests defining exception messages in exception class -- kinda annoying
"TRY201", # raise without specifying exception name -- sometimes hurts readability
"TRY400", # a bit dumb, and results in false positives (see https://github.com/astral-sh/ruff/issues/18070)
"TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging
"TID252", # Prefer absolute imports over relative imports from parent modules
## too annoying
"T20", # just complains about prints and pprints (TODO maybe consider later?)
"Q", # flake quotes, too annoying
"C90", # some complexity checking
"G004", # logging statement uses f string
"ERA001", # commented out code
"SLF001", # private member accessed
"BLE001", # do not catch 'blind' Exception
"INP001", # complains about implicit namespace packages
"SIM102", # if statements collapsing, often hurts readability
"SIM103", # multiple conditions collapsing, often hurts readability
"SIM105", # suggests using contextlib.suppress instad of try/except -- this wouldn't be mypy friendly
"SIM108", # suggests using ternary operation instead of if -- hurts readability
"SIM110", # suggests using any(...) instead of for look/return -- hurts readability
"SIM117", # suggests using single with statement instead of nested -- doesn't work in tests
"RSE102", # complains about missing parens in exceptions
##
"PLC0415", # "imports should be at the top level" -- not realistic
"ISC001", # implicit string concatenation -- we do use it in tests
]
extend-exclude = [
"src/cachew/legacy.py", # TODO dunno, remove it for good?
]
================================================
FILE: src/cachew/__init__.py
================================================
import fnmatch
import functools
import importlib.metadata
import inspect
import json
import logging
import os
import stat
import warnings
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Literal,
cast,
get_args,
get_origin,
get_type_hints,
overload,
)
try:
    # orjson might not be available on some architectures, so let's make it defensive just in case
    from orjson import dumps as orjson_dumps
    from orjson import loads as orjson_loads
except:
    warnings.warn("orjson couldn't be imported. It's _highly_ recommended for better caching performance", stacklevel=2)

    def orjson_dumps(*args, **kwargs):  # type: ignore[misc]
        """Stdlib fallback mimicking orjson.dumps: serialize to a json bytestring."""
        # stdlib json produces str, but sqlite needs a blob -- encode it
        serialized = json.dumps(*args, **kwargs)
        return serialized.encode('utf8')

    # json.loads accepts bytes as well, so it's a drop-in replacement for orjson.loads
    orjson_loads = json.loads  # ty: ignore[invalid-assignment]
import platformdirs
from .backend.common import AbstractBackend
from .backend.file import FileBackend
from .backend.sqlite import SqliteBackend
from .common import CachewException, SourceHash, TypeNotSupported
from .logging_helper import make_logger
from .marshall.cachew import CachewMarshall, build_schema
from .utils import resolve_type_parameters
# in case of changes in the way cachew stores data, this should be changed to discard old caches
CACHEW_VERSION: str = importlib.metadata.version(__name__)

# anything that can be interpreted as a filesystem path
type PathIsh = Path | str

# name of the storage implementation used for the cache
Backend = Literal['sqlite', 'file']
class settings:
    '''
    Global settings, you can override them after importing cachew
    '''

    '''
    Toggle to disable caching
    '''
    ENABLE: bool = True

    # where cache databases live when the user doesn't pass an explicit cache path
    DEFAULT_CACHEW_DIR: PathIsh = Path(platformdirs.user_cache_dir('cachew'))

    '''
    Set to true if you want to fail early. Otherwise falls back to non-cached version
    '''
    THROW_ON_ERROR: bool = False

    # storage backend to use unless overridden per call site
    DEFAULT_BACKEND: Backend = 'sqlite'
def get_logger() -> logging.Logger:
    """Return the logger used by cachew, named after this package."""
    return make_logger(__name__)
# maps user-facing backend name to its implementation class
BACKENDS: dict[Backend, type[AbstractBackend]] = {
    'file': FileBackend,
    'sqlite': SqliteBackend,
}

# either a literal cache path, or a callable computing it from the wrapped function's arguments
type PathProvider[**P] = PathIsh | Callable[P, PathIsh]
# computes a hash of the wrapped function's arguments, used to decide when the cache is stale
type HashFunction[**P] = Callable[P, SourceHash]
def default_hash(*args, **kwargs) -> SourceHash:
    """
    Default hash function: stringify positional args followed by kwargs sorted by key.

    TODO eh, demand hash? it's not safe either... ugh
    can lead to weird consequences otherwise..
    """
    combined = (*args, *sorted(kwargs.items()))
    return str(combined)  # good enough??
# TODO give it as an example in docs
def mtime_hash(path: Path, *args, **kwargs) -> SourceHash:
    """Hash function folding the file's mtime in, so the cache is discarded when the file changes."""
    mtime = path.stat().st_mtime
    return default_hash(f'{path}.{mtime}', *args, **kwargs)
# deliberately a plain assignment rather than a `type` alias -- used in runtime type checks
Failure = str

# whether the cached function returns a single value or a stream of them
type Kind = Literal['single', 'multiple']
# (kind, type of the cached value(s)) as inferred from the return annotation
type Inferred = tuple[Kind, type[Any]]
def infer_return_type(func) -> Failure | Inferred:
    """
    Infer the type to cache (and whether func returns a single value or an iterable of them)
    from func's return type annotation; returns an error string (Failure) if inference fails.

    >>> def const() -> int:
    ...     return 123
    >>> infer_return_type(const)
    ('single', <class 'int'>)

    >>> from typing import Optional
    >>> def first_character(s: str) -> Optional[str]:
    ...     return None if len(s) == 0 else s[0]
    >>> kind, opt = infer_return_type(first_character)
    >>> # in 3.8, Optional[str] is printed as Union[str, None], so need to hack around this
    >>> (kind, opt == Optional[str])
    ('single', True)

    # tuple is an iterable.. but presumably should be treated as a single value
    >>> from typing import Tuple
    >>> def a_tuple() -> Tuple[int, str]:
    ...     return (123, 'hi')
    >>> infer_return_type(a_tuple)
    ('single', tuple[int, str])

    >>> from typing import Collection, NamedTuple
    >>> class Person(NamedTuple):
    ...     name: str
    ...     age: int
    >>> def person_provider() -> Collection[Person]:
    ...     return []
    >>> infer_return_type(person_provider)
    ('multiple', <class 'cachew.Person'>)

    >>> def single_str() -> str:
    ...     return 'hello'
    >>> infer_return_type(single_str)
    ('single', <class 'str'>)

    >>> def single_person() -> Person:
    ...     return Person(name="what", age=-1)
    >>> infer_return_type(single_person)
    ('single', <class 'cachew.Person'>)

    >>> from typing import Sequence
    >>> def int_provider() -> Sequence[int]:
    ...     return (1, 2, 3)
    >>> infer_return_type(int_provider)
    ('multiple', <class 'int'>)

    >>> from typing import Iterator
    >>> def union_provider() -> Iterator[str | int]:
    ...     yield 1
    ...     yield 'aaa'
    >>> infer_return_type(union_provider)
    ('multiple', str | int)

    >>> from typing import Iterator
    >>> type Str = str
    >>> type Int = int
    >>> type IteratorStrInt = Iterator[Str | Int]
    >>> def iterator_str_int() -> IteratorStrInt:
    ...     yield 1
    ...     yield 'aaa'
    >>> infer_return_type(iterator_str_int)
    ('multiple', str | int)

    # a bit of an edge case
    >>> from typing import Tuple
    >>> def empty_tuple() -> Iterator[Tuple[()]]:
    ...     yield ()
    >>> infer_return_type(empty_tuple)
    ('multiple', tuple[()])

    ... # doctest: +ELLIPSIS
    >>> def untyped():
    ...     return 123
    >>> infer_return_type(untyped)
    'no return type annotation...'

    >>> from typing import List
    >>> class Custom:
    ...     pass
    >>> def unsupported() -> Custom:
    ...     return Custom()
    >>> infer_return_type(unsupported)
    "can't infer type from <class 'cachew.Custom'>: can't cache <class 'cachew.Custom'>"

    >>> def unsupported_list() -> List[Custom]:
    ...     return [Custom()]
    >>> infer_return_type(unsupported_list)
    "can't infer type from list[cachew.Custom]: can't cache <class 'cachew.Custom'>"
    """
    try:
        hints = get_type_hints(func)
    except Exception as ne:
        # get_type_hints might fail if types are forward defined or missing
        # see test_future_annotation for an example
        return str(ne)
    rtype = hints.get('return', None)
    if rtype is None:
        return f"no return type annotation on {func}"
    # expand `type X = ...` aliases / generic parameters into concrete types
    rtype = resolve_type_parameters(rtype)

    def bail(reason: str) -> str:
        return f"can't infer type from {rtype}: " + reason

    # first we wanna check if the top level type is some sort of iterable that makes sense ot cache
    # e.g. List/Sequence/Iterator etc
    return_multiple = _returns_multiple(rtype)

    if return_multiple:
        # then the actual type to cache will be the argument of the top level one
        args = get_args(rtype)
        if args is None:
            return bail("has no __args__")
        if len(args) != 1:
            return bail(f"wrong number of __args__: {args}")
        (cached_type,) = args
    else:
        cached_type = rtype

    try:
        # check the type is actually serializable by cachew before committing to it
        build_schema(Type=cached_type)
    except TypeNotSupported as ex:
        return bail(f"can't cache {ex.type_}")

    return ('multiple' if return_multiple else 'single', cached_type)
def _returns_multiple(rtype) -> bool:
origin = get_origin(rtype)
if origin is None:
return False
if origin is tuple:
# usually tuples are more like single values rather than a sequence? (+ this works for namedtuple)
return False
try:
return issubclass(origin, Iterable)
except TypeError:
# that would happen if origin is not a 'proper' type, e.g. is a Union or something
# seems like exception is the easiest way to check
return False
# https://stackoverflow.com/questions/653368/how-to-create-a-python-decorator-that-can-be-used-either-with-or-without-paramet
def doublewrap(f):
    """
    Make decorator `f` usable both bare (@f) and with arguments (@f(...)).
    """

    @functools.wraps(f)
    def new_dec(*args, **kwargs):
        used_bare = len(args) == 1 and not kwargs and callable(args[0])
        if used_bare:
            # @decorator -- the decorated function is the sole argument
            return f(args[0])
        # @decorator(...) -- remember the arguments, wait for the actual function
        return lambda realf: f(realf, *args, **kwargs)

    return new_dec
def cachew_error(e: Exception, *, logger: logging.Logger) -> None:
    """
    Handle an error during cache setup: propagate it when settings.THROW_ON_ERROR is set,
    otherwise log it so the caller can fall back to the non-cached code path.
    """
    if not settings.THROW_ON_ERROR:
        logger.error("error while setting up cache, falling back to non-cached version")
        logger.exception(e)
        return
    # TODO would be nice to throw from the original code line -- maybe mess with the stack here?
    raise e
# sentinel for 'cache_path not passed' -- distinct from None, which means caching is disabled
use_default_path = cast(Path, object())
# using cachew_impl here just to use different signatures during type checking (see below)
@doublewrap
def cachew_impl[**P](
func=None, # TODO should probably type it after switch to python 3.10/proper paramspec
cache_path: PathProvider[P] | None = use_default_path,
*,
force_file: bool = False,
cls: type | tuple[Kind, type] | None = None,
depends_on: HashFunction[P] = default_hash,
logger: logging.Logger | None = None,
chunk_by: int = 100,
# NOTE: allowed values for chunk_by depend on the system.
# some systems (to be more specific, sqlite builds), it might be too large and cause issues
# ideally this would be more defensive/autodetected, maybe with a warning?
# you can use 'test_many' to experiment
# - too small values (e.g. 10) are slower than 100 (presumably, too many sql statements)
# - too large values (e.g. 10K) are slightly slower as well (not sure why?)
synthetic_key: str | None = None,
backend: Backend | None = None,
**kwargs,
):
r"""
Database-backed cache decorator. TODO more description?
# TODO use this doc in readme?
:param cache_path: if not set, `cachew.settings.DEFAULT_CACHEW_DIR` will be used.
:param force_file: if set to True, assume `cache_path` is a regular file (instead of a directory)
:param cls: if not set, cachew will attempt to infer it from return type annotation. See :func:`infer_return_type` and :func:`cachew.tests.test_cachew.test_return_type_inference`.
:param depends_on: hash function to determine whether the underlying . Can potentially benefit from the use of side effects (e.g. file modification time). TODO link to test?
:param logger: custom logger, if not specified will use logger named `cachew`. See :func:`get_logger`.
:return: iterator over original or cached items
Usage example:
>>> from typing import NamedTuple, Iterator
>>> class Link(NamedTuple):
... url : str
... text: str
...
>>> @cachew
... def extract_links(archive_path: str) -> Iterator[Link]:
... for i in range(5):
... # simulate slow IO
... # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours
... import time; time.sleep(1)
... yield Link(url=f'http://link{i}.org', text=f'text {i}')
...
>>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run
[Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')]
>>> from timeit import Timer
>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1)
... # second run is cached, so should take less time
>>> print(f"call took {int(res)} seconds")
call took 0 seconds
>>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1)
... # now file has changed, so the cache will be discarded
>>> print(f"call took {int(res)} seconds")
call took 5 seconds
"""
if logger is None:
module_name = getattr(func, '__module__', None)
if module_name is not None and module_name in logging.Logger.manager.loggerDict:
# if logger for the function's module already exists, reuse it
logger = logging.getLogger(module_name)
else:
# rely on default cachew logger
logger = get_logger()
class AddFuncName(logging.LoggerAdapter):
def process(self, msg, kwargs):
extra = self.extra
assert extra is not None
func_name = extra['func_name']
return f'[{func_name}] {msg}', kwargs
assert func is not None
func_name = callable_name(func)
adapter = AddFuncName(logger, {'func_name': func_name})
logger = cast(logging.Logger, adapter)
hashf = kwargs.get('hashf')
if hashf is not None:
warnings.warn("'hashf' is deprecated. Please use 'depends_on' instead", stacklevel=2)
depends_on = hashf
# todo not very nice that ENABLE check is scattered across two places
if not settings.ENABLE or cache_path is None:
logger.debug('cache explicitly disabled (settings.ENABLE is False or cache_path is None)')
return func
if cache_path is use_default_path:
cache_path = settings.DEFAULT_CACHEW_DIR
logger.debug(f'no cache_path specified, using the default {cache_path}')
use_kind: Kind | None = None
use_cls: type | None = None
if cls is not None:
# defensive here since typing. objects passed as cls might fail on isinstance
try:
is_tuple = isinstance(cls, tuple)
except:
is_tuple = False
if is_tuple:
use_kind, use_cls = cls # type: ignore[misc]
else:
use_kind = 'multiple'
use_cls = cls # type: ignore[assignment]
# TODO fuzz infer_return_type, should never crash?
inference_res = infer_return_type(func)
if isinstance(inference_res, Failure):
msg = f"failed to infer cache type: {inference_res}. See https://github.com/karlicoss/cachew#features for the list of supported types."
if use_cls is None:
ex = CachewException(msg)
cachew_error(ex, logger=logger)
return func
else:
# it's ok, assuming user knows better
logger.debug(msg)
assert use_kind is not None
else:
(inferred_kind, inferred_cls) = inference_res
if use_cls is None:
logger.debug(f'using inferred type {inferred_kind} {inferred_cls}')
(use_kind, use_cls) = (inferred_kind, inferred_cls)
else:
assert use_kind is not None
if (use_kind, use_cls) != inference_res:
logger.warning(
f"inferred type {inference_res} mismatches explicitly specified type {(use_kind, use_cls)}"
)
# TODO not sure if should be more serious error...
if use_kind == 'single':
# pretend it's an iterable, this is just simpler for cachew_wrapper
@functools.wraps(func)
def _func(*args, **kwargs):
return [func(*args, **kwargs)]
else:
_func = func
assert use_cls is not None
ctx = Context(
func =_func,
cache_path =cache_path,
force_file =force_file,
cls_ =use_cls,
depends_on =depends_on,
logger =logger,
chunk_by =chunk_by,
synthetic_key=synthetic_key,
backend =backend,
) # fmt: skip
# hack to avoid extra stack frame (see test_recursive*)
@functools.wraps(func)
def binder(*args, **kwargs):
kwargs['_cachew_context'] = ctx
res = cachew_wrapper(*args, **kwargs)
if use_kind == 'single':
lres = list(res)
assert len(lres) == 1, lres # shouldn't happen
return lres[0]
return res
return binder
if TYPE_CHECKING:
    # we need two versions due to @doublewrap
    # this is when we just annotate as @cachew without any args
    @overload
    def cachew[F: Callable](fun: F) -> F: ...

    # NOTE: we won't really be able to make sure the args of cache_path are the same as args of the wrapped function
    # because when cachew() is called, we don't know anything about the wrapped function yet
    # but at least it works for checking that cachew_path and depdns_on have the same args :shrug:
    @overload
    def cachew[F, **P](
        cache_path: PathProvider[P] | None = ...,
        *,
        force_file: bool = ...,
        cls: type | tuple[Kind, type] | None = ...,
        depends_on: HashFunction[P] = ...,
        logger: logging.Logger | None = ...,
        chunk_by: int = ...,
        synthetic_key: str | None = ...,
        backend: Backend | None = ...,
    ) -> Callable[[F], F]: ...

    def cachew(*args, **kwargs):  # make ty happy
        raise NotImplementedError

else:
    # at runtime the public 'cachew' name is simply the implementation above
    cachew = cachew_impl
def callable_name(func: Callable) -> str:
# some functions don't have __module__
mod = getattr(func, '__module__', None) or ''
return f'{mod}:{getattr(func, "__qualname__")}'
def callable_module_name(func: Callable) -> str | None:
return getattr(func, '__module__', None)
# could cache this, but might be worth not to, so the user can change it on the fly?
def _parse_disabled_modules(logger: logging.Logger | None = None) -> list[str]:
# e.g. CACHEW_DISABLE=my.browser:my.reddit
if 'CACHEW_DISABLE' not in os.environ:
return []
disabled = os.environ['CACHEW_DISABLE']
if disabled.strip() == '':
return []
if ',' in disabled and logger:
logger.warning(
'CACHEW_DISABLE contains a comma, but this expects a $PATH-like, colon-separated list; '
f'try something like CACHEW_DISABLE={disabled.replace(",", ":")}'
)
# remove any empty strings incase did something like CACHEW_DISABLE=my.module:$CACHEW_DISABLE
return [p for p in disabled.split(':') if p.strip() != '']
def _matches_disabled_module(module_name: str, pattern: str) -> bool:
'''
>>> _matches_disabled_module('my.browser', 'my.browser')
True
>>> _matches_disabled_module('my.browser', 'my.*')
True
>>> _matches_disabled_module('my.browser', 'my')
True
>>> _matches_disabled_module('my.browser', 'my.browse*')
True
>>> _matches_disabled_module('my.browser.export', 'my.browser')
True
>>> _matches_disabled_module('mysomething.else', '*') # CACHEW_DISABLE='*' disables everything
True
>>> _matches_disabled_module('my.browser', 'my.br?????') # fnmatch supports unix-like patterns
True
>>> _matches_disabled_module('my.browser', 'my.browse')
False
>>> _matches_disabled_module('mysomething.else', 'my') # since not at '.' boundary, doesn't match
False
>>> _matches_disabled_module('mysomething.else', '')
False
>>> _matches_disabled_module('my.browser', 'my.browser.export')
False
'''
if module_name == pattern:
return True
module_parts = module_name.split('.')
pattern_parts = pattern.split('.')
# e.g. if pattern is 'module.submod.inner_module' and module is just 'module.submod'
# theres no possible way for it to match
if len(module_parts) < len(pattern_parts):
return False
for mp, pp in zip(module_parts, pattern_parts, strict=False):
if fnmatch.fnmatch(mp, pp):
continue
return False
return True
def _module_is_disabled(module_name: str, logger: logging.Logger) -> bool:
    """Whether caching is disabled for module_name via the CACHEW_DISABLE environment variable."""
    for pattern in _parse_disabled_modules(logger):
        if not _matches_disabled_module(module_name, pattern):
            continue
        logger.debug(
            f"caching disabled for {module_name} (matched '{pattern}' from 'CACHEW_DISABLE={os.environ['CACHEW_DISABLE']})'"
        )
        return True
    return False
# fmt: off
# special kwarg / hash-dict keys used by the 'synthetic key' cache-reuse machinery (see cachew_wrapper)
_CACHEW_CACHED = 'cachew_cached' # TODO add to docs
_SYNTHETIC_KEY = 'synthetic_key'
_SYNTHETIC_KEY_VALUE = 'synthetic_key_value'
_DEPENDENCIES = 'dependencies'
# fmt: on
@dataclass
class Context[**P]:
    """
    Bundle of per-decoration parameters, passed from cachew_impl to cachew_wrapper
    via the '_cachew_context' kwarg.
    """

    # fmt: off
    func         : Callable
    cache_path   : PathProvider[P]
    force_file   : bool
    cls_         : type
    depends_on   : HashFunction[P]
    logger       : logging.Logger
    chunk_by     : int
    synthetic_key: str | None
    backend      : Backend | None

    def composite_hash(self, *args, **kwargs) -> dict[str, Any]:
        """
        Compute the dict which (json-serialized) acts as the cache key:
        cachew version + schema of the cached type + user-supplied dependency hash
        (+ synthetic key name/value if configured).
        """
        fsig = inspect.signature(self.func)
        # defaults wouldn't be passed in kwargs, but they can be an implicit dependency (especially inbetween program runs)
        defaults = {
            k: v.default
            for k, v in fsig.parameters.items()
            if v.default is not inspect.Parameter.empty
        }
        # but only pass default if the user wants it in the hash function?
        hsig = inspect.signature(self.depends_on)
        defaults = {
            k: v
            for k, v in defaults.items()
            if k in hsig.parameters or 'kwargs' in hsig.parameters
        }
        kwargs = {**defaults, **kwargs}
        schema = str(self.cls_)
        hash_parts = {
            'cachew'      : CACHEW_VERSION,
            'schema'      : schema,
            _DEPENDENCIES : str(self.depends_on(*args, **kwargs)),
        }
        synthetic_key = self.synthetic_key
        if synthetic_key is not None:
            hash_parts[_SYNTHETIC_KEY      ] = synthetic_key
            hash_parts[_SYNTHETIC_KEY_VALUE] = kwargs[synthetic_key]
            # FIXME assert it's in kwargs in the first place?
            # FIXME support positional args too? maybe extract the name from signature somehow? dunno
            # need to test it
        return hash_parts
    # fmt: on
def cachew_wrapper[**P](
    *args,
    _cachew_context: Context[P],
    **kwargs,
):
    """
    The actual caching workhorse (a generator).

    Yields items from the cache when the stored hash matches the freshly computed one;
    otherwise runs the wrapped function, yielding its items while writing them to the backend.
    On any cachew-internal error it falls back to calling the wrapped function directly.
    """
    C = _cachew_context
    # fmt: off
    func          = C.func
    cache_path    = C.cache_path
    force_file    = C.force_file
    cls           = C.cls_
    logger        = C.logger
    chunk_by      = C.chunk_by
    synthetic_key = C.synthetic_key
    backend_name  = C.backend
    # fmt: on

    used_backend = backend_name or settings.DEFAULT_BACKEND

    func_name = callable_name(func)

    if not settings.ENABLE:
        logger.debug('cache explicitly disabled (settings.ENABLE is False)')
        yield from func(*args, **kwargs)
        return

    mod_name = callable_module_name(func)
    if mod_name is not None and _module_is_disabled(mod_name, logger):
        yield from func(*args, **kwargs)
        return

    def get_db_path() -> Path | None:
        # resolve cache_path (possibly a callable) to a concrete file path,
        # or None if caching is disabled for this particular call
        db_path: Path
        if callable(cache_path):
            pp = cache_path(*args, **kwargs)
            if pp is None:
                logger.debug('cache explicitly disabled (cache_path is None)')
                # early return, in this case we just yield the original items from the function
                return None
            else:
                db_path = Path(pp)
        else:
            db_path = Path(cache_path)

        db_path.parent.mkdir(parents=True, exist_ok=True)

        # need to be atomic here, hence calling stat() once and then just using the results
        try:
            # note: stat follows symlinks (which is what we want)
            st = db_path.stat()
        except FileNotFoundError:
            # doesn't exist. then it's controlled by force_file
            if force_file:
                # just use db_path as is
                pass
            else:
                db_path.mkdir(parents=True, exist_ok=True)
                db_path = db_path / func_name
        else:
            # already exists, so just use callable name if it's a dir
            if stat.S_ISDIR(st.st_mode):
                db_path = db_path / func_name

        logger.debug(f'using {used_backend}:{db_path} for cache')
        return db_path

    def try_use_synthetic_key() -> None:
        # if the old and new hashes differ only in the synthetic key values (and dependencies),
        # reuse the already cached items as a 'prefix' and ask func only for the missing keys
        if synthetic_key is None:
            return
        # attempt to use existing cache if possible, as a 'prefix'
        old_hash_d: dict[str, Any] = {}
        if old_hash is not None:
            try:
                old_hash_d = json.loads(old_hash)
            except json.JSONDecodeError:
                # possible if we used old cachew version (<=0.8.1), hash wasn't json
                pass
        hash_diffs = {
            k: new_hash_d.get(k) == old_hash_d.get(k)
            for k in (*new_hash_d.keys(), *old_hash_d.keys())
            # the only 'allowed' differences for hash, otherwise need to recompute (e.g. if schema changed)
            if k not in {_SYNTHETIC_KEY_VALUE, _DEPENDENCIES}
        }
        cache_compatible = all(hash_diffs.values())
        if not cache_compatible:
            return

        def missing_keys(cached: list[str], wanted: list[str]) -> list[str] | None:
            # FIXME assert both cached and wanted are sorted? since we rely on it
            # if not, then the user could use some custom key for caching (e.g. normalise filenames etc)
            # although in this case passing it into the function wouldn't make sense?
            if len(cached) == 0:
                # no point trying to reuse anything, cache should be empty?
                return None
            if len(wanted) == 0:
                # similar, no way to reuse cache
                return None
            if cached[0] != wanted[0]:
                # there is no common prefix, so no way to reuse cache really
                return None
            last_cached = cached[-1]
            # ok, now actually figure out which items are missing
            for i, k in enumerate(wanted):
                if k > last_cached:
                    # ok, rest of items are missing
                    return wanted[i:]
            # otherwise too many things are cached, and we seem to wante less
            return None

        new_values: list[str] = new_hash_d[_SYNTHETIC_KEY_VALUE]
        old_values: list[str] = old_hash_d[_SYNTHETIC_KEY_VALUE]
        missing = missing_keys(cached=old_values, wanted=new_values)
        if missing is not None:
            # can reuse cache
            kwargs[_CACHEW_CACHED] = cached_items()
            kwargs[synthetic_key] = missing

    early_exit = False

    def written_to_cache():
        # run the wrapped function, yielding its items while also writing them (in chunks) to the backend
        nonlocal early_exit

        datas = func(*args, **kwargs)

        if isinstance(backend, FileBackend):
            # FIXME uhhh.. this is a bit crap
            # but in sqlite mode we don't want to publish new hash before we write new items
            # maybe should use tmp table for hashes as well?
            backend.write_new_hash(new_hash)
        else:
            # happens later for sqlite
            pass

        flush_blobs = backend.flush_blobs

        chunk: list[Any] = []

        def flush() -> None:
            nonlocal chunk
            if len(chunk) > 0:
                flush_blobs(chunk=chunk)
                chunk = []

        total_objects = 0
        for obj in datas:
            try:
                total_objects += 1
                yield obj
            except GeneratorExit:
                # consumer stopped iterating early -- don't finalize the partially written cache
                early_exit = True
                return

            dct = marshall.dump(obj)
            blob = orjson_dumps(dct)
            chunk.append(blob)
            if len(chunk) >= chunk_by:
                flush()
        flush()

        backend.finalize(new_hash)
        logger.info(f'wrote {total_objects} objects to cachew ({used_backend}:{db_path})')

    def cached_items():
        # stream deserialized objects back from the backend
        total_cached = backend.cached_blobs_total()
        total_cached_s = '' if total_cached is None else f'{total_cached} '
        logger.info(f'loading {total_cached_s}objects from cachew ({used_backend}:{db_path})')

        for blob in backend.cached_blobs():
            j = orjson_loads(blob)
            obj = marshall.load(j)
            yield obj

    # NOTE: annoyingly huge try/catch ahead...
    # but it lets us save a function call, hence a stack frame
    # see test_recursive*
    try:
        db_path = get_db_path()
        if db_path is None:
            yield from func(*args, **kwargs)
            return

        BackendCls = BACKENDS[used_backend]

        new_hash_d = C.composite_hash(*args, **kwargs)
        new_hash: SourceHash = json.dumps(new_hash_d)
        logger.debug(f'new hash: {new_hash}')

        marshall: CachewMarshall[Any] = CachewMarshall(Type_=cls)

        with BackendCls(cache_path=db_path, logger=logger) as backend:
            old_hash = backend.get_old_hash()
            logger.debug(f'old hash: {old_hash}')

            if new_hash == old_hash:
                logger.debug('hash matched: loading from cache')
                yield from cached_items()
                return

            logger.debug('hash mismatch: computing data and writing to db')

            try_use_synthetic_key()

            got_write = backend.get_exclusive_write()
            if not got_write:
                # NOTE: this is the bit we really have to watch out for and not put in a helper function
                # otherwise it's causing an extra stack frame on every call
                # the rest (reading from cachew or writing to cachew) happens once per function call? so not a huge deal
                yield from func(*args, **kwargs)
                return

            # at this point we're guaranteed to have an exclusive write transaction
            yield from written_to_cache()
    except Exception as e:
        # sigh... see test_early_exit_shutdown...
        if early_exit and 'Cannot operate on a closed database' in str(e):
            return

        # todo hmm, kinda annoying that it tries calling the function twice?
        # but gonna require some sophisticated cooperation with the cached wrapper otherwise
        cachew_error(e, logger=logger)
        yield from func(*args, **kwargs)
__all__ = [
'CachewException',
'HashFunction',
'SourceHash',
'cachew',
'get_logger',
]
================================================
FILE: src/cachew/backend/common.py
================================================
import logging
from abc import abstractmethod
from collections.abc import Iterator, Sequence
from pathlib import Path
from ..common import SourceHash
class AbstractBackend:
    """
    Interface for cache storage backends.

    Lifecycle as driven by cachew_wrapper:
    __enter__ -> get_old_hash -> (on hash match) cached_blobs,
    or (on mismatch) get_exclusive_write -> [write_new_hash] -> flush_blobs* -> finalize -> __exit__.
    """

    # NOTE(review): this class doesn't inherit abc.ABC, so @abstractmethod isn't actually
    # enforced at instantiation time -- confirm whether that's deliberate
    @abstractmethod
    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
        raise NotImplementedError

    @abstractmethod
    def __enter__(self):
        raise NotImplementedError

    def __exit__(self, *args) -> None:
        raise NotImplementedError

    def get_old_hash(self) -> SourceHash | None:
        # hash stored during the previous run, or None if there is no usable cache yet
        raise NotImplementedError

    def cached_blobs_total(self) -> int | None:
        # number of cached blobs, or None if the backend can't tell cheaply
        raise NotImplementedError

    def cached_blobs(self) -> Iterator[bytes]:
        # serialized cached objects, one blob per object
        raise NotImplementedError

    def get_exclusive_write(self) -> bool:
        '''
        Returns whether it actually managed to get it
        '''
        raise NotImplementedError

    def write_new_hash(self, new_hash: SourceHash) -> None:
        # publish the new hash before data is written (used by FileBackend, see cachew_wrapper)
        raise NotImplementedError

    def flush_blobs(self, chunk: Sequence[bytes]) -> None:
        # append a chunk of serialized objects to the (temporary) cache storage
        raise NotImplementedError

    def finalize(self, new_hash: SourceHash) -> None:
        # promote the temporary storage to be the actual cache
        raise NotImplementedError
================================================
FILE: src/cachew/backend/file.py
================================================
import logging
from collections.abc import Iterator, Sequence
from pathlib import Path
from typing import (
BinaryIO,
)
from ..common import SourceHash
from .common import AbstractBackend
class FileBackend(AbstractBackend):
    """
    Cache backend storing data as a plain jsonl file: the first line is the hash,
    each following line is one serialized cached object.
    New data is written to '<cache>.tmp' which replaces the cache file on finalize().
    """

    jsonl: Path  # the actual cache file
    jsonl_tmp: Path  # temporary file new data is written to until finalize()
    jsonl_fr: BinaryIO | None  # read handle for the existing cache (None if it doesn't exist)
    jsonl_tmp_fw: BinaryIO | None  # write handle for the tmp file (None until get_exclusive_write succeeds)

    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
        self.logger = logger
        self.jsonl = cache_path
        self.jsonl_tmp = Path(str(self.jsonl) + '.tmp')
        self.jsonl_fr = None
        self.jsonl_tmp_fw = None

    def __enter__(self) -> 'FileBackend':
        try:
            self.jsonl_fr = self.jsonl.open('rb')
        except FileNotFoundError:
            # no cache file yet -- get_old_hash will return None and everything gets recomputed
            self.jsonl_fr = None
        return self

    def __exit__(self, *args) -> None:
        if self.jsonl_tmp_fw is not None:
            # might still exist in case of early exit
            self.jsonl_tmp.unlink(missing_ok=True)
            # NOTE: need to unlink first
            # otherwise possible that someone else might open the file before we unlink it
            self.jsonl_tmp_fw.close()
        if self.jsonl_fr is not None:
            self.jsonl_fr.close()

    def get_old_hash(self) -> SourceHash | None:
        # the hash is always the first line of the jsonl file
        if self.jsonl_fr is None:
            return None
        hash_line = self.jsonl_fr.readline().rstrip(b'\n')
        return hash_line.decode('utf8')

    def cached_blobs_total(self) -> int | None:
        # not really sure how to support that for a plaintext file?
        # could wc -l but it might be costly..
        return None

    def cached_blobs(self) -> Iterator[bytes]:
        assert self.jsonl_fr is not None  # should be guaranteed by get_old_hash
        # at this point the file position is just past the hash line (consumed by get_old_hash)
        yield from self.jsonl_fr  # yields line by line

    def get_exclusive_write(self) -> bool:
        # NOTE: opening in x (exclusive write) mode just in case, so it throws if file exists
        try:
            self.jsonl_tmp_fw = self.jsonl_tmp.open('xb')
        except FileExistsError:
            # another writer is active -- caller falls back to calling the function directly
            self.jsonl_tmp_fw = None
            return False
        else:
            return True

    def write_new_hash(self, new_hash: SourceHash) -> None:
        assert self.jsonl_tmp_fw is not None
        self.jsonl_tmp_fw.write(new_hash.encode('utf8') + b'\n')

    def flush_blobs(self, chunk: Sequence[bytes]) -> None:
        fw = self.jsonl_tmp_fw
        assert fw is not None
        for blob in chunk:
            fw.write(blob)
            fw.write(b'\n')

    def finalize(self, new_hash: SourceHash) -> None:  # noqa: ARG002
        # TODO defensive??
        # rename publishes the fully written tmp file as the new cache
        self.jsonl_tmp.rename(self.jsonl)
================================================
FILE: src/cachew/backend/sqlite.py
================================================
import logging
import sqlite3
import time
import warnings
from collections.abc import Iterator, Sequence
from pathlib import Path
import sqlalchemy
import sqlalchemy.exc
from sqlalchemy import Column, Table, event, text
from sqlalchemy.dialects import sqlite
from ..common import SourceHash
from .common import AbstractBackend
class SqliteBackend(AbstractBackend):
    """
    Cache backend storing blobs in a sqlite database (accessed via sqlalchemy).

    Layout: a single-row 'hash' table plus a 'cache' table with one BLOB column;
    new data goes into 'cache_tmp' which is renamed over 'cache' in finalize().
    """

    def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
        self.logger = logger
        self.engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}', connect_args={'timeout': 0})
        # NOTE: timeout is necessary so we don't lose time waiting during recursive calls
        # by default, it's several seconds? you'd see 'test_recursive' test performance degrade

        @event.listens_for(self.engine, 'connect')
        def set_sqlite_pragma(dbapi_connection, connection_record):  # noqa: ARG001
            # without wal, concurrent reading/writing is not gonna work
            # ugh. that's odd, how are we supposed to set WAL if the very fact of setting wal might lock the db?
            while True:
                try:
                    dbapi_connection.execute('PRAGMA journal_mode=WAL')
                    break
                except sqlite3.OperationalError as oe:
                    if 'database is locked' not in str(oe):
                        # ugh, pretty annoying that exception doesn't include database path for some reason
                        raise RuntimeError(f'Error while setting WAL on {cache_path}') from oe
                    # locked by a concurrent connection -- retry until it lets go
                    time.sleep(0.1)

        self.connection = self.engine.connect()

        """
        Erm... this is pretty confusing.
        https://docs.sqlalchemy.org/en/13/dialects/sqlite.html#transaction-isolation-level
        Somehow without this thing sqlalchemy logs BEGIN (implicit) instead of BEGIN TRANSACTION which actually works in sqlite...
        Judging by sqlalchemy/dialects/sqlite/base.py, looks like some sort of python sqlite driver problem??
        test_transaction should check this behaviour
        """

        @event.listens_for(self.connection, 'begin')
        def do_begin(conn):
            # NOTE there is also BEGIN CONCURRENT in newer versions of sqlite. could use it later?
            conn.execute(text('BEGIN DEFERRED'))

        self.meta = sqlalchemy.MetaData()
        self.table_hash = Table('hash', self.meta, Column('value', sqlalchemy.String))

        # fmt: off
        # actual cache
        self.table_cache = Table('cache' , self.meta, Column('data', sqlalchemy.BLOB))
        # temporary table, we use it to insert and then (atomically?) rename to the above table at the very end
        self.table_cache_tmp = Table('cache_tmp', self.meta, Column('data', sqlalchemy.BLOB))
        # fmt: on

    def __enter__(self) -> 'SqliteBackend':
        # NOTE: deferred transaction
        self.transaction = self.connection.begin()
        # FIXME this is a bit crap.. is there a nicer way to use another ctx manager here?
        self.transaction.__enter__()
        return self

    def __exit__(self, *args) -> None:
        self.transaction.__exit__(*args)
        self.connection.close()
        self.engine.dispose()

    def get_old_hash(self) -> SourceHash | None:
        # first, try to do as much as possible read-only, benefiting from deferred transaction
        old_hashes: Sequence
        try:
            # not sure if there is a better way...
            cursor = self.connection.execute(self.table_hash.select())
        except sqlalchemy.exc.OperationalError as e:
            # meh. not sure if this is a good way to handle this..
            if 'no such table: hash' in str(e):
                # fresh database -- no cache yet
                old_hashes = []
            else:
                raise e
        else:
            old_hashes = cursor.fetchall()

        assert len(old_hashes) <= 1, old_hashes  # shouldn't happen

        old_hash: SourceHash | None
        if len(old_hashes) == 0:
            old_hash = None
        else:
            old_hash = old_hashes[0][0]  # returns a tuple...
        return old_hash

    def cached_blobs_total(self) -> int | None:
        # cheap COUNT(*) over the cache table, used only for the progress log message
        [(total,)] = self.connection.execute(sqlalchemy.select(sqlalchemy.func.count()).select_from(self.table_cache))
        return total

    def cached_blobs(self) -> Iterator[bytes]:
        rows = self.connection.execute(self.table_cache.select())
        # by default, sqlalchemy wraps all results into Row object
        # this can cause quite a lot of overhead if you're reading many rows
        # it seems that in principle, sqlalchemy supports just returning bare underlying tuple from the dbapi
        # but from browsing the code it doesn't seem like this functionality exposed
        # if you're looking for cues, see
        # - ._source_supports_scalars
        # - ._generate_rows
        # - ._row_getter
        # by using this raw iterator we speed up reading the cache quite a bit
        # asked here https://github.com/sqlalchemy/sqlalchemy/discussions/10350
        raw_row_iterator = getattr(rows, '_raw_row_iterator', None)
        if raw_row_iterator is None:
            warnings.warn(
                "CursorResult._raw_row_iterator method isn't found. This could lead to degraded cache reading performance.",
                stacklevel=2,
            )
            row_iterator = rows
        else:
            row_iterator = raw_row_iterator()

        for (blob,) in row_iterator:
            yield blob

    def get_exclusive_write(self) -> bool:
        # NOTE on recursive calls
        # somewhat magically, they should work as expected with no extra database inserts?
        # the top level call 'wins' the write transaction and once it's gathered all data, will write it
        # the 'intermediate' level calls fail to get it and will pass data through
        # the cached 'bottom' level is read only and will be yielded without a write transaction
        try:
            # first 'write' statement will upgrade transaction to write transaction which might fail due to concurrency
            # see https://www.sqlite.org/lang_transaction.html
            # NOTE: because of 'checkfirst=True', only the last .create will guarantee the transaction upgrade to write transaction
            self.table_hash.create(self.connection, checkfirst=True)

            # 'table' used to be old 'cache' table name, so we just delete it regardless
            # otherwise it might overinfalte the cache db with stale values
            self.connection.execute(text('DROP TABLE IF EXISTS `table`'))

            # NOTE: we have to use .drop and then .create (e.g. instead of some sort of replace)
            # since it's possible to have schema changes inbetween calls
            # checkfirst=True because it might be the first time we're using cache
            self.table_cache_tmp.drop(self.connection, checkfirst=True)
            self.table_cache_tmp.create(self.connection)
        except sqlalchemy.exc.OperationalError as e:
            if e.code == 'e3q8' and 'database is locked' in str(e):
                # someone else must be have won the write lock
                # not much we can do here
                # NOTE: important to close early, otherwise we might hold onto too many file descriptors during yielding
                # see test_recursive_deep
                # (normally connection is closed in SqliteBackend.__exit__)
                self.connection.close()
                # in this case all the callee can do is just to call the actual function
                return False
            else:
                raise e
        return True

    def flush_blobs(self, chunk: Sequence[bytes]) -> None:
        # uhh. this gives a huge speedup for inserting
        # since we don't have to create intermediate dictionaries
        # TODO move this to __init__?
        insert_into_table_cache_tmp_raw = str(
            self.table_cache_tmp.insert().compile(dialect=sqlite.dialect(paramstyle='qmark'))
        )
        # I also tried setting paramstyle='qmark' in create_engine, but it seems to be ignored :(
        # idk what benefit sqlalchemy gives at this point, seems to just complicate things
        self.connection.exec_driver_sql(insert_into_table_cache_tmp_raw, [(c,) for c in chunk])

    def finalize(self, new_hash: SourceHash) -> None:
        # delete hash first, so if we are interrupted somewhere, it mismatches next time and everything is recomputed
        self.connection.execute(self.table_hash.delete())

        # checkfirst is necessary since it might not have existed in the first place
        # e.g. first time we use cache
        self.table_cache.drop(self.connection, checkfirst=True)

        # meh https://docs.sqlalchemy.org/en/14/faq/metadata_schema.html#does-sqlalchemy-support-alter-table-create-view-create-trigger-schema-upgrade-functionality
        # also seems like sqlalchemy doesn't have any primitives to escape table names.. sigh
        self.connection.execute(text(f"ALTER TABLE `{self.table_cache_tmp.name}` RENAME TO `{self.table_cache.name}`"))
        self.connection.execute(self.table_hash.insert().values([{'value': new_hash}]))
================================================
FILE: src/cachew/common.py
================================================
from dataclasses import dataclass
# TODO better name to represent what it means?
# Opaque string fingerprint of the cached function's inputs; presumably a
# mismatch against the stored value invalidates the cache -- see backend usage.
type SourceHash = str
class CachewException(RuntimeError):
    """Base class for all errors raised by cachew."""
@dataclass
class TypeNotSupported(CachewException):
    """Raised when cachew can't build a serialization schema for a type."""

    type_: type  # the offending type
    reason: str  # human-readable explanation

    def __str__(self) -> str:
        features_url = 'https://github.com/karlicoss/cachew#features'
        return f"{self.type_} isn't supported by cachew: {self.reason}. See {features_url} for the list of supported types."
================================================
FILE: src/cachew/compat.py
================================================
import sys
if sys.version_info[:2] >= (3, 13):
    # warnings.deprecated is in the stdlib from python 3.13 (PEP 702)
    from warnings import deprecated
else:
    # fall back to the typing_extensions backport on older pythons
    from typing_extensions import deprecated

__all__ = ["deprecated"]
================================================
FILE: src/cachew/experimental.py
================================================
from typing import TYPE_CHECKING

if not TYPE_CHECKING:
    # NOTE(review): runtime-only import -- presumably so type checkers don't see
    # the decorator and flag call sites of these deprecated no-ops; confirm intent
    from .compat import deprecated


@deprecated("Exceptions are not an experimental feature anymore and enabled by default.")
def enable_exceptions() -> None:
    # kept for backwards compatibility; exception support is always on now
    pass


@deprecated("Exceptions are not an experimental feature anymore and enabled by default.")
def disable_exceptions() -> None:
    # kept for backwards compatibility; exception support is always on now
    pass
================================================
FILE: src/cachew/extra.py
================================================
# todo Ideally, needs doublewraps as well? also typing helpers
def mcachew(*args, **kwargs):
    """
    Stands for 'Maybe cachew'.

    Defensive wrapper around @cachew that makes it an optional dependency:
    if the library isn't installed, decorated functions simply run uncached.
    """
    try:
        import cachew
    except ModuleNotFoundError:
        import warnings

        warnings.warn(
            'cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew',
            stacklevel=2,
        )

        # no-op decorator: hand the function back unchanged
        def passthrough(orig_func):
            return orig_func

        return passthrough
    return cachew.cachew(*args, **kwargs)
from contextlib import contextmanager
@contextmanager
def disabled_cachew():
    """Temporarily turn cachew off (settings.ENABLE) for the duration of the block."""
    from . import settings

    saved = settings.ENABLE
    settings.ENABLE = False
    try:
        yield
    finally:
        # always restore the previous value, even if the body raised
        settings.ENABLE = saved
================================================
FILE: src/cachew/legacy.py
================================================
import typing
import warnings
from collections.abc import Iterable, Iterator, Sequence
from dataclasses import dataclass
from datetime import date, datetime
from itertools import chain, islice
from pathlib import Path
from typing import (
Any,
Generic,
NamedTuple,
Optional,
TypeVar,
Union,
)
import sqlalchemy
from sqlalchemy import Column
from .pytest import parametrize
from .common import CachewException
def get_union_args(cls) -> Optional[tuple[type, ...]]:
    """
    Return the non-None arguments of a typing.Union, or None if cls isn't one.

    NOTE: only recognises old-style typing.Union -- presumably PEP 604 `X | Y`
    has a different __origin__; confirm if that matters for legacy callers.
    """
    if getattr(cls, '__origin__', None) != Union:
        return None

    args = cls.__args__
    # NoneType is stripped, so Optional[X] yields just (X,)
    args = tuple(e for e in args if e is not type(None))
    assert len(args) > 0
    return args  # ty: ignore[invalid-return-type]
def is_union(cls) -> bool:
    """Whether cls is a typing.Union (as recognised by get_union_args)."""
    return get_union_args(cls) is not None
# Types/Values mirror each other: the type side and the value side of what
# the legacy binder can store in a sqlite column (checked by test_mypy_annotations)
Types = Union[
    type[str],
    type[int],
    type[float],
    type[bool],
    type[datetime],
    type[date],
    type[dict],
    type[list],
    type[Exception],
    type[NamedTuple],
]

Values = Union[
    str,
    int,
    float,
    bool,
    datetime,
    date,
    dict,
    list,
    Exception,
    NamedTuple,
]

# NamedTuple is deliberately absent: it's handled structurally (flattened), not as a primitive
PRIMITIVE_TYPES = {
    str,
    int,
    float,
    bool,
    datetime,
    date,
    dict,
    list,
    Exception,
}
def is_primitive(cls: type) -> bool:
    """
    Whether cls maps directly onto a single sqlite column (see PRIMITIVES).

    >>> from typing import Dict, Any
    >>> is_primitive(int)
    True
    >>> is_primitive(set)
    False
    >>> is_primitive(dict)
    True
    """
    return cls in PRIMITIVE_TYPES
class IsoDateTime(sqlalchemy.TypeDecorator):
    '''
    Stores datetimes as ISO strings, with the pytz zone name appended
    (space-separated) when the tzinfo is a pytz timezone.
    '''

    # in theory could use something more effecient? e.g. blob for encoded datetime and tz?
    # but practically, the difference seems to be pretty small, so perhaps fine for now
    impl = sqlalchemy.String

    cache_ok = True

    @property
    def python_type(self):
        return datetime

    def process_literal_param(self, value, dialect):
        raise NotImplementedError()  # make pylint happy

    def process_bind_param(self, value: Optional[datetime], dialect) -> Optional[str]:  # noqa: ARG002
        if value is None:
            return None
        # ok, it's a bit hacky... attempt to preserve pytz infromation
        iso = value.isoformat()
        tz = getattr(value, 'tzinfo', None)
        if tz is None:
            return iso
        try:
            import pytz
        except ImportError:
            self.warn_pytz()
            return iso
        else:
            if isinstance(tz, pytz.BaseTzInfo):
                zone = tz.zone
                # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
                assert zone is not None, tz
                return iso + ' ' + zone
            else:
                # non-pytz tzinfo: the offset is already embedded in the iso string
                return iso

    def process_result_value(self, value: Optional[str], dialect) -> Optional[datetime]:  # noqa: ARG002
        if value is None:
            return None
        spl = value.split(' ')
        dt = datetime.fromisoformat(spl[0])
        if len(spl) <= 1:
            # no zone suffix was stored
            return dt
        zone = spl[1]
        # else attempt to decypher pytz tzinfo
        try:
            import pytz
        except ImportError:
            self.warn_pytz()
            return dt
        else:
            tz = pytz.timezone(zone)
            return dt.astimezone(tz)

    def warn_pytz(self) -> None:
        warnings.warn('install pytz for better timezone support while serializing with cachew', stacklevel=2)
# a bit hacky, but works... reuses the datetime machinery and truncates on the way out
class IsoDate(IsoDateTime):
    impl = sqlalchemy.String

    cache_ok = True

    @property
    def python_type(self):
        return date

    def process_literal_param(self, value, dialect):
        raise NotImplementedError()  # make pylint happy

    def process_result_value(self, value: Optional[str], dialect) -> Optional[date]:  # type: ignore[override]
        res = super().process_result_value(value, dialect)
        if res is None:
            return None
        return res.date()
# JSON-representable primitive types: exception args of these types survive round-trip as-is
jtypes = (int, float, bool, type(None))
class ExceptionAdapter(sqlalchemy.TypeDecorator):
    '''
    Enables support for caching Exceptions. Exception is treated as JSON and serialized.

    It's useful for defensive error handling, in case of cachew in particular for preserving error state.

    I elaborate on it here: [mypy-driven error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss).
    '''

    impl = sqlalchemy.JSON

    cache_ok = True

    @property
    def python_type(self):
        return Exception

    def process_literal_param(self, value, dialect):
        raise NotImplementedError()  # make pylint happy

    def process_bind_param(self, value: Optional[Exception], dialect) -> Optional[list[Any]]:  # noqa: ARG002
        if value is None:
            return None
        # serialize exception args to a JSON-safe list (lossy for rich types)
        sargs: list[Any] = []
        for a in value.args:
            if any(isinstance(a, t) for t in jtypes):
                sargs.append(a)
            elif isinstance(a, date):
                sargs.append(a.isoformat())
            else:
                sargs.append(str(a))
        return sargs

    def process_result_value(self, value: Optional[list[Any]], dialect) -> Optional[Exception]:  # noqa: ARG002
        # value is the JSON-decoded list produced by process_bind_param above
        if value is None:
            return None
        # sadly, can't do much to convert back from the strings? Unless I serialize the type info as well?
        return Exception(*value)
# maps each supported primitive python type to its sqlalchemy column type
# fmt: off
PRIMITIVES = {
    str      : sqlalchemy.String,
    int      : sqlalchemy.Integer,
    float    : sqlalchemy.Float,
    bool     : sqlalchemy.Boolean,
    datetime : IsoDateTime,
    date     : IsoDate,
    dict     : sqlalchemy.JSON,
    list     : sqlalchemy.JSON,
    Exception: ExceptionAdapter,
}
# fmt: on

# keep the column mapping and the is_primitive() check in sync
assert set(PRIMITIVES.keys()) == PRIMITIVE_TYPES
def strip_optional(cls) -> tuple[type, bool]:
    """
    Split off Optional-ness: returns (underlying type, was_optional).

    >>> from typing import Optional, NamedTuple
    >>> strip_optional(Optional[int]) == (int, True)
    True
    >>> class X(NamedTuple):
    ...     x: int
    >>> strip_optional(X) == (X, False)
    True
    """
    is_opt: bool = False

    args = get_union_args(cls)
    if args is not None and len(args) == 1:
        # Optional[X] is Union[X, None]; get_union_args already dropped NoneType
        cls = args[0]  # meh
        is_opt = True

    return (cls, is_opt)
def strip_generic(tp):
    """
    Erase a generic alias down to its runtime origin; non-generics pass through.

    >>> from typing import List
    >>> strip_generic(List[int]) is list
    True
    >>> strip_generic(str) is str
    True
    """
    # typing._GenericAlias is private; fetched via getattr so linters don't complain
    alias_type = getattr(typing, '_GenericAlias')
    return tp.__origin__ if isinstance(tp, alias_type) else tp
# type var for the (named)tuple type a binder handles
NT = TypeVar('NT')
# sadly, bound=NamedTuple is not working yet in mypy
# https://github.com/python/mypy/issues/685
# also needs to support dataclasses?
@dataclass
class NTBinder(Generic[NT]):
    """
    Helper class for interacting with the sqlite database: maps (possibly nested)
    NamedTuples onto flat rows and back.

    >>> class Job(NamedTuple):
    ...    company: str
    ...    title: Optional[str]
    >>> class Person(NamedTuple):
    ...     name: str
    ...     age: int
    ...     job: Optional[Job]

    The hierarchy is flattened into one row; a nested Optional gets an extra
    '_is_null' marker column:

    >>> binder = NTBinder.make(Person)
    >>> [c.name for c in binder.columns]
    ['name', 'age', '_job_is_null', 'job_company', 'job_title']

    >>> person = Person(name='alan', age=40, job=None)

    to_row converts object to a sql-friendly tuple. job=None, so we end up with True in _job_is_null field
    >>> tuple(binder.to_row(person))
    ('alan', 40, True, None, None)

    from_row does reverse conversion
    >>> binder.from_row(('alan', 40, True, None, None))
    Person(name='alan', age=40, job=None)

    >>> binder.from_row(('ann', 25, True, None, None, 'extra'))
    Traceback (most recent call last):
    ...
    cachew.common.CachewException: unconsumed items in iterator ['extra']
    """

    name: Optional[str]  # None means toplevel
    type_: Types
    span: int  # number of columns this node occupies; not sure if span should include optional col?
    primitive: bool
    optional: bool
    union: Optional[type]  # helper, which isn't None if type is Union
    fields: Sequence[Any]  # child binders; mypy can't handle cyclic definition at this point :(

    @staticmethod
    def make(tp: type[NT], name: Optional[str] = None) -> 'NTBinder[NT]':
        # recursively build the binder tree for tp
        tp, optional = strip_optional(tp)  # ty: ignore[invalid-assignment]
        union: Optional[type]
        fields: tuple[Any, ...]
        primitive: bool

        union_args = get_union_args(tp)
        if union_args is not None:
            # a Union is represented as a synthetic NamedTuple with one Optional
            # field per variant; exactly one of them is populated at a time
            CachewUnion = NamedTuple('_CachewUnionRepr', [(x.__name__, Optional[x]) for x in union_args])  # type: ignore[misc]
            union = CachewUnion
            primitive = False
            fields = (NTBinder.make(tp=CachewUnion, name='_cachew_union_repr'),)
            span = 1
        else:
            union = None
            tp = strip_generic(tp)
            primitive = is_primitive(tp)

            if primitive:
                if name is None:
                    name = '_cachew_primitive'  # meh. presumably, top level
            if primitive:
                fields = ()
                span = 1
            else:
                annotations = typing.get_type_hints(tp)
                if annotations == {}:
                    raise CachewException(
                        f"{tp} (field '{name}'): doesn't look like a supported type to cache. See https://github.com/karlicoss/cachew#features for the list of supported types."
                    )
                fields = tuple(NTBinder.make(tp=ann, name=fname) for fname, ann in annotations.items())
                # optional nodes get one extra '_is_null' column on top of the children
                span = sum(f.span for f in fields) + (1 if optional else 0)
        return NTBinder(
            name=name,
            type_=tp,  # type: ignore[arg-type]
            span=span,
            primitive=primitive,
            optional=optional,
            union=union,
            fields=fields,
        )

    @property
    def columns(self) -> list[Column]:
        return list(self.iter_columns())

    # TODO not necessarily namedtuple? could be primitive type
    def to_row(self, obj: NT) -> tuple[Optional[Values], ...]:
        return tuple(self._to_row(obj))

    def from_row(self, row: Iterable[Any]) -> NT:
        riter = iter(row)
        res = self._from_row(riter)
        # the row must have been consumed exactly; peek one item to verify
        remaining = list(islice(riter, 0, 1))
        if len(remaining) != 0:
            raise CachewException(f'unconsumed items in iterator {remaining}')
        assert res is not None  # nosec # help mypy; top level will not be None
        return res

    def _to_row(self, obj) -> Iterator[Optional[Values]]:
        if self.primitive:
            yield obj
        elif self.union is not None:
            CachewUnion = self.union
            (uf,) = self.fields
            # TODO assert only one of them matches??
            union = CachewUnion(**{f.name: obj if isinstance(obj, f.type_) else None for f in uf.fields})
            yield from uf._to_row(union)
        else:
            if self.optional:
                is_none = obj is None
                yield is_none
            else:
                is_none = False
                assert obj is not None  # TODO hmm, that last assert is not very symmetric...

            if is_none:
                # pad with NULLs so the row stays fixed-width
                for _ in range(self.span - 1):
                    yield None
            else:
                yield from chain.from_iterable(f._to_row(getattr(obj, f.name)) for f in self.fields)

    def _from_row(self, row_iter):
        if self.primitive:
            return next(row_iter)
        elif self.union is not None:
            CachewUnion = self.union  # noqa: F841
            (uf,) = self.fields
            # TODO assert only one of them is not None?
            union_params = [r for r in uf._from_row(row_iter) if r is not None]
            assert len(union_params) == 1, union_params
            return union_params[0]
        else:
            if self.optional:
                is_none = next(row_iter)
            else:
                is_none = False

            if is_none:
                # consume (and sanity-check) the NULL padding emitted by _to_row
                for _ in range(self.span - 1):
                    x = next(row_iter)
                    assert x is None, x  # huh. assert is kinda opposite of producing value
                return None
            else:
                return self.type_(*(f._from_row(row_iter) for f in self.fields))

    # TODO not sure if we want to allow optionals on top level?
    def iter_columns(self) -> Iterator[Column]:
        used_names: set[str] = set()

        def col(name: str, tp) -> Column:
            # disambiguate clashes between nested and flattened names by
            # prepending underscores (see test_unique_columns)
            while name in used_names:
                name = '_' + name
            used_names.add(name)
            return Column(name, tp)

        if self.primitive:
            if self.name is None:
                raise AssertionError
            yield col(self.name, PRIMITIVES[self.type_])
        else:
            prefix = '' if self.name is None else self.name + '_'
            if self.optional:
                yield col(f'_{prefix}is_null', sqlalchemy.Boolean)
            for f in self.fields:
                for c in f.iter_columns():
                    yield col(f'{prefix}{c.name}', c.type)

    def __str__(self):
        # NOTE(review): the trailing f' ' looks truncated (probably meant to
        # include the field type) -- confirm against upstream history
        lines = [' ' * level + str(x.name) + ('?' if x.optional else '') + f' ' for level, x in self.flatten()]
        return '\n'.join(lines)

    def __repr__(self):
        return str(self)

    def flatten(self, level=0):
        # depth-first traversal yielding (nesting level, binder) pairs
        yield (level, self)
        for f in self.fields:
            yield from f.flatten(level=level + 1)
def test_mypy_annotations() -> None:
    """Check that Types and Values stay in sync (has to be dynamic; mypy won't handle it)."""
    collected = []
    for t in Types.__args__:  # type: ignore[attr-defined]
        (arg,) = t.__args__  # each entry is type[X]; unpack X
        collected.append(arg)

    def ordered(ts):
        return sorted(ts, key=lambda t: str(t))

    assert ordered(collected) == ordered(Values.__args__)  # type: ignore[attr-defined]

    for p in PRIMITIVE_TYPES:
        assert p in Values.__args__  # type: ignore[attr-defined]
@parametrize(
    ('tp', 'val'),
    [
        (int, 22),
        (bool, False),
        (Optional[str], 'abacaba'),
        (Union[str, int], 1),
    ],
)
def test_ntbinder_primitive(tp, val) -> None:
    # round-trip a primitive/optional/union value through the binder
    b = NTBinder.make(tp, name='x')
    row = b.to_row(val)
    vv = b.from_row(list(row))
    assert vv == val
def test_unique_columns(tmp_path: Path) -> None:  # noqa: ARG001
    """A flattened nested field (job.title -> job_title) may clash with a real
    field of the same name; the binder disambiguates with a leading underscore."""

    class Job(NamedTuple):
        company: str
        title: Optional[str]

    class Breaky(NamedTuple):
        job_title: int
        job: Optional[Job]

    expected = [
        'job_title',
        '_job_is_null',
        'job_company',
        '_job_title',
    ]
    assert [c.name for c in NTBinder.make(Breaky).columns] == expected
================================================
FILE: src/cachew/logging_helper.py
================================================
from __future__ import annotations
import logging
import os
import warnings
from functools import lru_cache
from typing import TYPE_CHECKING
def test() -> None:
    """Interactive demo contrasting stdlib logging defaults with make_logger (run this file directly)."""
    import sys
    from collections.abc import Callable

    # helper to print narration to stderr, next to the log output
    M: Callable[[str], None] = lambda s: print(s, file=sys.stderr)

    ## prepare exception for later
    try:
        None.whatever  # type: ignore[attr-defined] # noqa: B018
    except Exception as e:
        ex = e
    ##

    M(" Logging module's defaults are not great:")
    l = logging.getLogger('default_logger')
    l.error(
        "For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level"
    )

    M("\n The reason is that you need to remember to call basicConfig() first. Let's do it now:")
    logging.basicConfig()
    l.error(
        "OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number"
    )

    M(
        "\n Also exception logging is kinda lame, doesn't print traceback by default unless you remember to pass exc_info:"
    )
    l.exception(ex)  # type: ignore[possibly-undefined]

    M(
        "\n\n With make_logger you get a reasonable logging format, colours (via colorlog library) and other neat things:"
    )

    ll = make_logger('test')  # No need for basicConfig!
    ll.info("default level is INFO")
    ll.debug("... so this shouldn't be displayed")
    ll.warning("warnings are easy to spot!")

    M("\n Exceptions print traceback by default now:")
    ll.exception(ex)

    M(
        "\n You can (and should) use it via regular logging.getLogger after that, e.g. let's set logging level to DEBUG now"
    )
    logging.getLogger('test').setLevel(logging.DEBUG)
    ll.debug("... now debug messages are also displayed")
DEFAULT_LEVEL = 'INFO'
# {start}/{end} placeholders are filled with colour escapes (or empty strings) later
FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)-4d]{end} %(message)s'
FORMAT_NOCOLOR = FORMAT.format(start='', end='')

Level = int
# a level given as an int, a level name like 'DEBUG', or None for "use the default"
LevelIsh = Level | str | None
def mklevel(level: LevelIsh) -> Level:
    """Normalise a LevelIsh (int / name string / None) to a numeric logging level."""
    if isinstance(level, int):
        return level
    if level is None:
        return logging.NOTSET
    # level name, e.g. 'debug' -> logging.DEBUG
    return getattr(logging, level.upper())
def get_collapse_level() -> Level | None:
    """Collapse threshold from the environment, or None when collapsing is off."""
    # TODO not sure if should be specific to logger name?
    configured = os.environ.get('LOGGING_COLLAPSE', None)
    if configured is not None:
        return mklevel(configured)
    # legacy name, maybe deprecate?
    if os.environ.get('COLLAPSE_DEBUG_LOGS', None) is not None:
        return logging.DEBUG
    return None
def get_env_level(name: str) -> Level | None:
    """Per-logger level override from the environment, or None if not set."""
    PREFIX = 'LOGGING_LEVEL_'  # e.g. LOGGING_LEVEL_my_hypothesis=debug
    # shell doesn't allow using dots in var names without escaping, so also support underscore syntax
    dotted = os.environ.get(PREFIX + name, None)
    underscored = os.environ.get(PREFIX + name.replace('.', '_'), None)
    # NOTE: 'or' (not 'is None') on purpose -- an empty value falls through, matching original behaviour
    lvl = dotted or underscored
    if lvl is None:
        return None
    return mklevel(lvl)
def setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> None:
    """
    Wrapper to simplify logging setup.
    """
    if isinstance(logger, str):
        logger = logging.getLogger(logger)

    # an env-provided level always takes precedence over the argument
    env_level = get_env_level(logger.name)
    if env_level is not None:
        lvl = env_level
    else:
        lvl = mklevel(level if level is not None else DEFAULT_LEVEL)

    # only apply when the logger is still unset -- if the user already picked a
    # level, respect it
    if logger.level == logging.NOTSET:
        logger.setLevel(lvl)

    _setup_handlers_and_formatters(name=logger.name)
# cached since this should only be done once per logger instance
@lru_cache(None)
def _setup_handlers_and_formatters(name: str) -> None:
    """Attach the stream handler (collapsing if configured) and formatter to the logger."""
    logger = logging.getLogger(name)
    logger.addFilter(AddExceptionTraceback())

    collapse_level = get_collapse_level()
    # FIX: previously an extra logging.StreamHandler() was constructed here and
    # immediately overwritten by the line below -- removed as dead code
    ch = logging.StreamHandler() if collapse_level is None else CollapseLogsHandler(maxlevel=collapse_level)

    # default level for handler is NOTSET, which will make it process all messages
    # we rely on the logger to actually accept/reject log msgs
    logger.addHandler(ch)

    # this attribute is set to True by default, which causes log entries to be passed to root logger (e.g. if you call basicConfig beforehand)
    # even if log entry is handled by this logger ... not sure what's the point of this behaviour??
    logger.propagate = False

    try:
        # try colorlog first, so user gets nice colored logs
        import colorlog
    except ModuleNotFoundError:
        warnings.warn("You might want to 'pip install colorlog' for nice colored logs", stacklevel=2)
        formatter = logging.Formatter(FORMAT_NOCOLOR)
    else:
        # log_color/reset are specific to colorlog
        FORMAT_COLOR = FORMAT.format(start='%(log_color)s', end='%(reset)s')
        # colorlog should detect tty in principle, but doesn't handle everything for some reason
        # see https://github.com/borntyping/python-colorlog/issues/71
        if ch.stream.isatty():
            formatter = colorlog.ColoredFormatter(FORMAT_COLOR)
        else:
            formatter = logging.Formatter(FORMAT_NOCOLOR)
    ch.setFormatter(formatter)
# by default, logging.exception isn't logging traceback unless called inside of the exception handler
# which is a bit annoying since we have to pass exc_info explicitly
# also see https://stackoverflow.com/questions/75121925/why-doesnt-python-logging-exception-method-log-traceback-by-default
# todo also amend by post about defensive error handling?
class AddExceptionTraceback(logging.Filter):
    """When an ERROR record's message is itself an exception, attach its traceback as exc_info."""

    def filter(self, record: logging.LogRecord) -> bool:
        if record.levelname != 'ERROR':
            return True
        exc = record.msg
        has_info = record.exc_info is not None and record.exc_info != (None, None, None)
        if isinstance(exc, BaseException) and not has_info:
            record.exc_info = (type(exc), exc, exc.__traceback__)
        return True  # never suppress the record
# todo also save full log in a file?
class CollapseLogsHandler(logging.StreamHandler):
    '''
    Collapses subsequent debug log lines and redraws on the same line.
    Hopefully this gives both a sense of progress and doesn't clutter the terminal as much?
    '''

    last: bool = False  # whether the previous record was collapsed (cursor is mid-line)

    maxlevel: Level = logging.DEBUG  # everything with less or equal level will be collapsed

    def __init__(self, *args, maxlevel: Level, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.maxlevel = maxlevel

    def emit(self, record: logging.LogRecord) -> None:
        try:
            msg = self.format(record)
            # multiline messages can't be redrawn on a single line, so never collapse them
            cur = record.levelno <= self.maxlevel and '\n' not in msg
            if cur:
                if self.last:
                    self.stream.write('\033[K' + '\r')  # clear line + return carriage
            else:
                if self.last:
                    self.stream.write('\n')  # clean up after the last line
            self.last = cur
            columns, _ = os.get_terminal_size(0)
            # ugh. the columns thing is meh. dunno I guess ultimately need curses for that
            # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keywords (INFO/DEBUG/etc)
            self.stream.write(msg + ' ' * max(0, columns - len(msg)) + ('' if cur else '\n'))
            self.flush()
        # FIX: was a bare 'except:', which would also swallow KeyboardInterrupt/SystemExit;
        # narrowed to Exception, matching stdlib logging.Handler.emit convention
        except Exception:
            self.handleError(record)
def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger:
    """Fetch a logger by name with cachew's handler/formatter setup applied."""
    lgr = logging.getLogger(name)
    setup_logger(lgr, level=level)
    return lgr
# ughh. hacky way to have a single enlighten instance per interpreter, so it can be shared between modules
# not sure about this. I guess this should definitely be behind some flag
# OK, when stdout is not a tty, enlighten doesn't log anything, good
def get_enlighten():
    """Shared enlighten progress-bar manager; returns a Mock stub when disabled or unavailable."""
    # TODO could add env variable to disable enlighten for a module?
    from unittest.mock import (
        Mock,  # Mock to return stub so clients don't have to think about it
    )

    # for now hidden behind the flag since it's a little experimental
    if os.environ.get('ENLIGHTEN_ENABLE', None) is None:
        return Mock()

    try:
        import enlighten  # type: ignore[import-untyped]
    except ModuleNotFoundError:
        warnings.warn("You might want to 'pip install enlighten' for a nice progress bar", stacklevel=2)
        return Mock()

    # dirty, but otherwise a bit unclear how to share enlighten manager between packages that call each other
    shared = getattr(enlighten, 'INSTANCE', None)
    if shared is None:
        shared = enlighten.get_manager()
        setattr(enlighten, 'INSTANCE', shared)
    return shared
if __name__ == '__main__':
    # running this module directly shows the interactive logging demo
    test()


## legacy/deprecated methods for backwards compatibility
if not TYPE_CHECKING:
    LazyLogger = make_logger
    logger = make_logger
##
================================================
FILE: src/cachew/marshall/cachew.py
================================================
from __future__ import annotations
import types
from abc import abstractmethod
from collections import abc
from collections.abc import Sequence
from dataclasses import dataclass, is_dataclass
from datetime import UTC, date, datetime
from numbers import Real
from typing import ( # noqa: UP035
Any,
Dict,
List,
NamedTuple,
Optional,
Tuple,
Union,
get_args,
get_origin,
get_type_hints,
)
from zoneinfo import ZoneInfo
from ..common import TypeNotSupported
from ..utils import is_namedtuple, resolve_type_parameters
from .common import AbstractMarshall, Json
class CachewMarshall[T](AbstractMarshall[T]):
    """Marshall that (de)serializes values of T via a Schema built once from the type."""

    def __init__(self, Type_: type[T]) -> None:
        self.schema = build_schema(Type_)

    def dump(self, obj: T) -> Json:
        return self.schema.dump(obj)

    def load(self, dct: Json) -> T:
        return self.schema.load(dct)
# NOTE: using slots gives a small speedup (maybe 5%?)
# I suppose faster access to fields or something..
@dataclass(slots=True)
class Schema:
    # base for all (de)serialization schemas; 'type' is the runtime type this node handles
    type: Any

    @abstractmethod
    def dump(self, obj):
        raise NotImplementedError

    @abstractmethod
    def load(self, dct):
        raise NotImplementedError
@dataclass(slots=True)
class SPrimitive(Schema):
    # primitives (and Any) pass through untouched: the actual encoding is left
    # to the json serializer downstream
    def dump(self, obj):
        # NOTE: returning here directly (instead of calling identity lambda) gives about 20% speedup
        # I think custom types should have their own Schema subclass
        return obj
        # prim = primitives_to.get(self.type)
        # assert prim is not None
        # return prim(o)

    def load(self, dct):
        return dct
        # prim = primitives_from.get(self.type)
        # assert prim is not None
        # return prim(d)
@dataclass(slots=True)
class SDataclass(Schema):
    # schema for dataclasses and NamedTuples, serialized as {field: value} dicts
    # using list of tuples instead of dict gives about 5% speedup
    fields: tuple[tuple[str, Schema], ...]

    def dump(self, obj):
        # TODO would be nice if we didn't create a dictionary here
        # considering it is going to be serialized to json anyway
        # maybe we need to yield json bits actually?
        return {
            # would be kinda nice if we didn't have to use getattr here
            # but I think for dataclass this is actually the fastest way
            # TODO for NamedTuples could just use them as tuples.. think about separating
            k: ks.dump(getattr(obj, k))
            for k, ks in self.fields
        }

    def load(self, dct):
        # dict comprehension is meh, but not sure if there is a faster way?
        return self.type(**{
            k: ks.load(dct[k])
            for k, ks in self.fields
        })  # fmt: skip
@dataclass(slots=True)
class SUnion(Schema):
    # union values are stored as (variant_index, payload) pairs
    # it's a bit faster to cache indices here, gives about 15% speedup
    args: tuple[tuple[int, Schema], ...]

    def dump(self, obj):
        if obj is None:
            # if it's a None, then doesn't really matter how to serialize and deserialize it
            return (0, None)
        # TODO could do a bit of magic here and remember the last index that worked?
        # that way if some objects dominate the Union, the first isinstance would always work
        for tidx, a in self.args:
            if isinstance(obj, a.type):  # this takes quite a lot of time (sort of expected?)
                # using lists instead of dicts gives a bit of a speedup (about 15%)
                # so probably worth it even though a bit cryptic
                # also could add a tag or something?
                # NOTE: using tuple instead of list gives a tiiny speedup
                jj = a.dump(obj)
                return (tidx, jj)
                # {
                #     '__union_index__': tidx,
                #     '__value__': jj,
                # }
        raise RuntimeError(f"shouldn't happen: {self.args} {obj}")

    def load(self, dct):
        # tidx = d['__union_index__']
        # s = self.args[tidx]
        # return s.load(d['__value__'])
        tidx, val = dct
        if val is None:
            # counterpart for None handling in .dump method
            return None
        _, s = self.args[tidx]
        return s.load(val)
@dataclass(slots=True)
class SList(Schema):
    """Schema for list[X]: serialized as a tuple, loaded back into a list."""

    arg: Schema

    def dump(self, obj):
        return tuple(map(self.arg.dump, obj))

    def load(self, dct):
        return [self.arg.load(item) for item in dct]
@dataclass(slots=True)
class STuple(Schema):
    """Schema for fixed-shape tuple[X, Y, ...]; element count must match exactly (strict zip)."""

    args: tuple[Schema, ...]

    def dump(self, obj):
        pairs = zip(self.args, obj, strict=True)
        return tuple(schema.dump(value) for schema, value in pairs)

    def load(self, dct):
        pairs = zip(self.args, dct, strict=True)
        return tuple(schema.load(value) for schema, value in pairs)
@dataclass(slots=True)
class SSequence(Schema):
    """Schema for homogeneous Sequence[X]; round-trips as a tuple on both sides."""

    arg: Schema

    def dump(self, obj):
        return tuple(map(self.arg.dump, obj))

    def load(self, dct):
        return tuple(map(self.arg.load, dct))
@dataclass(slots=True)
class SDict(Schema):
    """Schema for dict[K, V] where K is primitive; only values go through a sub-schema."""

    ft: SPrimitive  # key schema (primitive, keys pass through)
    tt: Schema  # value schema

    def dump(self, obj):
        dump_value = self.tt.dump
        return {key: dump_value(value) for key, value in obj.items()}

    def load(self, dct):
        load_value = self.tt.load
        return {key: load_value(value) for key, value in dct.items()}
# TODO unify with primitives?
# types whose values can be embedded in json as-is (used for exception args)
JTypes = {int, str, type(None), float, bool}
def _exc_helper(args):
    """Yield a JSON-safe representation of each exception arg (lossy for rich types)."""
    for arg in args:
        arg_type = type(arg)
        if arg_type in JTypes:
            yield arg
        elif issubclass(arg_type, date):
            # TODO would be nice to restore datetime from cache too
            # maybe generally save exception as a union? or intact and let orjson save it?
            yield arg.isoformat()
        else:
            yield str(arg)  # not much we can do..
@dataclass(slots=True)
class SException(Schema):
    # exceptions round-trip via their args tuple (JSON-safe-ified by _exc_helper)
    def dump(self, obj: Exception) -> Json:
        return tuple(_exc_helper(obj.args))

    def load(self, dct: Json):
        return self.type(*dct)
try:
    # defensive to avoid dependency on pytz when we switch to python >= 3.9
    import pytz
except ModuleNotFoundError:
    # dummy, this is only needed for isinstance check below
    class pytz_BaseTzInfo:
        zone: str

    def make_tz_pytz(zone: str):
        # pytz-tagged cache entries can't be deserialized without pytz installed
        raise RuntimeError(f"Install pytz to deserialize {zone}")

else:
    pytz_BaseTzInfo = pytz.BaseTzInfo  # type: ignore[misc,assignment]

    make_tz_pytz = pytz.timezone
# tags distinguishing which timezone library produced the stored zone name;
# just ints to avoid inflating db size
# for now, we try to preserve actual timezone object just in case since they do have somewhat incompatible apis
_TZTAG_ZONEINFO = 1
_TZTAG_PYTZ = 2
@dataclass(slots=True)
class SDatetime(Schema):
    # datetimes are stored as (iso string, zone name or None, zone tag or None)
    def dump(self, obj: datetime) -> Json:
        iso = obj.isoformat()
        tz = obj.tzinfo
        if tz is None:
            return (iso, None, None)
        if isinstance(tz, ZoneInfo):
            return (iso, tz.key, _TZTAG_ZONEINFO)
        elif isinstance(tz, pytz_BaseTzInfo):
            zone = tz.zone
            # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
            assert zone is not None, (obj, tz)
            return (iso, zone, _TZTAG_PYTZ)
        else:
            # unknown tzinfo flavour: the fixed offset survives in the iso string
            return (iso, None, None)

    def load(self, dct: tuple):
        iso, zone, zone_tag = dct
        dt = datetime.fromisoformat(iso)
        if zone is None:
            return dt
        # reconstruct with the same tz library that produced the entry
        make_tz = ZoneInfo if zone_tag == _TZTAG_ZONEINFO else make_tz_pytz
        tz = make_tz(zone)
        return dt.astimezone(tz)
@dataclass(slots=True)
class SDate(Schema):
    # plain dates round-trip as ISO-8601 strings
    def dump(self, obj: date) -> Json:
        return obj.isoformat()

    def load(self, dct: str):
        return date.fromisoformat(dct)
# maps a python type to the type used for isinstance checks in SUnion/SPrimitive
PRIMITIVES = {
    # int and float are handled a bit differently to allow implicit casts
    # isinstance(.., Real) works both for int and for float
    # Real can't be serialized back, but if you look in SPrimitive, it leaves the values intact anyway
    # since the actual serialization of primitives is handled by orjson
    int: Real,
    float: Real,
    str: str,
    type(None): type(None),
    bool: bool,
    # if type is Any, there isn't much we can do to dump it -- just dump into json and rely on the best
    # so in this sense it works exacly like primitives
    Any: Any,
}
def build_schema(Type) -> Schema:
    """
    Recursively build the Schema tree for Type.

    Dispatch order matters: unions before containers, and datetime before date
    (datetime is a date subclass).
    """
    # just to avoid confusion in case of weirdness with stringish type annotations
    assert not isinstance(Type, str), Type

    Type = resolve_type_parameters(Type)

    ptype = PRIMITIVES.get(Type)
    if ptype is not None:
        return SPrimitive(type=ptype)

    origin = get_origin(Type)

    # origin is 'unsubscripted/erased' version of type
    # if origin is NOT None, it's some sort of generic type
    if origin is None:
        if issubclass(Type, Exception):
            return SException(type=Type)

        if issubclass(Type, datetime):
            return SDatetime(type=Type)

        if issubclass(Type, date):
            return SDate(type=Type)

        if not (is_dataclass(Type) or is_namedtuple(Type)):
            raise TypeNotSupported(type_=Type, reason='unknown type')

        try:
            hints = get_type_hints(Type)
        except TypeError as te:
            # this can happen for instance on 3.9 if pipe syntax was used for Union types
            # would be nice to provide a friendlier error though
            raise TypeNotSupported(type_=Type, reason='failed to get type hints') from te

        fields = tuple((k, build_schema(t)) for k, t in hints.items())
        return SDataclass(
            type=Type,
            fields=fields,
        )

    args = get_args(Type)

    # covers both typing.Union and the 3.10+ `X | Y` form
    is_union = origin is Union or origin is types.UnionType
    if is_union:
        # We 'erasing' types (since generic types don't work with isinstance checks).
        # So we need to make sure the types are unique to make sure we can deserialise them.
        schemas = [build_schema(a) for a in args]
        union_types = [s.type for s in schemas if s.type is not Real]
        if len(set(union_types)) != len(union_types):
            raise TypeNotSupported(type_=Type, reason=f'runtime union arguments are not unique: {union_types}')
        return SUnion(
            type=origin,
            args=tuple(
                (tidx, s)
                for tidx, s in enumerate(schemas)
            ),
        )  # fmt: skip

    is_listish = origin is list
    if is_listish:
        (t,) = args
        return SList(
            type=origin,
            arg=build_schema(t),
        )

    # hmm check for is typing.Sequence doesn't pass for some reason
    # perhaps because it's a deprecated alias?
    is_tuplish = origin is tuple or origin is abc.Sequence
    if is_tuplish:
        if origin is tuple:
            # this is for Tuple[()], which is the way to represent empty tuple
            # before python 3.11, get_args for that gives ((),) instead of an empty tuple () as one might expect
            if args == ((),):
                args = ()
            return STuple(
                type=origin,
                args=tuple(build_schema(a) for a in args),
            )
        else:
            (t,) = args
            return SSequence(
                type=origin,
                arg=build_schema(t),
            )

    is_dictish = origin is dict
    if is_dictish:
        (ft, tt) = args
        fts = build_schema(ft)
        tts = build_schema(tt)
        # keys must be primitive so they can stay as plain json object keys
        assert isinstance(fts, SPrimitive)
        return SDict(
            type=origin,
            ft=fts,
            tt=tts,
        )

    raise RuntimeError(f"unsupported: {Type=} {origin=} {args=}")
######### tests
def _test_identity(obj, Type_, expected=None):
    """
    Round-trip *obj* through CachewMarshall for Type_ and assert the result
    matches *expected* (defaults to *obj* itself). Returns (dumped_json, restored).
    """
    if expected is None:
        expected = obj
    marshall = CachewMarshall(Type_)
    dumped = marshall.dump(obj)
    restored = marshall.load(dumped)

    def _comparable(value):
        # Exceptions don't define structural equality, so compare (type, args) instead
        if isinstance(value, Exception):
            return (type(value), value.args)
        if type(value) is list:
            return [(type(item), item.args) if isinstance(item, Exception) else item for item in value]
        return value

    assert _comparable(expected) == _comparable(restored), (expected, restored)
    return (dumped, restored)
## this is used for test below...
# however if we define this inside the test function, it fails if from __future__ import annotations is present on the file..
# NOTE: these are PEP 695 (python 3.12+) type alias statements, exercised by test_serialize_and_deserialize
type _IntType = int
type _StrIntType = str | int
##
# TODO customise with cattrs
# TODO customise with cattrs
def test_serialize_and_deserialize() -> None:
    """Round-trips a wide variety of types through CachewMarshall via _test_identity."""
    import pytest

    helper = _test_identity

    # primitives
    helper(1, int)
    helper('aaa', str)
    helper(None, type(None))
    # TODO emit other value as none type? not sure what should happen

    # implicit casts, simple version
    helper(None, int)
    helper(None, str)
    helper(1, float)

    # implicit casts, inside other types
    # technically not type safe, but might happen in practice
    # doesn't matter how to deserialize None anyway so let's allow this
    helper(None, str | int)
    # old syntax
    helper(None, Union[str, int])  # noqa: UP007
    # even though 1 is not isinstance(float), often it ends up as float in data
    # see https://github.com/karlicoss/cachew/issues/54
    helper(1, float | str)
    helper(2, float | int)
    helper(2.0, float | int)
    helper((1, 2), tuple[int, float])

    # optionals
    helper('aaa', str | None)
    helper(None, str | None)
    # old syntax
    helper('aaa', Optional[str])  # noqa: UP045
    helper('aaa', Union[str, None])  # noqa: UP007
    helper(None, Union[str, None])  # noqa: UP007

    # lists/tuples/sequences
    # TODO test with from __future__ import annotations..
    helper([1, 2, 3], list[int])
    helper([1, 2, 3], Optional[List[int]])  # noqa: UP006,UP045
    helper([1, 2, 3], Sequence[int], expected=(1, 2, 3))
    helper((1, 2, 3), Sequence[int])
    helper((1, 2, 3), tuple[int, int, int])
    # old syntax
    helper([1, 2, 3], List[int])  # noqa: UP006
    helper((1, 2, 3), Tuple[int, int, int])  # noqa: UP006
    helper((1, 2, 3), Optional[tuple[int, int, int]])  # noqa: UP045

    # dicts
    helper({'a': 'aa', 'b': 'bb'}, dict[str, str])
    helper({'a': None, 'b': 'bb'}, dict[str, str | None])
    helper({'a': 'aa', 'b': 'bb'}, dict[str, str])
    # old syntax
    helper({'a': None, 'b': 'bb'}, Dict[str, Optional[str]])  # noqa: UP006,UP045

    # unions
    helper('aaa', str | int)
    # old syntax
    helper(1, Union[str, int])  # noqa: UP007

    # compounds of simple types
    helper(['1', 2, '3'], list[str | int])
    # old syntax
    helper(['1', 2, '3'], list[Union[str, int]])  # noqa: UP007

    # TODO need to add test for equivalent dataclasses

    @dataclass
    class Point:
        x: int
        y: int

    # dataclasses
    helper(Point(x=1, y=2), Point)

    # Namedtuple
    class NT(NamedTuple):
        first: str
        last: str

    helper(NT(first='aaa', last='bbb'), NT)

    @dataclass
    class WithJson:
        id: int
        raw_data: dict[str, Any]

    ## type aliases including new 3.12 type aliases
    # this works..
    StrInt = str | int
    helper('aaa', StrInt)
    helper('aaa', _StrIntType)
    helper([1, 2, 3], list[_IntType])

    @dataclass
    class TestTypeAlias:
        x: _IntType
        value: _StrIntType

    helper(TestTypeAlias(x=1, value='aaa'), TestTypeAlias)
    ##

    # json-ish stuff
    helper({}, dict[str, Any])
    helper(WithJson(id=123, raw_data={'payload': 'whatever', 'tags': ['a', 'b', 'c']}), WithJson)
    helper([], list[Any])

    # exceptions
    helper(RuntimeError('whatever!'), RuntimeError)
    # fmt: off
    helper([
        RuntimeError('I', 'am', 'exception', 123),
        Point(x=1, y=2),
        Point(x=11, y=22),
        RuntimeError('more stuff'),
        RuntimeError(),
    ], list[RuntimeError | Point])
    exc_with_datetime = Exception('I happenned on', datetime.fromisoformat('2021-04-03T10:11:12'))
    exc_with_datetime_exp = Exception('I happenned on', '2021-04-03T10:11:12')
    helper(exc_with_datetime, Exception, expected=exc_with_datetime_exp)
    # fmt: on

    # datetimes
    import pytz

    tz_london = pytz.timezone('Europe/London')
    dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S')
    dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S')
    dwinter_tz = tz_london.localize(dwinter)
    dsummer_tz = tz_london.localize(dsummer)
    dates_tz = [
        dwinter_tz,
        dsummer_tz,
    ]
    tz_sydney = ZoneInfo('Australia/Sydney')
    ## these will have same local time (2025-04-06 02:01:00) in Sydney due to DST shift!
    ## the second one will have fold=1 set to disambiguate
    utc_before_shift = datetime.fromisoformat('2025-04-05T15:01:00+00:00')
    utc_after__shift = datetime.fromisoformat('2025-04-05T16:01:00+00:00')
    ##
    sydney_before = utc_before_shift.astimezone(tz_sydney)
    sydney__after = utc_after__shift.astimezone(tz_sydney)
    dates_tz.extend([sydney_before, sydney__after])
    dates = [
        *dates_tz,
        dwinter,
        dsummer,
        dsummer.replace(tzinfo=UTC),
    ]
    for d in dates:
        _jj, dd = helper(d, datetime)
        assert str(d) == str(dd)
        # test that we preserve zone names
        if d in dates_tz:
            # this works both with pytz and zoneinfo without getting .zone or .key attributes
            assert str(d.tzinfo) == str(dd.tzinfo)

    # also pin the exact serialized representation (value, zone name, tz backend tag)
    assert helper(dsummer_tz, datetime)[0] == ('2020-08-03T01:02:03+01:00', 'Europe/London', _TZTAG_PYTZ)
    assert helper(dwinter, datetime)[0] == ('2020-02-03T01:02:03', None, None)
    assert helper(sydney_before, datetime)[0] == ('2025-04-06T02:01:00+11:00', 'Australia/Sydney', _TZTAG_ZONEINFO)
    assert helper(sydney__after, datetime)[0] == ('2025-04-06T02:01:00+10:00', 'Australia/Sydney', _TZTAG_ZONEINFO)
    assert helper(dwinter.date(), date)[0] == '2020-02-03'

    # unsupported types
    class NotSupported:
        pass

    with pytest.raises(RuntimeError, match=r".*NotSupported.* isn't supported by cachew"):
        helper([NotSupported()], list[NotSupported])

    # edge cases
    helper((), tuple[()])

    # unions of generic sequences and such
    # these don't work because the erased type of both is just 'list'..
    # so there is no way to tell which one we need to construct :(
    with pytest.raises(TypeNotSupported, match=r".*runtime union arguments are not unique"):
        helper([1, 2, 3], list[int] | list[Exception])
    with pytest.raises(TypeNotSupported, match=r".*runtime union arguments are not unique"):
        helper([1, 2, 3], list[Exception] | list[int])
================================================
FILE: src/cachew/marshall/common.py
================================================
from abc import abstractmethod
from typing import Any
# JSON-compatible value shape produced/consumed by marshall implementations
type Json = dict[str, Any] | tuple[Any, ...] | str | float | int | bool | None
class AbstractMarshall[T]:
    """Interface for converting objects of type T to and from a Json representation."""

    @abstractmethod
    def dump(self, obj: T) -> Json:
        """Serialize *obj* into a Json-compatible value."""
        raise NotImplementedError

    @abstractmethod
    def load(self, dct: Json) -> T:
        """Reconstruct an object of type T from its Json representation."""
        raise NotImplementedError
================================================
FILE: src/cachew/py.typed
================================================
================================================
FILE: src/cachew/pytest.py
================================================
"""
Helpers to prevent depending on pytest in runtime
"""
import sys
import typing
# detect pytest via sys.modules instead of importing it, to avoid a hard runtime dependency
under_pytest = 'pytest' in sys.modules

if typing.TYPE_CHECKING or under_pytest:
    import pytest

    parametrize = pytest.mark.parametrize
else:
    # pytest not available at runtime: substitute a no-op decorator factory
    def parametrize(*_args, **_kwargs):
        def wrapper(f):
            return f

        return wrapper
================================================
FILE: src/cachew/tests/marshall.py
================================================
# ruff: noqa: ARG001 # ruff thinks pytest fixtures are unused arguments
import shutil
import sqlite3
import sys
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Literal
import orjson
import pytest
from ..marshall.cachew import CachewMarshall
from ..marshall.common import Json
from .utils import (
gc_control, # noqa: F401
profile,
running_on_ci,
timer,
)
# which (de)serialization implementation a benchmark run exercises
Impl = Literal[
    'cachew',  # our custom deserialization
    'cattrs',
    'legacy',  # our legacy deserialization
]
# don't include legacy by default, it's only here just for the sake of comparing once before switch
Impls: list[Impl] = ['cachew', 'cattrs']
def do_test(*, test_name: str, Type, factory, count: int, impl: Impl = 'cachew') -> None:
    """
    Benchmark helper: runs `count` objects produced by `factory` through the chosen
    `impl`, timing/profiling each stage separately (serialize, json dump/load,
    sqlite dump/load, jsonl dump/load, deserialize) and sanity-checking round-trips.
    """
    if count > 100 and running_on_ci:
        pytest.skip("test too heavy for CI, only meant to run manually")

    to_json: Any
    from_json: Any
    if impl == 'cachew':
        marshall = CachewMarshall(Type_=Type)
        to_json = marshall.dump
        from_json = marshall.load
    elif impl == 'legacy':
        from ..legacy import NTBinder

        # NOTE: legacy binder emits a tuple which can be inserted directly into the database
        # so 'json dump' and 'json load' should really be disregarded for this flavor
        # if you're comparing with implementation, you should compare
        # legacy serializing as the sum of serializing + json dump
        # that said, this way legacy will have a bit of an advantage since custom types (e.g. datetime)
        # would normally be handled by sqlalchemy instead
        binder = NTBinder.make(Type)
        to_json = binder.to_row
        from_json = binder.from_row
    elif impl == 'cattrs':
        from cattrs import Converter

        converter = Converter()

        from typing import get_args

        # TODO use later
        # from typing import Union, get_origin
        # import types
        # def is_union(type_) -> bool:
        #     origin = get_origin(type_)
        #     return origin is Union or origin is types.UnionType

        def union_structure_hook_factory(_):
            def union_hook(data, type_):
                args = get_args(type_)
                if data is None:  # we don't try to coerce None into anything
                    return None
                for t in args:
                    try:
                        res = converter.structure(data, t)
                    except Exception:
                        continue
                    else:
                        return res
                raise ValueError(f"Could not cast {data} to {type_}")

            return union_hook

        # borrowed from https://github.com/python-attrs/cattrs/issues/423
        # uhh, this doesn't really work straightaway...
        # likely need to combine what cattr does with configure_tagged_union
        # converter.register_structure_hook_factory(is_union, union_structure_hook_factory)
        # configure_tagged_union(
        #     union=Type,
        #     converter=converter,
        # )

        # NOTE: this seems to give a bit of speedup... maybe raise an issue or something?
        # fmt: off
        unstruct_func = converter._unstructure_func.dispatch(Type)  # type: ignore[call-arg, misc]  # about 20% speedup
        struct_func = converter._structure_func .dispatch(Type)  # type: ignore[call-arg, misc]  # TODO speedup
        # fmt: on
        to_json = unstruct_func
        # todo would be nice to use partial? but how do we bind a positional arg?
        from_json = lambda x: struct_func(x, Type)
    else:
        raise RuntimeError(impl)

    print(file=sys.stderr)  # kinda annoying, pytest starts printing on the same line as test name

    with profile(test_name + ':baseline'), timer(f'building {count} objects of type {Type}'):
        objects = list(factory(count=count))

    # lists are pre-sized and filled by index so allocation doesn't skew the timed loops
    jsons: list[Json] = [None for _ in range(count)]
    with profile(test_name + ':serialize'), timer(f'serializing {count} objects of type {Type}'):
        for i in range(count):
            jsons[i] = to_json(objects[i])  # ty: ignore[invalid-assignment]

    strs: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]
    with profile(test_name + ':json_dump'), timer(f'json dump {count} objects of type {Type}'):
        for i in range(count):
            # TODO any orjson options to speed up?
            strs[i] = orjson.dumps(jsons[i])

    db = Path('/tmp/cachew_test/db.sqlite')
    if db.parent.exists():
        shutil.rmtree(db.parent)
    db.parent.mkdir()
    with profile(test_name + ':sqlite_dump'), timer(f'sqlite dump {count} objects of type {Type}'):
        with sqlite3.connect(db) as conn:
            conn.execute('CREATE TABLE data (value BLOB)')
            conn.executemany('INSERT INTO data (value) VALUES (?)', [(s,) for s in strs])
        # the with-block only commits; close the connection explicitly
        conn.close()

    strs2: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]
    with profile(test_name + ':sqlite_load'), timer(f'sqlite load {count} objects of type {Type}'):
        with sqlite3.connect(db) as conn:
            i = 0
            for (value,) in conn.execute('SELECT value FROM data'):
                strs2[i] = value
                i += 1
        conn.close()

    cache = db.parent / 'cache.jsonl'
    with profile(test_name + ':jsonl_dump'), timer(f'jsonl dump {count} objects of type {Type}'):
        with cache.open('wb') as fw:
            for s in strs:
                fw.write(s + b'\n')

    strs3: list[bytes] = [None for _ in range(count)]  # type: ignore[misc]
    with profile(test_name + ':jsonl_load'), timer(f'jsonl load {count} objects of type {Type}'):
        i = 0
        with cache.open('rb') as fr:
            for l in fr:
                l = l.rstrip(b'\n')
                strs3[i] = l
                i += 1

    assert strs2[:100] + strs2[-100:] == strs3[:100] + strs3[-100:]  # just in case

    jsons2: list[Json] = [None for _ in range(count)]
    with profile(test_name + ':json_load'), timer(f'json load {count} objects of type {Type}'):
        for i in range(count):
            # TODO any orjson options to speed up?
            jsons2[i] = orjson.loads(strs2[i])

    objects2 = [None for _ in range(count)]
    with profile(test_name + ':deserialize'), timer(f'deserializing {count} objects of type {Type}'):
        for i in range(count):
            objects2[i] = from_json(jsons2[i])  # ty: ignore[invalid-argument-type]

    assert objects[:100] + objects[-100:] == objects2[:100] + objects2[-100:]
@dataclass
class Name:
    # minimal dataclass payload used in the union benchmark below
    first: str
    last: str
@pytest.mark.parametrize('impl', Impls)
@pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_union_str_dataclass(impl: Impl, count: int, gc_control, request) -> None:
    """Benchmark (de)serialization of a `str | Name` union payload."""
    # NOTE: previously was union_str_namedtuple, but adapted to work with cattrs for now
    # perf difference between datacalss/namedtuple here seems negligible so old benchmark results should apply
    if impl == 'cattrs':
        pytest.skip('TODO need to adjust the handling of Union types..')

    def factory(count: int):
        # alternate between the two union branches
        objects: list[str | Name] = []
        for i in range(count):
            if i % 2 == 0:
                objects.append(str(i))
            else:
                objects.append(Name(first=f'first {i}', last=f'last {i}'))
        return objects

    do_test(test_name=request.node.name, Type=str | Name, factory=factory, count=count, impl=impl)
# OK, performance with calling this manually (not via pytest) is the same
# do_test_union_str_dataclass(count=1_000_000, test_name='adhoc')
@pytest.mark.parametrize('impl', Impls)
@pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_datetimes(impl: Impl, count: int, gc_control, request) -> None:
    """Benchmark (de)serialization of tz-aware datetimes (pytz + stdlib UTC)."""
    if impl == 'cattrs':
        pytest.skip('TODO support datetime with pytz for cattrs')

    import pytz

    def factory(*, count: int):
        # cycle through several tzinfo flavors across an evenly-spaced date range
        tzs = [
            pytz.timezone('Europe/Berlin'),
            UTC,
            pytz.timezone('America/New_York'),
        ]
        start = datetime.fromisoformat('1990-01-01T00:00:00')
        end = datetime.fromisoformat('2030-01-01T00:00:00')
        step = (end - start) / count
        for i in range(count):
            dt = start + step * i
            tz = tzs[i % len(tzs)]
            yield dt.replace(tzinfo=tz)

    do_test(test_name=request.node.name, Type=datetime, factory=factory, count=count, impl=impl)
@pytest.mark.parametrize('impl', Impls)
@pytest.mark.parametrize('count', [99, 1_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_nested_dataclass(impl: Impl, count: int, gc_control, request) -> None:
    """Benchmark (de)serialization of dataclasses nested one level deep."""
    # NOTE: was previously named test_many_from_cachew
    @dataclass
    class UUU:
        xx: int
        yy: int

    @dataclass
    class TE2:
        value: int
        uuu: UUU
        value2: int

    def factory(*, count: int):
        for i in range(count):
            yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i)

    do_test(test_name=request.node.name, Type=TE2, factory=factory, count=count, impl=impl)
# TODO next test should probs be runtimeerror?
================================================
FILE: src/cachew/tests/test_cachew.py
================================================
# ruff: noqa: ARG001 # ruff thinks pytest fixtures are unused arguments
import hashlib
import inspect
import platform
import string
import sys
import time
import timeit
from collections.abc import Iterable, Iterator, Sequence
from concurrent.futures import ProcessPoolExecutor
from contextlib import nullcontext
from dataclasses import asdict, dataclass
from datetime import UTC, date, datetime
from itertools import chain, islice
from pathlib import Path
from random import Random
from subprocess import check_call, check_output, run
from time import sleep
from typing import (
Any,
NamedTuple,
cast,
)
import patchy
import pytest
from more_itertools import ilen, last, one, unique_everseen
from .. import (
Backend,
CachewException,
cachew,
callable_name,
get_logger,
settings,
)
from .utils import (
gc_control, # noqa: F401
running_on_ci,
)
logger = get_logger()
@pytest.fixture(autouse=True)
def set_default_cachew_dir(tmp_path: Path):
    # point the default cache dir at a per-test temporary location
    tpath = tmp_path / 'cachew_default'
    settings.DEFAULT_CACHEW_DIR = tpath
@pytest.fixture(autouse=True)
def throw_on_errors():
    # NOTE: in tests we always throw on errors, it's a more reasonable default for testing.
    # we still check defensive behaviour in test_defensive
    settings.THROW_ON_ERROR = True
# TODO restore it?
@pytest.fixture(autouse=True, params=['sqlite', 'file'])
def set_backend(restore_settings, request):
    # run every test against both supported storage backends
    backend = request.param
    settings.DEFAULT_BACKEND = backend
# TODO restore it??
@pytest.fixture
def restore_settings():
    """Snapshot cachew settings before the test and restore them afterwards."""
    saved = {name: value for name, value in settings.__dict__.items() if not name.startswith('__')}
    try:
        yield
    finally:
        for name, value in saved.items():
            setattr(settings, name, value)
class UUU(NamedTuple):
    # simple two-field record used as a payload throughout the tests
    xx: int
    yy: int
def test_simple() -> None:
    # just make sure all the high level cachew stuff is working
    @cachew
    def fun() -> Iterable[UUU]:
        yield from []

    list(fun())
def test_string_annotation_old() -> None:
    """
    For some reason collections.abc.Iterable doesn't seem to work here on python <= 3.11
    , it only sees 'UUU' as a string

    Keeping this just as a demonstration, probably not worth trying to support as it's fairly esoteric combo.
    """
    from typing import Iterable as typing_Iterable  # noqa: UP035

    @cachew
    def fun() -> typing_Iterable['UUU']:
        yield from []

    # should properly infer UUU type
    list(fun())
def test_string_annotation_new() -> None:
    """Forward-referenced ('UUU') return annotations should still be resolvable."""

    @cachew
    def fun() -> Iterable['UUU']:
        yield from []

    # should properly infer UUU type
    list(fun())
def test_custom_hash(tmp_path: Path) -> None:
    """
    Demo of using argument's modification time to determine if underlying data changed
    """
    src = tmp_path / 'source'
    src.write_text('0')

    entities = [
        UUU(xx=1, yy=1),
        UUU(xx=2, yy=2),
        UUU(xx=3, yy=3),
    ]
    calls = 0  # counts how many times the underlying function actually ran

    def get_path_version(path: Path):
        ns = path.stat().st_mtime_ns
        # hmm, this might be unreliable, sometimes mtime doesn't change even after modifications?
        # I suppose it takes some time for them to sync or something...
        # so let's compute md5 or something in addition..
        md5 = hashlib.md5(path.read_bytes()).digest()
        return str((ns, md5))

    @cachew(
        cache_path=tmp_path,
        depends_on=get_path_version,  # when path is updated, underlying cache would be discarded
    )
    def data(path: Path) -> Iterable[UUU]:
        nonlocal calls
        calls += 1
        count = int(path.read_text())
        return entities[:count]

    ldata = lambda: list(data(path=src))

    # file contains '0' -> no entities; repeated calls hit the cache
    assert len(ldata()) == 0
    assert len(ldata()) == 0
    assert len(ldata()) == 0
    assert calls == 1

    src.write_text('1')
    assert ldata() == entities[:1]
    assert ldata() == entities[:1]
    assert calls == 2

    src.write_text('3')
    assert ldata() == entities
    assert ldata() == entities
    assert calls == 3
def test_caching(tmp_path: Path) -> None:
    """First call pays the artificial sleeps; the second should be served from cache."""

    @cachew(tmp_path)
    def data() -> Iterator[UUU]:
        time.sleep(1)
        for i in range(5):
            yield UUU(xx=i, yy=i)
            time.sleep(1)

    # custom timeit template so timeit returns the statement's value as well
    # https://stackoverflow.com/a/40385994/706389
    template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""
    timeit.template = template  # type: ignore[attr-defined]
    timer = timeit.Timer(lambda: len(list(data())))

    t, cnt = cast(tuple[float, int], timer.timeit(number=1))
    assert cnt == 5
    assert t > 5.0, 'should take at least 5 seconds'

    t, cnt = cast(tuple[float, int], timer.timeit(number=1))
    assert cnt == 5
    assert t < 2.0, 'should be pretty much instantaneous'
def test_error(tmp_path: Path) -> None:
    '''
    Test behaviour when the first time cache is initialized it ends up with an error
    '''
    cache_file = tmp_path / 'cache'
    assert not cache_file.exists(), cache_file  # just precondition

    should_raise = True

    @cachew(cache_file, force_file=True)
    def fun() -> Iterator[str]:
        yield 'string1'
        if should_raise:
            raise RuntimeError('oops')
        yield 'string2'

    with pytest.raises(RuntimeError, match='oops'):
        list(fun())

    # vvv this would be nice but might be tricky because of the way sqlite works (i.e. wal mode creates a file)
    # assert not cache_file.exists(), cache_file
    # perhaps doesn't hurt either way as long this vvv works properly

    # shouldn't cache anything and crach again
    with pytest.raises(RuntimeError, match='oops'):
        list(fun())

    should_raise = False
    assert list(fun()) == ['string1', 'string2']
def test_cache_path(tmp_path: Path) -> None:
    '''
    Tests various ways of specifying cache path
    '''
    calls = 0  # counts actual invocations of the wrapped function

    def orig() -> Iterable[int]:
        nonlocal calls
        yield 1
        yield 2
        calls += 1

    # nonexistent path: should be created (as a directory by default)
    fun = cachew(tmp_path / 'non_existent_dir' / 'cache_dir')(orig)
    assert list(fun()) == [1, 2]
    assert calls == 1
    assert list(fun()) == [1, 2]
    assert calls == 1

    # dir by default
    cdir = tmp_path / 'non_existent_dir' / 'cache_dir'
    assert cdir.is_dir()
    cfile = one(cdir.glob('*'))
    assert cfile.name.startswith('cachew.tests.test_cachew:test_cache_path.')

    # treat None as "don't cache"
    fun = cachew(cache_path=None)(orig)
    assert list(fun()) == [1, 2]
    assert calls == 2
    assert list(fun()) == [1, 2]
    assert calls == 3

    # a pre-existing file is used as the cache file directly
    f = tmp_path / 'a_file'
    f.touch()
    fun = cachew(cache_path=f)(orig)
    assert list(fun()) == [1, 2]
    assert calls == 4
    assert list(fun()) == [1, 2]
    assert calls == 4

    fun = cachew(tmp_path / 'name', force_file=True)(orig)
    assert list(fun()) == [1, 2]
    assert calls == 5
    assert list(fun()) == [1, 2]
    assert calls == 5
    # if passed force_file, also treat as file
    assert (tmp_path / 'name').is_file()

    # treat None as "don't cache" ('factory')
    # hmm not sure why mypy complains here.. might better if we get to use ParamSpec?
    fun = cachew(cache_path=lambda *args: None)(orig)  # type: ignore[arg-type]  # noqa: ARG005
    assert list(fun()) == [1, 2]
    assert calls == 6
    assert list(fun()) == [1, 2]
    assert calls == 7
# TODO this won't work at the moment
# f.write_text('garbage')
# not sure... on the one hand could just delete the garbage file and overwrite with db
# on the other hand, wouldn't want to delete some user file by accident
class UGood(NamedTuple):
    # supported by cachew: a plain annotated NamedTuple
    x: int
class UBad:
    # deliberately unsupported: no annotated fields for cachew to introspect
    pass
def test_unsupported_class(tmp_path: Path) -> None:
    """Types cachew can't serialize should raise CachewException at decoration time."""
    with pytest.raises(CachewException, match=r'.*failed to infer cache type.*'):

        @cachew(cache_path=tmp_path)
        def fun() -> list[UBad]:
            return [UBad()]

    with pytest.raises(CachewException, match=r".*can't infer type from.*"):

        @cachew(cache_path=tmp_path)
        def fun2() -> Iterable[UGood | UBad]:
            yield UGood(x=1)
            yield UBad()
            yield UGood(x=2)
class TE2(NamedTuple):
    # nested record (contains UUU) used by the test_many benchmark
    value: int
    uuu: UUU
    value2: int
# you can run one specific test (e.g. to profile) by passing it as -k to pytest
# e.g. -k 'test_many[500000-False]'
@pytest.mark.parametrize('count', [99, 500_000, 1_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_many(count: int, tmp_path: Path, gc_control) -> None:
    """Benchmark-ish test: write and read `count` entries through the cache, printing timings."""
    if count > 99 and running_on_ci:
        pytest.skip("test would be too slow on CI, only meant to run manually")

    # should be a parametrized test perhaps
    src = tmp_path / 'source'
    src.touch()

    cache_path = tmp_path / 'test_many'

    @cachew(cache_path=cache_path, force_file=True)
    def iter_data() -> Iterator[TE2]:
        for i in range(count):
            # TODO also profile datetimes?
            yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i)

    a = time.time()
    assert ilen(iter_data()) == count  # initial
    b = time.time()
    print(f'test_many: initial write to cache took {b - a:.1f}s', file=sys.stderr)
    print(f'test_many: cache size is {cache_path.stat().st_size / 10**6}Mb', file=sys.stderr)

    a = time.time()
    assert ilen(iter_data()) == count  # hitting cache
    b = time.time()
    print(f'test_many: reading from cache took {b - a:.1f}s', file=sys.stderr)

    assert last(iter_data()) == TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1)
# serializing to db
# in-memory: 16 seconds
# without transaction: 22secs
# without transaction and size 100 chunks -- some crazy amount of time, as expected
# with transaction:
# about 17 secs to write 1M entries (just None)
# chunking by 20K doesn't seem to help
# chunking by 100 also gives same perf
# with to_row binding: 21 secs for dummy NamedTuple with None inside, 22 for less trivial class
# deserializing from db:
# initially, took 20 secs to load 1M entries (TE2)
# 9 secs currently
# 6 secs if we instantiate namedtuple directly via indices
# 3.5 secs if we just return None from row
class BB(NamedTuple):
    # inner record, optionally nested inside AA
    xx: int
    yy: int
class AA(NamedTuple):
    # outer record holding an optional BB
    value: int
    b: BB | None
    value2: int
def test_return_type_inference(tmp_path: Path) -> None:
    """
    Tests that return type (BB) is inferred from the type annotation
    """

    @cachew(tmp_path)
    def data() -> Iterator[BB]:
        yield BB(xx=1, yy=2)
        yield BB(xx=3, yy=4)

    assert len(list(data())) == 2
    assert len(list(data())) == 2
def test_return_type_mismatch(tmp_path: Path) -> None:
    # even though user got invalid type annotation here, they specified correct type, and it's the one that should be used
    @cachew(tmp_path, cls=AA)
    def data2() -> list[BB]:
        return [  # ty: ignore[invalid-return-type]
            AA(value=1, b=None, value2=123),  # type: ignore[list-item]
        ]

    # TODO hmm, this is kinda a downside that it always returns
    # could preserve the original return type, but too much trouble for now
    assert list(data2()) == [AA(value=1, b=None, value2=123)]  # type: ignore[comparison-overlap]
def test_return_type_none(tmp_path: Path) -> None:
    # with no annotation and no explicit cls, cachew can't infer the cache type
    with pytest.raises(CachewException):

        @cachew(tmp_path)
        def data():
            return []
def test_callable_cache_path(tmp_path: Path) -> None:
    """
    Cache path can be function dependent on wrapped function's arguments
    """
    called: set[str] = set()  # each kind should only ever be computed once

    @cachew(cache_path=lambda kind: tmp_path / f'{kind}.cache')
    def get_data(kind: str) -> Iterator[BB]:
        assert kind not in called
        called.add(kind)
        if kind == 'first':
            yield BB(xx=1, yy=1)
        else:
            yield BB(xx=2, yy=2)

    # fmt: off
    assert list(get_data('first'))  == [BB(xx=1, yy=1)]
    assert list(get_data('second')) == [BB(xx=2, yy=2)]
    assert list(get_data('first'))  == [BB(xx=1, yy=1)]
    assert list(get_data('second')) == [BB(xx=2, yy=2)]
    # fmt: on
def test_nested(tmp_path: Path) -> None:
    """Nested records (AA containing an optional BB) should round-trip through the cache."""
    d1 = AA(
        value=1,
        b=BB(xx=2, yy=3),
        value2=4,
    )
    d2 = AA(
        value=3,
        b=None,
        value2=5,
    )

    def data():
        yield d1
        yield d2

    @cachew(cache_path=tmp_path, cls=AA)
    def get_data():
        yield from data()

    assert list(get_data()) == [d1, d2]
    assert list(get_data()) == [d1, d2]
class BBv2(NamedTuple):
    # 'schema evolution' of BB: same fields plus an extra zz
    xx: int
    yy: int
    zz: float
def test_schema_change(tmp_path: Path) -> None:
    """
    Should discard cache on schema change (BB to BBv2) in this example
    """
    b = BB(xx=2, yy=3)

    @cachew(cache_path=tmp_path, cls=BB)
    def get_data():
        return [b]

    assert list(get_data()) == [b]

    # TODO make type part of key?
    b2 = BBv2(xx=3, yy=4, zz=5.0)

    @cachew(cache_path=tmp_path, cls=BBv2)
    def get_data_v2():
        return [b2]

    assert list(get_data_v2()) == [b2]
def test_transaction(tmp_path: Path) -> None:
    """
    Should keep old cache and not leave it in some broken state in case of errors
    """
    # logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

    class TestError(Exception):
        pass

    # chunk_by=1 so partial data is flushed before the error fires
    @cachew(cache_path=tmp_path, cls=BB, chunk_by=1)
    def get_data(version: int):
        for i in range(3):
            yield BB(xx=2, yy=i)
            if version == 2:
                raise TestError

    exp = [BB(xx=2, yy=0), BB(xx=2, yy=1), BB(xx=2, yy=2)]
    assert list(get_data(1)) == exp
    assert list(get_data(1)) == exp

    # TODO test that hash is unchanged?
    with pytest.raises(TestError):
        list(get_data(2))

    # the failed run must not have clobbered the previously cached data
    assert list(get_data(1)) == exp
class Job(NamedTuple):
    company: str
    title: str | None  # optional on purpose, to exercise Optional support
def test_optional(tmp_path: Path) -> None:
    """
    Tests support for typing.Optional
    """

    @cachew(tmp_path)
    def data() -> Iterator[Job]:
        # fmt: off
        yield Job('google'      , title='engineed')
        yield Job('selfemployed', title=None)
        # fmt: on

    list(data())  # trigger cachew
    # fmt: off
    assert list(data()) == [
        Job('google'      , title='engineed'),
        Job('selfemployed', title=None),
    ]
    # fmt: on
# TODO add test for optional for misleading type annotation
class Person(NamedTuple):
    # randomly generated record used by make_people_data / test_stats
    name: str
    secondname: str
    age: int
    job: Job | None
def make_people_data(count: int) -> Iterator[Person]:
    """Yield *count* pseudo-random Person records; fixed seed (124) keeps output deterministic."""
    rng = Random(124)
    alphabet = string.ascii_uppercase + string.ascii_lowercase

    # NOTE: RNG call order matters for determinism, so keep it identical across refactors
    def rand_str(length: int) -> str:
        return ''.join(rng.choices(alphabet, k=length))

    for _ in range(count):
        job: Job | None = None
        if rng.choice([True, False]):
            job = Job(company=rand_str(12), title=rand_str(8))
        yield Person(
            name=rand_str(5),
            secondname=rand_str(10),
            age=rng.randint(20, 60),
            job=job,
        )
def test_stats(tmp_path: Path) -> None:
    """Prints estimated vs actual cache db size for a largish number of entries."""
    cache_file = tmp_path / 'cache'
    # rough per-entry size estimate; 4-byte overheads + string lengths
    # (renamed from `one`, which shadowed more_itertools.one imported at module level)
    entry_size = (4 + 5) + (4 + 10) + 4 + (4 + 12 + 4 + 8)
    N = 10000

    @cachew(cache_path=cache_file, cls=Person)
    def get_people_data() -> Iterator[Person]:
        yield from make_people_data(count=N)

    list(get_people_data())
    print(
        f"Cache db size for {N} entries: estimated size {entry_size * N // 1024} Kb, actual size {cache_file.stat().st_size // 1024} Kb;"
    )
@dataclass
class Test:
    # minimal dataclass payload for test_dataclass
    field: int
def test_dataclass(tmp_path: Path) -> None:
    """Plain dataclasses should be cacheable just like namedtuples."""

    @cachew(tmp_path)
    def get_dataclasses() -> Iterator[Test]:
        yield from [Test(field=i) for i in range(5)]

    assert list(get_dataclasses()) == [Test(field=i) for i in range(5)]
    assert list(get_dataclasses()) == [Test(field=i) for i in range(5)]
def test_inner_class(tmp_path: Path) -> None:
    """Dataclasses defined inside a function should still be usable as cache types."""
    # NOTE: this doesn't work at the moment if from __future__ import annotations is used in client code (e.g. on top of this test)
    # see test_future_annotations for more info
    @dataclass
    class InnerDataclass:
        field: int

    @cachew(tmp_path)
    def fun() -> Iterator[InnerDataclass]:
        yield from []

    # should manage to infer type and not crash at least
    list(fun())
    list(fun())
@dataclass
class Dates:
    # mix of tz-aware and naive datetimes, exercised by test_dates
    d1: datetime
    d2: datetime
    d3: datetime
    d4: datetime
    d5: datetime
def test_dates(tmp_path: Path) -> None:
    """tzinfo (zoneinfo/naive/UTC) should survive the cache round-trip, not just the offsets."""
    from zoneinfo import ZoneInfo

    tz = ZoneInfo('Europe/London')
    dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S')
    dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S')
    x = Dates(
        d1=dwinter.replace(tzinfo=tz),
        d2=dsummer.replace(tzinfo=tz),
        d3=dwinter,
        d4=dsummer,
        d5=dsummer.replace(tzinfo=UTC),
    )

    @cachew(tmp_path)
    def fun() -> Iterable[Dates]:
        yield x

    assert one(fun()) == x
    assert one(fun()) == x

    # make sure the actuall tzinfo is preserved... otherwise we might end up with raw offsets and lose some info
    r = one(fun())
    assert str(r.d1.tzinfo) == str(x.d1.tzinfo)
    assert str(r.d2.tzinfo) == str(x.d2.tzinfo)
    assert r.d3.tzname() is None
    assert r.d4.tzname() is None
    assert r.d5.tzinfo is UTC
# fmt: off
@dataclass
class AllTypes:
    # one field per supported primitive/compound serialization type
    a_str   : str
    an_int  : int
    a_float : float
    a_bool  : bool
    a_dt    : datetime
    a_date  : date
    a_dict  : dict[str, Any]
    a_list  : list[Any]
    a_tuple : tuple[float, str]
    an_exc  : Exception
    an_opt  : str | None
# fmt: on
# TODO support vararg tuples?
def test_types(tmp_path: Path) -> None:
    """Round-trips an object exercising every supported field type at once."""
    import pytz

    tz = pytz.timezone('Europe/Berlin')
    # fmt: off
    obj = AllTypes(
        a_str   = 'abac',
        an_int  = 1123,
        a_float = 3.131,
        a_bool  = True,
        a_dt    = datetime.now(tz=tz),
        a_date  = datetime.now().replace(year=2000).date(),
        a_dict  = {'a': True, 'x': {'whatever': 3.14}},
        a_list  = ['aba', 123, None],
        a_tuple = (1.23, '3.2.1'),
        an_exc  = RuntimeError('error!', 123),
        an_opt  = 'hello',
    )
    # fmt: on

    @cachew(tmp_path)
    def get() -> Iterator[AllTypes]:
        yield obj

    def helper(t: AllTypes):
        # Exceptions can't be directly compared.. so this kinda helps
        d = asdict(t)
        d['an_exc'] = d['an_exc'].args
        return d

    assert helper(one(get())) == helper(obj)
    assert helper(one(get())) == helper(obj)
# TODO if I do perf tests, look at this https://docs.sqlalchemy.org/en/13/_modules/examples/performance/large_resultsets.html
# TODO should be possible to iterate anonymous tuples too? or just sequences of primitive types?
def test_primitive(tmp_path: Path) -> None:
    """Bare primitive (str) values should be cacheable without a wrapper type."""

    @cachew(tmp_path)
    def fun() -> Iterator[str]:
        yield 'aba'
        yield 'caba'

    assert list(fun()) == ['aba', 'caba']
    assert list(fun()) == ['aba', 'caba']
def test_single_value(tmp_path: Path) -> None:
    """Non-iterator return values are cached as a single item (including Optional results)."""

    @cachew(tmp_path)
    def fun_int() -> int:
        return 123

    assert fun_int() == 123
    assert fun_int() == 123

    # explicit ('single', type) spec instead of a return annotation
    @cachew(tmp_path, cls=('single', str))
    def fun_str():
        return 'whatever'

    assert fun_str() == 'whatever'
    assert fun_str() == 'whatever'

    @cachew(tmp_path)
    def fun_opt_namedtuple(none: bool) -> UUU | None:  # noqa: FBT001
        if none:
            return None
        else:
            return UUU(xx=1, yy=2)

    assert fun_opt_namedtuple(none=False) == UUU(xx=1, yy=2)
    assert fun_opt_namedtuple(none=False) == UUU(xx=1, yy=2)
    assert fun_opt_namedtuple(none=True) is None
    assert fun_opt_namedtuple(none=True) is None
class O(NamedTuple):
    # minimal single-field namedtuple used as a payload in the tests below
    x: int
class _HackHash:
def __init__(self, x: int) -> None:
self.x = x
def __repr__(self):
return repr(self.x)
def test_default_arguments(tmp_path: Path) -> None:
    """
    Checks how default argument values participate in cache key computation / invalidation.
    """
    hh = _HackHash(1)
    calls = 0

    def orig(a: int, param: _HackHash = hh) -> Iterator[O]:
        yield O(hh.x)
        nonlocal calls
        calls += 1

    def depends_on(a: int, param: _HackHash) -> str:
        # hmm. in principle this should be str according to typing
        # on practice though we always convert hash to str, so maybe type should be changed to Any?
        return (a, param.x)  # type: ignore[return-value]

    # explicit depends_on: hash includes param.x
    fun = cachew(tmp_path, depends_on=depends_on)(orig)

    list(fun(123))
    assert list(fun(123)) == [O(1)]
    assert calls == 1

    # now, change hash. That should cause the composite hash to invalidate and recompute
    hh.x = 2
    assert list(fun(123)) == [O(2)]
    assert calls == 2

    # should be ok with explicitly passing
    assert list(fun(123, param=_HackHash(2))) == [O(2)]
    assert calls == 2

    # we don't have to handle the default param in the default hash key
    fun = cachew(tmp_path)(fun)
    assert list(fun(456)) == [O(2)]
    assert calls == 3
    assert list(fun(456)) == [O(2)]
    assert calls == 3

    # changing the default should trigger the default (i.e. kwargs) key function to invalidate the cache
    hh.x = 3
    assert list(fun(456)) == [O(3)]
    assert calls == 4

    # you don't have to pass the default parameter explicitly
    fun = cachew(tmp_path, depends_on=lambda a: a)(orig)
    assert list(fun(456)) == [O(3)]
    assert calls == 5

    # but watch out if you forget to handle it!
    hh.x = 4
    assert list(fun(456)) == [O(3)]
    assert calls == 5
class U(NamedTuple):
    # field mixes a primitive with a namedtuple -- exercises union serialization
    x: str | O
def test_union(tmp_path: Path) -> None:
    """Union-typed namedtuple fields should survive a cache round-trip."""

    @cachew(tmp_path)
    def fun() -> Iterator[U]:
        yield from [U('hi'), U(O(123))]

    list(fun())  # warm up the cache
    assert list(fun()) == [U('hi'), U(O(123))]
# NOTE: empty dataclass doesn't have __annotations__ ??? not sure if need to handle it...
@dataclass
class DD:
    # minimal one-field dataclass used in union round-trip tests
    x: int
def test_union_with_dataclass(tmp_path: Path) -> None:
    """A top-level union mixing a primitive with a dataclass should round-trip."""

    @cachew(tmp_path)
    def fun() -> Iterator[int | DD]:
        yield 123
        yield DD(456)

    expected = [123, DD(456)]
    assert list(fun()) == expected
# ugh. we need to pass backend here explicitly since it might not get picked up from the fixture
# that sets it in settings. due to multiprocess stuff
def _concurrent_helper(cache_path: Path, count: int, backend: Backend, sleep_s=0.1):
    # worker used by the concurrency tests below
    # (submitted to ProcessPoolExecutor, so it needs to be a picklable top-level function)
    @cachew(cache_path, backend=backend)
    def test(count: int) -> Iterator[int]:
        for i in range(count):
            print(f"{count}: GENERATING {i}")
            sleep(sleep_s)  # slow down generation so concurrent runs overlap
            yield i * i

    return list(test(count=count))
@pytest.fixture
def fuzz_cachew_impl():
    """
    Insert random sleeps in cachew_impl to increase likelihood of concurrency issues
    """
    from .. import cachew_wrapper

    # patchy applies this unified diff to cachew_wrapper's source at runtime,
    # injecting a random sleep right before the hash comparison
    # NOTE(review): diff indentation reconstructed -- confirm context lines match cachew_wrapper
    patch = '''\
@@ -189,6 +189,11 @@
     old_hash = backend.get_old_hash()
     logger.debug(f'old hash: {old_hash}')

+    from random import random
+    rs = random() * 2
+    print("sleeping for: ", rs)
+    from time import sleep; sleep(rs)
+
     if new_hash == old_hash:
         logger.debug('hash matched: loading from cache')
         yield from cached_items()
'''
    patchy.patch(cachew_wrapper, patch)
    yield
    # revert the patch so other tests see the unmodified implementation
    patchy.unpatch(cachew_wrapper, patch)
# TODO fuzz when they start so they enter transaction at different times?
# TODO how to run it enough times on CI and increase likelihood of failing?
# for now, stress testing manually:
# while PYTHONPATH=src pytest -s cachew -k concurrent_writes ; do sleep 0.5; done
@pytest.mark.xfail(condition=platform.system() == 'Darwin', reason='seems like file writes might not be atomic on osx?')
def test_concurrent_writes(tmp_path: Path, fuzz_cachew_impl) -> None:
    """Several processes writing through the same cache file shouldn't corrupt results."""
    cache_path = tmp_path / 'cache.sqlite'

    # warm up to create the database
    # FIXME ok, that will be fixed separately with atomic move I suppose
    _concurrent_helper(cache_path, 1, settings.DEFAULT_BACKEND)

    processes = 5
    with ProcessPoolExecutor() as pool:
        futures = [pool.submit(_concurrent_helper, cache_path, n, settings.DEFAULT_BACKEND) for n in range(processes)]
        for count, fut in enumerate(futures):
            assert fut.result() == [i * i for i in range(count)]
# TODO ugh. need to keep two processes around to test for yield holding transaction lock
def test_concurrent_reads(tmp_path: Path, fuzz_cachew_impl):
    """Concurrent readers of a warm cache should be served without recomputation."""
    cache_path = tmp_path / 'cache.sqlite'
    count = 10

    # warm up
    _concurrent_helper(cache_path, count, settings.DEFAULT_BACKEND, sleep_s=0)

    processes = 4
    start = time.time()
    with ProcessPoolExecutor() as pool:
        futures = [
            pool.submit(_concurrent_helper, cache_path, count, settings.DEFAULT_BACKEND, 1) for _ in range(processes)
        ]
        for fut in futures:
            print(fut.result())
    taken = time.time() - start

    # should be pretty instantaneous
    # if it takes more, most likely means that helper was called again
    assert taken < 5
def test_mcachew(tmp_path: Path):
    """Basic caching should still work through the defensive mcachew wrapper."""
    # TODO how to test for defensive behaviour?
    from cachew.extra import mcachew

    # TODO check throw on error
    @mcachew(cache_path=tmp_path / 'cache')
    def func() -> Iterator[str]:
        yield from ('one', 'two')

    expected = ['one', 'two']
    assert list(func()) == expected  # cold
    assert list(func()) == expected  # cached
def test_defensive(restore_settings) -> None:
    '''
    Make sure that cachew doesn't crash on misconfiguration
    '''

    def orig() -> Iterator[int]:
        yield 123

    def orig2():
        yield "x"
        yield 123

    # unknown kwarg: cachew should fall back to calling the function directly
    fun = cachew(bad_arg=123)(orig)  # type: ignore[call-overload]
    assert list(fun()) == [123]
    assert list(fun()) == [123]

    for throw in [True, False]:
        # with THROW_ON_ERROR misconfigurations should raise, otherwise they're defensive
        ctx = pytest.raises(Exception) if throw else nullcontext()
        settings.THROW_ON_ERROR = throw
        with ctx:
            # cache_path provider that raises when invoked
            fun = cachew(cache_path=lambda: 1 + 'bad_path_provider')(orig)  # type: ignore[arg-type,misc,operator]
            assert list(fun()) == [123]
            assert list(fun()) == [123]

            # cache_path provider receiving unexpected arguments
            fun = cachew(cache_path=lambda p: '/tmp/' + str(p))(orig)
            assert list(fun()) == [123]
            assert list(fun()) == [123]

            # decorating a function without a usable return type annotation
            fun = cachew(orig2)
            assert list(fun()) == ['x', 123]
            assert list(fun()) == ['x', 123]

            # unusable default cache directory
            settings.DEFAULT_CACHEW_DIR = '/dev/nonexistent'
            fun = cachew(orig)
            assert list(fun()) == [123]
            assert list(fun()) == [123]
@pytest.mark.parametrize('throw', [False, True])
def test_bad_annotation(*, tmp_path: Path, throw: bool) -> None:
    """
    this will work in runtime without cachew if from __future__ import annotations is used
    so should work with cachew decorator as well
    """
    src = tmp_path / 'src.py'
    # BadType is deliberately undefined; with postponed annotations it's never evaluated at runtime
    src.write_text(
        f'''
from __future__ import annotations
from cachew import settings, cachew

settings.THROW_ON_ERROR = {throw}

@cachew
def fun() -> BadType:
    print("called!")
    return 0

fun()
'''.lstrip()
    )
    # with THROW_ON_ERROR the unresolvable annotation should surface as an error in the subprocess
    ctx = pytest.raises(Exception) if throw else nullcontext()
    with ctx:
        assert check_output([sys.executable, src], text=True).strip() == "called!"
def test_recursive_simple(tmp_path: Path) -> None:
    # the cached function calls itself recursively;
    # d0/d1 record stack depths at the two deepest levels to check stack usage stays bounded
    d0 = 0
    d1 = 1000
    calls = 0

    @cachew(tmp_path)
    def factorials(n: int) -> Iterable[int]:
        nonlocal calls, d0, d1
        calls += 1
        if n == 0:
            d0 = len(inspect.stack(0))
        if n == 1:
            d1 = len(inspect.stack(0))
        if n == 0:
            yield 1
            return
        prev = factorials(n - 1)
        last = 1
        # TODO potentially quadratic? measure perf perhaps?
        for x in prev:
            yield x
            last = x
        yield last * n

    assert calls == 0
    assert list(factorials(3)) == [1, 1, 2, 6]
    # make sure the recursion isn't eating too much stack
    # ideally would have 1? not sure if possible without some insane hacking?
    # todo maybe check stack frame size as well?
    assert abs(d0 - d1) <= 2
    assert calls == 4
    assert list(factorials(3)) == [1, 1, 2, 6]
    assert calls == 4
    assert list(factorials(5)) == [1, 1, 2, 6, 24, 120]
    assert calls == 6
    assert list(factorials(3)) == [1, 1, 2, 6]
    assert calls == 10
def test_recursive_deep(tmp_path: Path) -> None:
    # deep recursion through the cachew wrapper shouldn't exhaust the stack
    @cachew(tmp_path)
    def numbers(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
            return
        yield from numbers(n - 1)
        yield n

    @cachew(cache_path=None)
    def numbers_cache_disabled(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
            return
        yield from numbers(n - 1)
        yield n

    rlimit = sys.getrecursionlimit()
    # NOTE in reality it has to do with the number of file descriptors (ulimit -Sn, e.g. 1024?)
    # but it seems that during the error unrolling, pytest or something else actually hits the recursion limit somehow
    # pytest ends up with an internal error in such case... which is good enough as long as tests are concerned I guess.
    sys.setrecursionlimit(2 * 800 + 100)
    try:
        # at the moment each recursive call takes two frames (one for the original call, one for cachew_wrapper)
        # + allow 100 calls for random constant overhead like pytest etc
        list(numbers(800))
        list(numbers(800))
        list(numbers_cache_disabled(800))
        list(numbers_cache_disabled(800))
    finally:
        sys.setrecursionlimit(rlimit)
def test_recursive_error(tmp_path: Path) -> None:
    """A RecursionError mid-iteration shouldn't leave the cache unusable."""

    @cachew(tmp_path)
    def rec(n: int) -> Iterable[int]:
        if n == 0:
            yield 0
        else:
            yield from rec(n - 1)
            yield n

    rlimit = sys.getrecursionlimit()
    try:
        # force the recursion to blow up partway through
        sys.setrecursionlimit(50)
        list(rec(100))
        raise AssertionError('Expecting recursion error')
    except RecursionError:
        pass
    finally:
        sys.setrecursionlimit(rlimit)

    # todo not sure if cache file should exist??
    # either way, at least check that the db is not completely messed up
    results = list(rec(100))
    assert len(results) == 101
def test_exceptions(tmp_path: Path) -> None:
    """Exception values are cached via their args; the concrete type is not preserved."""

    class X(NamedTuple):
        a: int

    dt = datetime.strptime('20200102 03:04:05', '%Y%m%d %H:%M:%S')

    @cachew(tmp_path)
    def fun() -> Iterator[Exception]:
        yield RuntimeError('whatever', 123, dt, X(a=123))

    list(fun())  # warm up the cache
    [e] = fun()
    # not sure if there is anything that can be done to preserve type information?
    assert type(e) is Exception
    assert e.args == ('whatever', 123, '2020-01-02T03:04:05', 'X(a=123)')
# see https://beepb00p.xyz/mypy-error-handling.html#kiss
def test_result(tmp_path: Path) -> None:
    """Mixing Exception values with regular results ('result' pattern) should round-trip."""

    @cachew(tmp_path)
    def fun() -> Iterator[Exception | int]:
        yield 1
        yield RuntimeError("sad!")
        yield 123

    list(fun())  # warm up the cache
    [v1, ve, v123] = fun()
    assert (v1, v123) == (1, 123)
    assert isinstance(ve, Exception)
    assert ve.args == ('sad!',)
def test_version_change(tmp_path: Path) -> None:
    # bumping CACHEW_VERSION should invalidate previously written caches
    calls = 0

    @cachew(tmp_path, logger=logger)
    def fun() -> Iterator[str]:
        nonlocal calls
        calls += 1
        yield from ['a', 'b', 'c']

    list(fun())
    list(fun())
    assert calls == 1

    # todo ugh. not sure how to do this as a relative import??
    import cachew as cachew_module

    old_version = cachew_module.CACHEW_VERSION
    try:
        cachew_module.CACHEW_VERSION = old_version + '_whatever'
        # should invalidate cachew now
        list(fun())
        assert calls == 2
        list(fun())
        assert calls == 2
    finally:
        # restore even if asserts above fail, so other tests aren't affected
        cachew_module.CACHEW_VERSION = old_version

    # and now again, back to the old version
    list(fun())
    assert calls == 3
    list(fun())
    assert calls == 3
def dump_old_cache(tmp_path: Path) -> None:
    """Not a test: prints an SQL dump of a freshly written cache (for version upgrade tests)."""
    oc = tmp_path / 'old_cache.sqlite'

    @cachew(oc)
    def fun() -> Iterator[int]:
        yield 1
        yield 2
        yield 3

    list(fun())  # populate the database
    assert oc.exists(), oc

    dump = check_output(['sqlite3', oc, '.dump']).decode('utf8')
    print(dump, file=sys.stderr)
def test_old_cache_v0_6_3(tmp_path: Path) -> None:
    # regression check: a database created by cachew 0.6.3 shouldn't crash a modern version
    if settings.DEFAULT_BACKEND != 'sqlite':
        pytest.skip('this test only makes sense for sqlite backend')

    # dump obtained via dump_old_cache above, run against cachew 0.6.3
    sql = '''
PRAGMA foreign_keys=OFF;
BEGIN TRANSACTION;
CREATE TABLE hash (
value VARCHAR
);
INSERT INTO hash VALUES('cachew: 1, schema: {''_'': }, hash: ()');
CREATE TABLE IF NOT EXISTS "table" (
_cachew_primitive INTEGER
);
INSERT INTO "table" VALUES(1);
INSERT INTO "table" VALUES(2);
INSERT INTO "table" VALUES(3);
COMMIT;
'''
    db = tmp_path / 'cache.sqlite'
    check_call(['sqlite3', db, sql])

    @cachew(db)
    def fun() -> Iterator[int]:
        yield from [1, 2, 3]

    # this tests that it doesn't crash
    # for actual version upgrade test see test_version_change
    assert list(fun()) == [1, 2, 3]
def test_disabled(tmp_path: Path) -> None:
    """disabled_cachew() should bypass the cache entirely within its scope."""
    calls = 0

    @cachew(tmp_path)
    def fun() -> Iterator[int]:
        yield 1
        yield 2
        nonlocal calls
        calls += 1

    expected = [1, 2]
    assert list(fun()) == expected
    assert list(fun()) == expected
    assert calls == 1  # second call came from the cache

    from cachew.extra import disabled_cachew

    with disabled_cachew():
        # caching bypassed -> every call recomputes
        assert list(fun()) == expected
        assert calls == 2
        assert list(fun()) == expected
        assert calls == 3
def test_early_exit_simple(tmp_path: Path) -> None:
    """If the consumer stops early, the partially-consumed iterator must not be cached."""
    # cachew works on iterators and we'd prefer not to cache if the iterator hasn't been exhausted
    calls_f = 0

    @cachew(tmp_path)
    def f() -> Iterator[int]:
        yield from range(20)
        nonlocal calls_f
        calls_f += 1

    calls_g = 0

    @cachew(tmp_path)
    def g() -> Iterator[int]:
        yield from f()
        nonlocal calls_g
        calls_g += 1

    # only consume 10/20 items
    assert len(list(islice(g(), 0, 10))) == 10

    # precondition: neither generator ran to completion
    assert (calls_f, calls_g) == (0, 0)

    # todo not sure if need to check that db is empty?
    assert len(list(g())) == 20
    assert (calls_f, calls_g) == (1, 1)

    # should be cached now
    assert len(list(g())) == 20
    assert (calls_f, calls_g) == (1, 1)
# see https://github.com/sqlalchemy/sqlalchemy/issues/5522#issuecomment-705156746
def test_early_exit_shutdown(tmp_path: Path) -> None:
    # abandoning a generator mid-iteration and then shutting down the interpreter
    # shouldn't produce a traceback (sqlalchemy teardown issue, see link above)
    # don't ask... otherwise the exception doesn't appear :shrug:
    import_hack = '''
from sqlalchemy import Column

import re
re.hack = lambda: None
'''
    Path(tmp_path / 'import_hack.py').write_text(import_hack)

    # NOTE(review): indentation inside the generated program reconstructed -- confirm it matches upstream
    prog = f'''
import sys
sys.path.insert(0, '')

import import_hack

import cachew
cachew.settings.THROW_ON_ERROR = True # todo check with both?

@cachew.cachew('{tmp_path}', cls=int)
def fun():
    yield 0

g = fun()
e = next(g)
print("FINISHED")
'''
    # the subprocess must terminate cleanly: no traceback during interpreter shutdown
    r = run([sys.executable, '-c', prog], cwd=tmp_path, capture_output=True, check=True)
    assert r.stdout.strip() == b'FINISHED'
    assert b'Traceback' not in r.stderr
# tests both modes side by side to demonstrate the difference
# FIX: parametrize over real booleans. The previous values were the *strings*
# 'False'/'True', which are both truthy, so use_synthetic was effectively always
# True and the non-synthetic branch below never actually ran non-synthetically.
# (every other parametrize in this file uses [False, True].)
@pytest.mark.parametrize('use_synthetic', [False, True])
def test_synthetic_keyset(*, tmp_path: Path, use_synthetic: bool) -> None:
    """Tests synthetic_key mode: only keys absent from the cache should be recomputed."""
    # just to keep track of which data we had to compute from scratch
    _recomputed: list[str] = []

    # assume key i is responsible for numbers i and i-1
    # in reality this could be some slow function we'd like to avoid calling if its results is already cached
    # e.g. the key would typically be a filename (e.g. isoformat timestamp)
    # and the returned values could be the results of an export over the month prior to the timestamp, or something like that
    # see https://beepb00p.xyz/exports.html#synthetic for more on the motivation
    def compute(key: str) -> Iterator[str]:
        _recomputed.append(key)
        n = int(key)
        yield str(n - 1)
        yield str(n)

    # fmt: off
    # should result in 01 + 12 + 45 == 01245
    keys125 = ['1', '2', '5' ]
    # should result in 01 + 12 + 45 + 56 + 67 == 0124567
    keys12567 = ['1', '2', '5', '6', '7' ]
    # should result in 01 + 12 + 45 + 56 + 78 + 89 == 012456789
    keys125689 = ['1', '2', '5', '6', '8', '9']
    # should result in 45 + 56 + 78 + 89 == 456789
    keys5689 = [ '5', '6', '8', '9']
    # fmt: on

    def recomputed() -> list[str]:
        # returns (and resets) the keys recomputed since the last check
        r = list(_recomputed)
        _recomputed.clear()
        return r

    ## 'cachew_cached' will just be [] if synthetic key is not used, so no impact on data
    @cachew(tmp_path, synthetic_key=('keys' if use_synthetic else None))
    def fun_aux(keys: Sequence[str], *, cachew_cached: Iterable[str] = []) -> Iterator[str]:
        yield from unique_everseen(
            chain(
                cachew_cached,
                *(compute(key) for key in keys),
            )
        )

    def fun(keys: Sequence[str]) -> set[str]:
        return set(fun_aux(keys=keys))
    ##

    # preserve formatting of string arguments, it makes it easier to read the test
    # fmt: off
    assert fun(keys125) == set('01' '12' '45')
    assert recomputed() == keys125
    assert fun(keys125) == set('01' '12' '45')
    assert recomputed() == []  # should be cached
    assert fun(keys12567) == set('01' '12' '45' '56' '67')
    if use_synthetic:
        # 1, 2 and 5 should be already cached from the previous call
        assert recomputed() == ['6', '7']
    else:
        # but without synthetic key this would cause everything to recompute
        assert recomputed() == keys12567
    assert fun(keys12567) == set('01' '12' '45' '56' '67')
    assert recomputed() == []  # should be cached
    assert fun(keys125689) == set('01' '12' '45' '56' '78' '89')
    if use_synthetic:
        # similarly, 1 2 5 6 7 are cached from the previous call
        assert recomputed() == ['8', '9']
    else:
        # and we need to call against all keys otherwise
        assert recomputed() == keys125689
    assert fun(keys125689) == set('01' '12' '45' '56' '78' '89')
    assert recomputed() == []  # should be cached
    assert fun(keys5689) == set('45' '56' '78' '89')
    # now the prefix has changed, so if we returned cached items it might return too much
    # so have to recompute everything
    assert recomputed() == keys5689
    assert fun(keys5689) == set('45' '56' '78' '89')
    assert recomputed() == []  # should be cached
    # fmt: on

    # TODO maybe call combined function? so it could return total result and last cached?
    # TODO another option is:
    #  the function yields all cached stuff first
    #  then the user yields stuff from new
    #  and then external function does merging
    # TODO test with kwargs hash?...
    # TODO try without and with simultaneously?
    # TODO check what happens when errors happen?
    # FIXME check what happens if we switch between modes? (synthetic/non-synthetic)
    # FIXME make sure this thing works if len(keys) > chunk size?
    # TODO check what happens when we forget to set 'cachew_cached' argument
    # TODO check what happens when keys are not str but e.g. Path
def test_db_path_matches_fun_name(tmp_path: Path) -> None:
    """Cache files created under a directory should be named after the decorated function."""

    @cachew(tmp_path)
    def fun_single() -> int:
        return 123

    @cachew(tmp_path)
    def fun_multiple() -> Iterable[int]:
        return [123]

    # write to cache
    fun_single()
    list(fun_multiple())

    for fn in (fun_single, fun_multiple):
        assert (tmp_path / callable_name(fn)).exists()
def test_type_alias_type_1(tmp_path: Path) -> None:
type Int = int
@cachew(tmp_path)
def fun() -> Iterator[Int]:
yield 123
assert list(fun()) == [123]
assert list(fun()) == [123]
def test_type_alias_type_2(tmp_path: Path) -> None:
type IteratorInt = Iterator[int]
@cachew(tmp_path)
def fun() -> IteratorInt:
yield 123
assert list(fun()) == [123]
assert list(fun()) == [123]
def test_type_alias_generic(tmp_path: Path) -> None:
type Res[T] = T | Exception
type IntRes = Res[int]
@cachew(tmp_path)
def fun() -> Iterator[IntRes]:
yield 123
assert list(fun()) == [123]
assert list(fun()) == [123]
================================================
FILE: src/cachew/tests/test_future_annotations.py
================================================
from __future__ import annotations
import os
import sys
import textwrap
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path
from subprocess import check_output
from typing import Any
import pytest
from more_itertools import one
from .. import cachew
type _Str = str # deliberate, to test 3.12 'type ... = ...' type definitions
# fmt: off
@dataclass
class NewStyleTypes1:
    """Fields using new-style builtin generics (and a 'type' alias) under postponed annotations."""
    a_str : str
    a_dict : dict[str, Any]
    a_list : list[Any]
    a_tuple : tuple[float, _Str]
# fmt: on
def test_types1(tmp_path: Path) -> None:
    """New-style builtin generic annotations should round-trip through the cache."""
    obj = NewStyleTypes1(
        a_str='abac',
        a_dict={'a': True, 'x': {'whatever': 3.14}},
        a_list=['aba', 123, None],
        a_tuple=(1.23, '3.2.1'),
    )

    @cachew(tmp_path)
    def get() -> Iterator[NewStyleTypes1]:
        yield obj

    for _ in range(2):  # second call comes from cache
        assert one(get()) == obj
# fmt: off
@dataclass
class NewStyleTypes2:
    """Fields using new-style union syntax (and a 'type' alias) under postponed annotations."""
    an_opt : str | None
    a_union : _Str | int
# fmt: on
def test_types2(tmp_path: Path) -> None:
    """New-style union annotations should round-trip through the cache."""
    obj = NewStyleTypes2(
        an_opt='hello',
        a_union=999,
    )

    @cachew(tmp_path)
    def get() -> Iterator[NewStyleTypes2]:
        yield obj

    for _ in range(2):  # second call comes from cache
        assert one(get()) == obj
@pytest.mark.parametrize('use_future_annotations', [False, True])
@pytest.mark.parametrize('local', [False, True])
@pytest.mark.parametrize('throw', [False, True])
def test_future_annotations(
    *,
    use_future_annotations: bool,
    local: bool,
    throw: bool,
    tmp_path: Path,
) -> None:
    """
    Checks handling of postponed evaluation of annotations (from __future__ import annotations)
    """
    # NOTE: to avoid weird interactions with existing interpreter in which pytest is running
    # , we compose a program and running in python directly instead
    # (also not sure if it's even possible to tweak postponed annotations without doing that)
    if use_future_annotations and local and throw:
        # when annotation is local (like inner class), then they end up as strings
        # so we can't eval it as we don't have access to a class defined inside function
        # keeping this test just to keep track of whether this is fixed at some point
        # possibly relevant:
        # - https://peps.python.org/pep-0563/#keeping-the-ability-to-use-function-local-state-when-defining-annotations
        pytest.skip("local aliases/classses don't work with from __future__ import annotations")

    # common setup: a throwaway temp dir used as the cache path inside the generated program
    _PREAMBLE = f'''
from pathlib import Path
import tempfile

from cachew import cachew, settings
settings.THROW_ON_ERROR = {throw}

temp_dir = tempfile.TemporaryDirectory()
td = Path(temp_dir.name)
'''

    # the actual body under test; 'called' is printed on each uncached invocation
    # NOTE(review): indentation inside generated code reconstructed -- confirm against upstream
    _TEST = '''
type Identity[T] = T

I = int
type S = Identity[str]

@cachew(td)
def fun() -> list[I | S]:
    print("called")
    return [1, "2"]

assert list(fun()) == [1, "2"]
assert list(fun()) == [1, "2"]
'''
    if use_future_annotations:
        code = '''
from __future__ import annotations
'''
    else:
        code = ''

    code += _PREAMBLE

    if local:
        # wrap the test body in a function so annotations become function-local
        code += f'''
def test() -> None:
{textwrap.indent(_TEST, prefix=" ")}
test()
'''
    else:
        code += _TEST

    run_py = tmp_path / 'run.py'
    run_py.write_text(code)

    cache_dir = tmp_path / 'cache'
    cache_dir.mkdir()

    res = check_output(
        [sys.executable, run_py],
        env={'TMPDIR': str(cache_dir), **os.environ},
        text=True,
    )
    called = int(res.count('called'))
    if use_future_annotations and local and not throw:
        # cachew fails to set up, so no caching but at least it works otherwise
        assert called == 2
    else:
        assert called == 1
================================================
FILE: src/cachew/tests/test_resolve_type_parameters.py
================================================
from ..utils import resolve_type_parameters
def test_simple_generic_alias() -> None:
    """Plain types and ad-hoc generic aliases should resolve to themselves."""
    # if you define types ad-hoc, they resolve to GenericAlias, not TypeAliasType
    assert resolve_type_parameters(int) == int  # noqa: E721
    assert resolve_type_parameters(list[bool]) == list[bool]
    assert resolve_type_parameters(dict[str, list[float]]) == dict[str, list[float]]
def test_simple_type_keyword() -> None:
type Int = int
assert resolve_type_parameters(Int) == int # noqa: E721
assert resolve_type_parameters(list[Int]) == list[int]
assert resolve_type_parameters(dict[str, list[Int]]) == dict[str, list[int]]
def test_generic_collections() -> None:
    """Aliases over builtin generic collections should be expanded recursively."""
    type ListInt = list[int]
    assert resolve_type_parameters(ListInt) == list[int]
    assert resolve_type_parameters(dict[str, ListInt]) == dict[str, list[int]]

    type TupleInt = tuple[int, bool]
    assert resolve_type_parameters(TupleInt) == tuple[int, bool]

    type TupleIntStr = tuple[TupleInt, str]
    assert resolve_type_parameters(TupleIntStr) == tuple[tuple[int, bool], str]

    type SetStr = set[str]
    assert resolve_type_parameters(SetStr) == set[str]

    type DictAlias[K, V] = dict[K, V]
    assert resolve_type_parameters(DictAlias[str, int]) == dict[str, int]
    assert resolve_type_parameters(DictAlias[int, list[str]]) == dict[int, list[str]]

    type ComplexDict = dict[str, tuple[ListInt, SetStr]]
    assert resolve_type_parameters(ComplexDict) == dict[str, tuple[list[int], set[str]]]
def test_generic_type_keyword() -> None:
    """Generic 'type X[T] = ...' aliases should substitute their type parameters."""
    type Id[T] = T
    type IdInt = Id[int]
    assert resolve_type_parameters(IdInt) == int  # noqa: E721
    assert resolve_type_parameters(list[IdInt]) == list[int]

    # check multiple uses of type params
    type Pair[T] = tuple[T, T]
    type PairInt = Pair[int]
    assert resolve_type_parameters(PairInt) == tuple[int, int]
    assert resolve_type_parameters(Pair[str]) == tuple[str, str]
    assert resolve_type_parameters(list[Pair[int]]) == list[tuple[int, int]]

    # check if type params aren't used
    type NotUsing1[T, V] = int
    type NotUsing2[V, W] = NotUsing1[bool, float]
    type ListInt1 = list[NotUsing2[list, str]]
    assert resolve_type_parameters(ListInt1) == list[int]

    # Test generic alias with alias as parameter
    type Container[T] = list[T]
    type Int = int
    assert resolve_type_parameters(Container[Int]) == list[int]
def test_chaining() -> None:
    """Aliases pointing at other aliases should be followed transitively."""
    type Int = int
    type Int2 = Int
    type Int3 = Int2
    assert resolve_type_parameters(Int3) == int  # noqa: E721

    type ListInt3 = list[Int3]
    assert resolve_type_parameters(ListInt3) == list[int]

    type Box[T] = list[T]
    type DoubleBox[T] = Box[Box[T]]
    type DoubleBoxFloat = DoubleBox[float]
    assert resolve_type_parameters(DoubleBoxFloat) == list[list[float]]
def test_optional_and_union() -> None:
    """Union/Optional members should each be resolved individually."""
    type Int = int
    type MaybeInt = int | None
    assert resolve_type_parameters(MaybeInt) == (int | None)
    assert resolve_type_parameters(list[MaybeInt]) == list[int | None]

    type Str = str  # FIXME extract outside?
    type StrOrInt = Str | Int
    assert resolve_type_parameters(StrOrInt) == (str | int)

    type UnionWithAlias = int | Str
    assert resolve_type_parameters(UnionWithAlias) == (int | str)

    # Test union in generic contexts
    type OptionalList[T] = list[T] | None
    assert resolve_type_parameters(OptionalList[int]) == (list[int] | None)
    assert resolve_type_parameters(OptionalList[str]) == (list[str] | None)

    # Test nested unions with aliases
    # (NB: union equality ignores member order, so int|str == str|int here)
    type Bool = bool
    type StrOrIntOrBool = StrOrInt | Bool
    assert resolve_type_parameters(StrOrIntOrBool) == (int | str | bool)

    # Test union with complex aliased types
    type ListInt = list[int]
    type DictStrInt = dict[str, int]
    type ComplexUnion = ListInt | DictStrInt | None
    assert resolve_type_parameters(ComplexUnion) == (list[int] | dict[str, int] | None)
def test_old_aliases() -> None:
    """
    Old style typing.* aliases get 'normalised' by typing.get_origin call.
    This shouldn't really be a problem, so just highlighting it here.
    """
    from typing import Dict, List, Optional  # noqa: UP035

    type OptionalInt = Optional[int]  # noqa: UP045
    assert resolve_type_parameters(OptionalInt) == int | None

    type ListInt = List[int]  # noqa: UP006
    assert resolve_type_parameters(ListInt) == list[int]

    type DictIntStr = Dict[int, str]  # noqa: UP006
    assert resolve_type_parameters(DictIntStr) == dict[int, str]
def test_old_union() -> None:
from typing import Union
type IntUnion[T] = Union[int, T, bool] # noqa: UP007
assert resolve_type_parameters(IntUnion[str]) == (int | str | bool)
def test_typevar() -> None:
from typing import TypeVar
X = TypeVar('X')
ListX = list[X]
type ListInt = ListX[int]
assert resolve_type_parameters(ListInt) == list[int]
SetX = set[X]
SetFloat = SetX[float]
assert resolve_type_parameters(SetFloat) == set[float]
def test_misc() -> None:
    """
    Miscellaneous more complex tests, mostly around unions involving aliases.
    """
    # Test union inside list/dict
    type MaybeStr = str | None
    assert resolve_type_parameters(list[MaybeStr]) == list[str | None]
    assert resolve_type_parameters(dict[str, MaybeStr]) == dict[str, str | None]

    # Test union with nested generic aliases
    type Container[T] = list[T]
    type OptionalContainer[T] = Container[T] | None
    assert resolve_type_parameters(OptionalContainer[int]) == (list[int] | None)

    # Test union with multiple aliased generics
    type ListAlias[T] = list[T]
    type SetAlias[T] = set[T]
    type CollectionUnion[T] = ListAlias[T] | SetAlias[T]
    assert resolve_type_parameters(CollectionUnion[str]) == (list[str] | set[str])

    # Test union in tuple
    type IntOrStr = int | str
    assert resolve_type_parameters(tuple[IntOrStr, bool]) == tuple[int | str, bool]

    # Test deeply nested union with aliases
    type Middle = list[IntOrStr]
    type Outer = Middle | None
    assert resolve_type_parameters(Outer) == (list[int | str] | None)

    # Test union with chained aliases
    type Level1 = int
    type Level2 = Level1
    type Level3 = Level2
    type UnionChained = Level3 | str | None
    assert resolve_type_parameters(UnionChained) == (int | str | None)

    # Test union with generic that resolves to union
    type MaybeList[T] = list[T] | None
    type NestedMaybe = MaybeList[int | str]
    assert resolve_type_parameters(NestedMaybe) == (list[int | str] | None)

    # Test union with aliased union
    type NumberOrStr = int | float | str
    type ExtendedUnion = NumberOrStr | bool
    assert resolve_type_parameters(ExtendedUnion) == (int | float | str | bool)

    # Test union in dict values and keys
    type FlexibleKey = str | int
    type FlexibleValue = list[int] | dict[str, str] | None
    assert (
        resolve_type_parameters(dict[FlexibleKey, FlexibleValue]) == dict[str | int, list[int] | dict[str, str] | None]
    )

    # Test union with same type repeated (Python may or may not normalize this)
    type RepeatUnion = int | int | str  # noqa: PYI016
    # Python's union implementation may deduplicate, so we accept both
    assert resolve_type_parameters(RepeatUnion) == (int | str) or resolve_type_parameters(RepeatUnion) == (int | int | str)  # fmt: skip

    # Test union with TypeAliasType in multiple positions
    type AliasA = list[int]
    type AliasB = dict[str, int]
    type AliasC = set[str]
    type MultiAliasUnion = AliasA | AliasB | AliasC
    assert resolve_type_parameters(MultiAliasUnion) == (list[int] | dict[str, int] | set[str])

    # Test generic union with substitution
    type Result[T, E] = T | E
    assert resolve_type_parameters(Result[int, str]) == (int | str)
    assert resolve_type_parameters(Result[list[int], dict[str, str]]) == (list[int] | dict[str, str])

    # Test union with None (Optional pattern) in various positions
    type OptionalInt = int | None
    type ListOfOptional = list[OptionalInt]
    assert resolve_type_parameters(ListOfOptional) == list[int | None]

    # Test union with multiple levels of aliased unions
    type UnionA = int | str
    type UnionB = bool | float
    type CombinedUnion = UnionA | UnionB
    assert resolve_type_parameters(CombinedUnion) == (int | str | bool | float)

    # Test union as generic parameter with nested aliases
    type NestedAlias = list[int]
    type UnionParam[T] = dict[str, T | None]
    assert resolve_type_parameters(UnionParam[NestedAlias]) == dict[str, list[int] | None]

    # Test complex scenario: generic alias that returns a union, used in another union
    type ComplexUnion[T] = MaybeList[T] | dict[str, T]
    assert resolve_type_parameters(ComplexUnion[int]) == (list[int] | None | dict[str, int])

    # Test union in tuple with multiple aliased elements
    type AliasInt = int
    type AliasStr = str
    type TupleWithUnions = tuple[AliasInt | None, list[AliasStr | bool]]
    assert resolve_type_parameters(TupleWithUnions) == tuple[int | None, list[str | bool]]

    # Test three-way union with all aliased types
    type TypeA = list[int]
    type TypeB = dict[str, str]
    type TypeC = set[bool]
    type ThreeWayUnion = TypeA | TypeB | TypeC
    assert resolve_type_parameters(ThreeWayUnion) == (list[int] | dict[str, str] | set[bool])

    # Test union where members themselves contain unions
    type InnerUnion1 = int | str
    type InnerUnion2 = bool | float
    type OuterUnion = list[InnerUnion1] | dict[str, InnerUnion2]
    assert resolve_type_parameters(OuterUnion) == (list[int | str] | dict[str, bool | float])

    # Test generic union with nested type aliases in parameters
    type Box[T] = list[T]
    type OptionBox[T] = Box[T] | None
    assert resolve_type_parameters(OptionBox[int | str]) == (list[int | str] | None)

    # Test union with mix of generic and non-generic aliases
    type SimpleAlias = int
    type GenericAlias[T] = list[T]
    type MixedUnion[T] = SimpleAlias | GenericAlias[T]
    assert resolve_type_parameters(MixedUnion[str]) == (int | list[str])

    # Test generic alias that returns the parameter unchanged
    type Same[T] = T
    assert resolve_type_parameters(Same[int]) == int  # noqa: E721
    assert resolve_type_parameters(Same[list[str]]) == list[str]
    assert resolve_type_parameters(Same[Same[int]]) == int  # noqa: E721

    # Test deeply nested generics
    type Deep = dict[str, list[tuple[int, set[str]]]]
    assert resolve_type_parameters(Deep) == dict[str, list[tuple[int, set[str]]]]

    # Test union in complex nested structure
    type Data[T] = dict[str, list[T] | None]
    assert resolve_type_parameters(Data[int | str]) == dict[str, list[int | str] | None]

    # Test alias in tuple with mixed types
    type Mixed = tuple[int, list[str], dict[str, int]]
    assert resolve_type_parameters(Mixed) == tuple[int, list[str], dict[str, int]]
================================================
FILE: src/cachew/tests/utils.py
================================================
import gc
import os
import sys
from contextlib import contextmanager
from pathlib import Path
import pytest
# Directory (sibling of this file) where pyinstrument HTML reports are written.
PROFILES = Path(__file__).absolute().parent / 'profiles'
@contextmanager
def profile(name: str):
    """Context manager profiling the enclosed code with pyinstrument.

    Profiling only kicks in when the PYINSTRUMENT env variable is set;
    otherwise this is a no-op (pyinstrument noticeably slows down the
    profiled code). Results go to stderr and to an HTML report in PROFILES.
    """
    enabled = os.environ.get('PYINSTRUMENT') is not None
    if not enabled:
        yield
        return

    from pyinstrument import Profiler

    prof = Profiler()
    with prof:
        yield

    PROFILES.mkdir(exist_ok=True)
    report = PROFILES / f"{name}.html"
    print("results for " + name, file=sys.stderr)
    prof.print()
    report.write_text(prof.output_html())
def timer(name: str):
    """Return a codetiming.Timer that reports '<name>: X.XXs' on exit."""
    from codetiming import Timer

    fmt = name + ': ' + '{:.2f}s'
    return Timer(name=name, text=fmt)
@pytest.fixture
def gc_control(*, gc_on: bool):
    """Fixture controlling the garbage collector for a test.

    With gc_on=True nothing is changed (GC is enabled by default);
    otherwise GC is switched off for the duration of the test and
    reliably re-enabled afterwards.
    """
    if not gc_on:
        gc.disable()
        try:
            yield
        finally:
            gc.enable()
    else:
        # GC is already on by default -- nothing to tweak
        yield
# True when running under CI (most CI providers set the CI env variable)
running_on_ci = 'CI' in os.environ
================================================
FILE: src/cachew/utils.py
================================================
from collections.abc import Mapping
from types import UnionType
from typing import TypeAliasType, TypeVar, get_args, get_origin
# https://stackoverflow.com/a/2166841/706389
def is_namedtuple(t) -> bool:
    """Heuristically detect whether t is a namedtuple *class*.

    Checks the structural fingerprint that namedtuple classes share:
    tuple as the single base class and a _fields attribute that is a
    tuple of plain strings.
    """
    bases = getattr(t, '__bases__', None)
    if bases is None:
        return False
    if len(bases) != 1:
        return False
    if bases[0] is not tuple:
        return False
    fields = getattr(t, '_fields', None)
    if not isinstance(fields, tuple):
        return False
    # exact str check on purpose: namedtuple field names are plain strings
    return all(type(field) == str for field in fields)  # noqa: E721
def resolve_type_parameters(t) -> type:
    """Rewrite t with all type aliases and bound TypeVars substituted away.

    Public entry point; delegates to the recursive worker starting from an
    empty TypeVar binding environment.
    """
    empty_env: Mapping[TypeVar, type] = {}
    return _resolve_type_parameters_aux(t, typevar_to_type=empty_env)
def _resolve_type_parameters_aux(t, *, typevar_to_type: Mapping[TypeVar, type]) -> type:
if isinstance(t, TypeVar):
return typevar_to_type[t]
# This is the 'left hand side' case, i.e. in type ... =
if isinstance(t, TypeAliasType):
return _resolve_type_parameters_aux(t.__value__, typevar_to_type=typevar_to_type)
# note: args is never none
raw_args = get_args(t)
resolved_args = tuple(_resolve_type_parameters_aux(arg, typevar_to_type=typevar_to_type) for arg in raw_args)
# UnionType: resolve each member of the union
if isinstance(t, UnionType):
# Reconstruct the union with resolved args
result = resolved_args[0]
for arg in resolved_args[1:]:
result = result | arg # type: ignore[assignment]
return result
origin = get_origin(t)
# Must be a non-generic type
if origin is None:
return t
# This is the 'right hand side', e.g. '... = Id[int]' matches this
if isinstance(origin, TypeAliasType):
type_params = origin.__type_params__
new_typevar_to_type: Mapping[TypeVar, type] = {
**typevar_to_type,
**dict(zip(type_params, resolved_args, strict=True)), # type: ignore[arg-type]
}
return _resolve_type_parameters_aux(origin.__value__, typevar_to_type=new_typevar_to_type)
# Just a regular generic type
return origin[resolved_args]
================================================
FILE: tox.ini
================================================
[tox]
minversion = 4
# relies on the correct version of Python installed
# (we rely on CI for the test matrix)
envlist = ruff,tests,mypy,ty
# https://github.com/tox-dev/tox/issues/20#issuecomment-247788333
# hack to prevent .tox from crapping into the project directory
toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox
[testenv]
# TODO how to get package name from setuptools?
package_name = "cachew"
pass_env =
# useful for tests to know they are running under ci
CI
CI_*
# respect user's cache dirs to prevent tox from crapping into project dir
PYTHONPYCACHEPREFIX
MYPY_CACHE_DIR
RUFF_CACHE_DIR
set_env =
# do not add current working directory to pythonpath
# generally this is more robust and safer, prevents weird issues later on
PYTHONSAFEPATH=1
runner = uv-venv-lock-runner
uv_sync_locked = false
[testenv:ruff]
skip_install = true
dependency_groups = testing
commands =
{envpython} -m ruff check \
{posargs}
[testenv:tests]
dependency_groups = testing
commands =
# posargs allow test filtering, e.g. tox ... -- -k test_name
{envpython} -m pytest \
--pyargs {[testenv]package_name} \
{posargs}
[testenv:mypy]
dependency_groups = typecheck
commands =
{envpython} -m mypy --no-install-types \
-p {[testenv]package_name} \
--txt-report .coverage.mypy \
--html-report .coverage.mypy \
# this is for github actions to upload to codecov.io
# sadly xml coverage crashes on windows... so we need to disable it
{env:CI_MYPY_COVERAGE} \
{posargs}
[testenv:ty]
dependency_groups = typecheck
commands =
{envpython} -m ty \
check \
{posargs}
================================================
FILE: ty.toml
================================================
[src]
exclude = [
"doc/test_serialization.py",
]