Showing preview only (855K chars total). Download the full file or copy to clipboard to get everything.
Repository: teaxyz/chai
Branch: main
Commit: 5ba601243a93
Files: 142
Total size: 809.9 KB
Directory structure:
gitextract_5cor20ar/
├── .dockerignore
├── .github/
│ ├── actions/
│ │ └── complain/
│ │ └── action.yml
│ └── workflows/
│ ├── chai-api.ci.yml
│ ├── ci.yml
│ └── deploy.yml
├── .gitignore
├── .python-version
├── LICENSE
├── README.md
├── alembic/
│ ├── .pkgx.yaml
│ ├── Dockerfile
│ ├── README.md
│ ├── alembic.ini
│ ├── env.py
│ ├── init-script.sql
│ ├── load-values.sql
│ ├── run_migrations.sh
│ ├── script.py.mako
│ └── versions/
│ ├── 20241028_1217-base_migration.py
│ ├── 20250312_0045-add_legacy_dependency_table.py
│ ├── 20250312_2244-canons.py
│ ├── 20250416_0223-add_ranks.py
│ ├── 20250422_0940-add_unique_package_to_canon_packages.py
│ ├── 20250508_1752-add_trgm_indexes.py
│ ├── 20250529_2341-rename_canons_table_and_recreate.py
│ └── 20250529_2345-recreate_canon_foreign_keys.py
├── api/
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Cargo.toml
│ ├── Dockerfile
│ ├── README.md
│ └── src/
│ ├── app_state.rs
│ ├── db.rs
│ ├── handlers.rs
│ ├── logging.rs
│ ├── main.rs
│ └── utils.rs
├── core/
│ ├── README.md
│ ├── config.py
│ ├── db.py
│ ├── fetcher.py
│ ├── logger.py
│ ├── models/
│ │ └── __init__.py
│ ├── requirements.txt
│ ├── scheduler.py
│ ├── structs.py
│ ├── test.json
│ ├── transformer.py
│ └── utils.py
├── db/
│ ├── README.md
│ └── queries.md
├── docker-compose.yml
├── examples/
│ ├── sbom-meta/
│ │ ├── README.md
│ │ ├── go.mod
│ │ ├── go.sum
│ │ └── main.go
│ └── visualizer/
│ ├── README.md
│ ├── main.py
│ └── monitor.py
├── package_managers/
│ ├── crates/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── db.py
│ │ ├── diff.py
│ │ ├── main.py
│ │ ├── structs.py
│ │ └── transformer.py
│ ├── debian/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── db.py
│ │ ├── debian_sources.py
│ │ ├── diff.py
│ │ ├── main.py
│ │ ├── parser.py
│ │ ├── scripts/
│ │ │ ├── investigate_sources.py
│ │ │ └── test_investigate_sources.py
│ │ └── structs.py
│ ├── homebrew/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── db.py
│ │ ├── diff.py
│ │ ├── formulae.py
│ │ ├── main.py
│ │ └── structs.py
│ └── pkgx/
│ ├── Dockerfile
│ ├── db.py
│ ├── diff.py
│ ├── loader.py
│ ├── main.py
│ ├── parser.py
│ └── url.py
├── pkgx.yaml
├── pyproject.toml
├── ranker/
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Dockerfile
│ ├── README.md
│ ├── config.py
│ ├── db.py
│ ├── dedupe.py
│ ├── main.py
│ ├── naming.py
│ ├── requirements.txt
│ ├── rx_graph.py
│ └── utils/
│ ├── analyze_ranks.py
│ └── parse_log.py
├── scripts/
│ ├── chai-legacy-loader/
│ │ ├── README.md
│ │ ├── add_package_fields.py
│ │ ├── batch_insert_package_urls.py
│ │ ├── batch_insert_urls.py
│ │ ├── copy_dependencies_no_thread.py
│ │ ├── pkgx.yaml
│ │ └── sql/
│ │ ├── dependencies.sql
│ │ ├── packages.sql
│ │ └── urls.sql
│ ├── npm-singleton/
│ │ ├── README.md
│ │ ├── pkgx.yaml
│ │ └── single.py
│ ├── package_to_package/
│ │ └── package_dependencies.py
│ └── upgrade_canons/
│ ├── .gitignore
│ ├── README.md
│ ├── create_deleted_canons.py
│ ├── db.py
│ ├── delete_non_canonical_urls.py
│ ├── main.py
│ ├── registered_projects.py
│ └── structs.py
└── tests/
├── README.md
├── conftest.py
├── package_managers/
│ ├── crates/
│ │ ├── conftest.py
│ │ └── test_crates_diff_deps.py
│ ├── debian/
│ │ ├── conftest.py
│ │ ├── test_debian_diff.py
│ │ ├── test_debian_parser.py
│ │ └── test_debian_sources.py
│ ├── homebrew/
│ │ ├── conftest.py
│ │ └── test_homebrew_diff_deps.py
│ └── pkgx/
│ ├── test_pkgx_diff.py
│ └── test_special_case.py
├── ranker/
│ ├── test_compute_canon_name.py
│ ├── test_dedupe.py
│ └── test_rx_graph.py
└── scripts/
└── upgrade_canons/
└── test_analyze_packages_needing_canonicalization.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
# directories
data/
.venv/
tests/
scripts/
logs/
db/
# other files
.gitignore
docker-compose.yml
.DS_Store
.git
README.md
LICENSE
.cursorrules
.coverage
*.xml
================================================
FILE: .github/actions/complain/action.yml
================================================
# Composite action: files (or appends to) a running GitHub issue whenever a
# test fails, so failures are tracked without manual triage.
name: teaxyz/chai/complain
description: creates an issue for any failing tests

inputs:
  test_function:
    description: test function to complain about
    required: true
  token:
    description: github token
    required: true
    default: ${{ github.token }}

runs:
  using: composite
  steps:
    # Look for an already-open tracking issue so we append instead of spamming
    - name: Find Issue
      uses: actions-cool/issues-helper@v3
      id: find
      with:
        actions: "find-issues"
        token: ${{ inputs.token }}
        issue-state: "open"
        title-includes: "❌ test failure"
        labels: "test-failure"

    # Only create a fresh issue when the search above came back empty
    - name: Create Issue
      uses: actions-cool/issues-helper@v3
      id: create
      if: ${{ steps.find.outputs.issues == '[]' }}
      with:
        actions: "create-issue"
        token: ${{ inputs.token }}
        title: "❌ test failure"
        body: "Running log of test failure for ${{ inputs.test_function }}"
        labels: "test-failure"
        assignees: "sanchitram1"

    # Comment on whichever issue applies: the one just created, or the first
    # pre-existing open one found by the search step
    - name: Log Comment
      uses: actions-cool/issues-helper@v3
      with:
        actions: "create-comment"
        token: ${{ inputs.token }}
        issue-number: ${{ steps.create.outputs.issue-number || fromJSON(steps.find.outputs.issues)[0].number }}
        body: |
          # Test failure
          ## ${{ inputs.test_function }}
          logs: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
================================================
FILE: .github/workflows/chai-api.ci.yml
================================================
# CI for the Rust API: tests against a disposable Postgres, rustfmt, clippy,
# and a docker build sanity check. Runs only when api/ changes.
name: api.ci

on:
  push:
    branches: [main]
    paths:
      - "api/**"
  pull_request:
    paths:
      - "api/**"

env:
  CARGO_TERM_COLOR: always

# cancel superseded runs on the same ref
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test:
    name: test
    runs-on: ubuntu-latest
    services:
      postgres:
        image: postgres
        env:
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: s3cr3t
          POSTGRES_DB: chai
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          # quoted: digit-and-colon scalars parse as sexagesimal ints in YAML 1.1
          - "5435:5432"
    steps:
      - uses: actions/checkout@v5
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y postgresql-client
      - name: Run tests
        run: cargo test --verbose
        working-directory: api
        env:
          DATABASE_URL: postgresql://postgres:s3cr3t@localhost:5435/chai

  fmt:
    name: Rustfmt
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      # actions-rs/toolchain is archived and unmaintained;
      # dtolnay/rust-toolchain is the maintained replacement
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt
      - run: cargo fmt --all -- --check
        working-directory: api

  clippy:
    name: Clippy
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: clippy
      - run: cargo clippy --all-targets --all-features -- -D warnings
        working-directory: api

  docker-build:
    name: Build Docker Image
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v6
        with:
          context: ./api
          push: false # build-only sanity check; never push from CI
================================================
FILE: .github/workflows/ci.yml
================================================
# Python CI: lint with ruff and run pytest whenever Python sources change,
# then file a tracking issue (via the local "complain" action) on failure.
name: CI

on:
  workflow_dispatch:
    inputs:
      env:
        description: "The environment to test against"
        required: false
        type: choice
        options:
          - dev
          - sepolia
          - mainnet
        default: "dev"
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - tests/**
      - core/**
      - package_managers/**
      - ranker/**
  pull_request:
    branches:
      - main
    paths:
      - "**/*.py"
      - tests/**
      - core/**
      - package_managers/**
      - ranker/**

jobs:
  check:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v5
      # pkgx provides uv, ruff and pytest on PATH for the steps below
      - name: Setup with pkgx
        uses: pkgxdev/setup@v4
        with:
          +: |
            astral.sh/uv
            astral.sh/ruff
            pytest.org
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version-file: ".python-version"
      - name: Install the project
        run: pkgx uv sync --locked --all-extras --dev --all-groups
      - name: Lint with Ruff
        uses: astral-sh/ruff-action@v3
        with:
          src: .
      - name: Run tests with pytest
        id: pytest
        run: |
          pkgx uv run pytest tests/

  # only runs when check failed: opens/appends to the failure-tracking issue
  complain:
    needs: check
    if: failure()
    runs-on: ubuntu-latest
    permissions:
      issues: write
    steps:
      - uses: actions/checkout@v5
      - uses: ./.github/actions/complain
        with:
          test_function: "pytest"
          token: ${{ secrets.GITHUB_TOKEN }}
================================================
FILE: .github/workflows/deploy.yml
================================================
name: "Release Chai"
run-name: Release Chai - ${{ inputs.env || 'auto' }} - ${{ inputs.ref || github.ref }}
on:
push:
branches:
- main
workflow_dispatch:
inputs:
env:
description: "The environment to deploy to"
required: true
type: choice
options:
- dev
- sepolia
- testnet
- mainnet
ref:
description: "The git ref (SHA or tag) to deploy"
required: false
deploy_latest:
description: "Force deploy all components from latest commit"
required: false
type: boolean
permissions:
id-token: write
contents: read
jobs:
build:
if: ${{ github.event_name == 'workflow_dispatch' && inputs.deploy_latest == true }}
environment: ${{ inputs.env || 'dev' }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: ${{ secrets.TEA_AWS_ROLE }}
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- name: Set deployment ref
id: set-ref
run: |
DEPLOY_REF=${{ inputs.ref || github.sha }}
echo "DEPLOY_REF=${DEPLOY_REF}" >> $GITHUB_ENV
echo "deploy_ref=${DEPLOY_REF}" >> $GITHUB_OUTPUT
- name: Set environment
id: set-env
run: |
DEPLOY_ENV=${{ inputs.env || 'dev' }}
echo "DEPLOY_ENV=${DEPLOY_ENV}" >> $GITHUB_ENV
echo "deploy_env=${DEPLOY_ENV}" >> $GITHUB_OUTPUT
- name: Build and push Crates indexer image
if: ${{ github.event_name == 'workflow_dispatch' && inputs.deploy_latest == true }}
uses: docker/build-push-action@v6
with:
context: .
file: ./package_managers/crates/Dockerfile
push: true
tags: |
${{ steps.login-ecr.outputs.registry }}/chai-v2/crates:${{ env.DEPLOY_REF }}-${{ env.DEPLOY_ENV }}
${{ steps.login-ecr.outputs.registry }}/chai-v2/crates:latest
- name: Build and push Homebrew indexer image
if: ${{ github.event_name == 'workflow_dispatch' && inputs.deploy_latest == true }}
uses: docker/build-push-action@v6
with:
context: .
file: ./package_managers/homebrew/Dockerfile
push: true
tags: |
${{ steps.login-ecr.outputs.registry }}/chai-v2/homebrew:${{ env.DEPLOY_REF }}-${{ env.DEPLOY_ENV }}
${{ steps.login-ecr.outputs.registry }}/chai-v2/homebrew:latest
- name: Build and push Debian indexer image
if: ${{ github.event_name == 'workflow_dispatch' && inputs.deploy_latest == true }}
uses: docker/build-push-action@v6
with:
context: .
file: ./package_managers/debian/Dockerfile
push: true
tags: |
${{ steps.login-ecr.outputs.registry }}/chai-v2/debian:${{ env.DEPLOY_REF }}-${{ env.DEPLOY_ENV }}
${{ steps.login-ecr.outputs.registry }}/chai-v2/debian:latest
- name: Build and push Pkgx indexer image
if: ${{ github.event_name == 'workflow_dispatch' && inputs.deploy_latest == true }}
uses: docker/build-push-action@v6
with:
context: .
file: ./package_managers/pkgx/Dockerfile
push: true
tags: |
${{ steps.login-ecr.outputs.registry }}/chai-v2/pkgx:${{ env.DEPLOY_REF }}-${{ env.DEPLOY_ENV }}
${{ steps.login-ecr.outputs.registry }}/chai-v2/pkgx:latest
- name: Build and push Alembic image
if: ${{ github.event_name == 'workflow_dispatch' && inputs.deploy_latest == true }}
uses: docker/build-push-action@v6
with:
context: .
file: ./alembic/Dockerfile
push: true
tags: |
${{ steps.login-ecr.outputs.registry }}/chai-v2/alembic:${{ env.DEPLOY_REF }}-${{ env.DEPLOY_ENV }}
${{ steps.login-ecr.outputs.registry }}/chai-v2/alembic:latest
- name: Build and push chai-api image
if: ${{ github.event_name == 'workflow_dispatch' && inputs.deploy_latest == true }}
uses: docker/build-push-action@v6
with:
context: ./api
file: ./api/Dockerfile
push: true
tags: |
${{ steps.login-ecr.outputs.registry }}/chai-v2/chai-api:${{ env.DEPLOY_REF }}-${{ env.DEPLOY_ENV }}
${{ steps.login-ecr.outputs.registry }}/chai-v2/chai-api:latest
- name: Build and push Ranker indexer image
if: ${{ github.event_name == 'workflow_dispatch' && inputs.deploy_latest == true }}
uses: docker/build-push-action@v2
with:
context: .
file: ./ranker/Dockerfile
push: true
tags: |
${{ steps.login-ecr.outputs.registry }}/chai-v2/ranker:${{ env.DEPLOY_REF }}-${{ env.DEPLOY_ENV }}
${{ steps.login-ecr.outputs.registry }}/chai-v2/ranker:latest
deploy:
needs: build
if: ${{ github.event_name == 'workflow_dispatch' }}
runs-on: ubuntu-latest
environment: ${{ inputs.env || 'dev' }}
steps:
- uses: actions/checkout@v5
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: ${{ secrets.TEA_AWS_ROLE }}
aws-region: us-east-1
- name: Set deployment ref
id: set-ref
run: |
DEPLOY_REF=${{ inputs.ref }}
echo "DEPLOY_REF=${DEPLOY_REF}" >> $GITHUB_ENV
echo "deploy_ref=${DEPLOY_REF}" >> $GITHUB_OUTPUT
- name: Set environment
id: set-env
run: |
DEPLOY_ENV=${{ inputs.env }}
echo "DEPLOY_ENV=${DEPLOY_ENV}" >> $GITHUB_ENV
echo "deploy_env=${DEPLOY_ENV}" >> $GITHUB_OUTPUT
- name: Deploy chai-api
run: |
aws ecs update-service --cluster chai-${{ env.DEPLOY_ENV }} \
--service ${{ env.DEPLOY_ENV }}-chai-api \
--force-new-deployment
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# data files
data
db/data
# examples
examples/sbom-meta/sbom-meta
*.svg
# cursor
.cursorrules
.cursor/
TASKS.md
# DS Store
.DS_Store
# Profiling
*.prof
================================================
FILE: .python-version
================================================
3.11
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2024 tea protocol
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# CHAI
CHAI is an attempt at an open-source data pipeline for package managers. The
goal is to have a pipeline that can use the data from any package manager and
provide a normalized data source for myriads of different use cases.
## Getting Started
Use [Docker](https://docker.com)
1. Install Docker
2. Clone the chai repository (https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository)
3. Using a terminal, navigate to the cloned repository directory
4. Run `docker compose build` to create the latest Docker images
5. Then, run `docker compose up` to launch.
> [!NOTE]
>
> This will run CHAI for all package managers. As an example crates by
> itself will take over an hour and consume >5GB storage.
>
> Currently, we support:
>
> - crates
> - Homebrew
> - Debian
> - pkgx
>
> You can run a single package manager by running
> `PACKAGE_MANAGER=<name> docker compose up`
>
> We are planning on supporting `NPM`, `PyPI`, and `rubygems` next.
### Arguments
Specify these eg. `FOO=bar docker compose up`:
- `ENABLE_SCHEDULER`: When true, the pipeline runs on a schedule set by `FREQUENCY`.
- `FREQUENCY`: Sets how often (in hours) the pipeline should run.
- `TEST`: Runs the pipeline in a test mode, useful for validating code changes
  without performing full data insertions.
- `FETCH`: Determines whether to fetch new data or use whatever was saved locally.
- `NO_CACHE`: When true, deletes temporary files after processing.
> [!NOTE]
> The flag `NO_CACHE` does not mean that files will not get downloaded to your local
> storage (specifically, the ./data directory). It only means that we'll
> delete these temporary files from ./data once we're done processing them.
> If `FETCH` is false, the pipeline looks for source data in the cache, so this
> will fail if you run `NO_CACHE` first, and `FETCH` false second.
These arguments are all configurable in the `docker-compose.yml` file.
### Docker Services Overview
1. `db`: [PostgreSQL] database for the reduced package data
2. `alembic`: handles migrations
3. `package_managers`: fetches and writes data for each package manager
4. `api`: a simple REST API for reading from the db
5. `ranker`: deduplicates and ranks the packages
### Hard Reset
Stuff happens. Start over:
`rm -rf ./data`: removes all the data the fetcher has downloaded.
<!-- this is handled now that alembic/psycopg2 are in pkgx -->
<!--
## Alembic Alternatives
- sqlx command line tool to manage migrations, alongside models for sqlx in rust
- vapor's migrations are written in swift
-->
## Goals
Our goal is to build a data schema that looks like this:

You can read more about specific data models in the dbs [readme](db/README.md)
Our specific application extracts the dependency graph to understand which
pieces of the open-source graph are critical. We also built a simple example
that displays [sbom-metadata](examples/sbom-meta) for your repository.
There are many other potential use cases for this data:
- License compatibility checker
- Developer publications
- Package popularity
- Dependency analysis vulnerability tool (requires translating semver)
> [!TIP]
> Help us add the above to the examples folder.
## FAQs / Common Issues
1. The database url is `postgresql://postgres:s3cr3t@localhost:5435/chai`, and
is used as `CHAI_DATABASE_URL` in the environment. `psql CHAI_DATABASE_URL`
will connect you to the database.
2. If you're orchestrating via docker, swap `localhost` for `host.docker.internal`
## Managing Dependencies
We use [`uv`](https://astral.sh/uv) to manage dependencies (and sometimes execution).
All dependencies are listed in [`pyproject.toml`](./pyproject.toml), under the
`dependency-groups` header. Each group helps us classify the service we're adding a
dependency for. For example, if we're adding a new dependency for all the indexers:
```bash
uv add --group indexer requests
# use the --all-groups flag to sync your venv for all dependencies
uv sync --all-groups
uv pip compile --group indexers -o core/requirements.txt
```
The last step writes the updated dependencies to a requirements.txt file, which is
crucial for the Docker containers executing the individual services to build correctly.
Each indexer shares the same set of dependencies, and that requirement file is
**generated by uv**, and maintained in [core/requirements.txt](core/requirements.txt)
> [!IMPORTANT]
> DO NOT UPDATE ANY `requirements.txt` FILES DIRECTLY
> `uv` provides a way to generate that automatically, based on the pyproject.toml
>
> Have an idea on a better way to do this? Open to input...
## Deployment
```sh
export CHAI_DATABASE_URL=postgresql://<user>:<pw>@host.docker.internal:<port>/chai
export PGPASSWORD=<pw>
docker compose up alembic
```
## Tasks
These are tasks that can be run using [xcfile.dev](https://xcfile.dev). If you use `pkgx`, typing
`dev` loads the environment. Alternatively, run them manually.
### reset
```sh
rm -rf db/data data .venv
```
### build
```sh
docker compose build
```
### start-all
Requires: build
```sh
docker compose up -d
```
### stop
```sh
docker compose down
```
### logs
```sh
docker compose logs
```
### db-start
Runs migrations and starts up the database
```sh
docker compose build --no-cache db alembic
docker compose up alembic -d
```
### db-reset
Requires: stop
```sh
rm -rf db/data
```
### db-generate-migration
Inputs: MIGRATION_NAME
Env: CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@localhost:5435/chai
```sh
cd alembic
alembic revision --autogenerate -m "$MIGRATION_NAME"
```
### db-upgrade
Env: CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@localhost:5435/chai
```sh
cd alembic
alembic upgrade head
```
### db-downgrade
Inputs: STEP
Env: CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@localhost:5435/chai
```sh
cd alembic
alembic downgrade -$STEP
```
### db
```sh
psql "postgresql://postgres:s3cr3t@localhost:5435/chai"
```
### restart-api
Refreshes table knowledge from the db.
```sh
docker compose restart api
```
### remove-orphans
```sh
docker compose down --remove-orphans
```
### start-service
Inputs: SERVICE
Env: CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@host.docker.internal:5435/chai
```sh
docker compose up $SERVICE --build
```
### check
Inputs: FOLDER
Environment: FOLDER=.
```sh
pkgx +python@3.13 ty check $FOLDER
```
[PostgreSQL]: https://www.postgresql.org
[`pkgx`]: https://pkgx.sh
================================================
FILE: alembic/.pkgx.yaml
================================================
# this .pkgx.yaml file is only for alembic: tools needed to run migrations
# (psql client, alembic CLI, and the psycopg2 driver)
dependencies:
  postgresql.org: 16
  alembic.sqlalchemy.org: 1
  psycopg.org/psycopg2: 2
================================================
FILE: alembic/Dockerfile
================================================
FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim

# run_migrations.sh only needs the psql client, not a full postgresql server;
# use non-interactive apt-get and drop the package lists to keep the image slim.
RUN apt-get update \
    && apt-get install -y --no-install-recommends postgresql-client \
    && rm -rf /var/lib/apt/lists/*

# Pinned migration toolchain, installed system-wide (no venv in this image).
RUN uv pip install alembic==1.13.2 psycopg2-binary==2.9.10 sqlalchemy==2.0.41 --system

# The build context is the repo root: env.py imports core.models via
# prepend_sys_path = .., so the whole tree is copied, then we run from /alembic.
COPY . .
WORKDIR /alembic

RUN chmod +x /alembic/run_migrations.sh
ENTRYPOINT ["/bin/sh", "/alembic/run_migrations.sh"]
================================================
FILE: alembic/README.md
================================================
# CHAI Data Migrations
This directory contains the Alembic configuration and migration scripts for managing the
database schema of the CHAI project. Alembic is used to handle database migrations,
allowing for version control of our database schema.
### About Alembic
Alembic is a database migration tool for SQLAlchemy. It allows us to:
- Track changes to our database schema over time
- Apply and revert these changes in a controlled manner
- Generate migration scripts automatically based on model changes
> [!NOTE]
> It's important to note that while `alembic` serves our current needs, it may not be
> our long-term solution. As the CHAI project evolves, we might explore other database
> migration tools or strategies that better fit our growing requirements. We're open to
> reassessing our approach to schema management as needed.
## Entrypoint
The main entrypoint for running migrations is the
[run migrations script](run_migrations.sh). This script orchestrates the initialization
and migration process.
## Steps
1. [Initialize](init-script.sql)
The initialization script creates the database `chai`, and loads it up with any
extensions that we'd need, so we've got a clean slate for our db structures.
2. [Load](load-values.sql)
The load script prepopulates some of the tables, with `enum`-like values - specifically
for:
- `url_types`: defines different types of URLs (e.g., source, homepage, documentation)
- `depends_on_types`: defines different types of dependencies (e.g., runtime,
development)
- `sources` and `package_managers`: defines different package managers (e.g., npm, pypi)
3. Run Alembic Migrations
After initialization and loading initial data, the script runs Alembic migrations to apply any pending database schema changes.
## Contributing
To contribute to the database schema:
1. Make a change in the [models](../core/models/__init__.py) file
2. Generate a new migration script: `alembic revision --autogenerate -m "Description"`
3. Review the generated migration script in the [versions](versions/) directory. The
auto-generation is powerful but not perfect, please review the script carefully.
4. Test the migration by running `alembic upgrade head`.
================================================
FILE: alembic/alembic.ini
================================================
[alembic]
# migration scripts live next to this file
script_location = .
# yields filenames like 20241028_1217-base_migration.py
file_template = %%(year)d%%(month).2d%%(day).2d_%%(hour).2d%%(minute).2d-%%(slug)s
# lets env.py import core.models from the repo root
prepend_sys_path = ..
version_path_separator = os
# URL
# NOTE(review): "${env:...}" is not native alembic.ini interpolation — this is
# a placeholder; env.py reads CHAI_DATABASE_URL and overrides sqlalchemy.url
# at runtime via config.set_main_option.
sqlalchemy.url = ${env:CHAI_DATABASE_URL}
[post_write_hooks]
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# TODO: this doesn't work rn
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
================================================
FILE: alembic/env.py
================================================
"""Alembic environment: wires the project's SQLAlchemy metadata and the
CHAI_DATABASE_URL environment variable into migration runs."""

import os
from logging.config import fileConfig

from sqlalchemy import engine_from_config, pool

from alembic import context
from core.models import Base

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# interpret the config file for Python logging.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# metadata for all models
target_metadata = Base.metadata

# get database url — the environment takes precedence over alembic.ini
database_url = os.getenv("CHAI_DATABASE_URL")
if database_url:
    config.set_main_option("sqlalchemy.url", database_url)


def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.
    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section, {}),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)
        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
================================================
FILE: alembic/init-script.sql
================================================
-- Create the application database and enable the extensions the schema needs:
-- uuid-ossp provides uuid_generate_v4() used as the id default in migrations,
-- pg_trgm backs the trigram indexes, and pgcrypto supplies crypto helpers.
CREATE DATABASE chai;
\c chai
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS pg_trgm;
================================================
FILE: alembic/load-values.sql
================================================
-- Seed the enum-like lookup tables. Safe to re-run on every deploy: each
-- INSERT is guarded by ON CONFLICT ... DO NOTHING, so existing rows are kept.
-- url types
INSERT INTO "url_types" ("name")
VALUES ('source'), ('homepage'), ('documentation'), ('repository')
ON CONFLICT (name) DO NOTHING;
-- dependency types
INSERT INTO "depends_on_types" ("name")
VALUES
('build'),
('development'),
('runtime'),
('test'),
('optional'),
('recommended'),
('uses_from_macos')
ON CONFLICT (name) DO NOTHING;
-- sources
INSERT INTO "sources" ("type")
VALUES ('crates'), ('npm'), ('pypi'), ('rubygems'), ('github'), ('homebrew'), ('debian'), ('pkgx')
ON CONFLICT (type) DO NOTHING;
-- one package_manager row per source, keyed by source_id
INSERT INTO "package_managers" ("source_id")
SELECT id
FROM "sources"
WHERE "type" IN ('crates', 'npm', 'pypi', 'rubygems', 'github', 'homebrew', 'debian', 'pkgx')
ON CONFLICT (source_id) DO NOTHING;
================================================
FILE: alembic/run_migrations.sh
================================================
#!/bin/bash
set -uo pipefail

# This script sets up the database, runs migrations, and loads initial values.
# Requires:
#   CHAI_DATABASE_ADMIN_URL - superuser connection (used to create 'chai')
#   CHAI_DATABASE_URL       - connection to the 'chai' database itself

# Check if the 'chai' database exists, create it if it doesn't
if psql "$CHAI_DATABASE_ADMIN_URL" -tAc "SELECT 1 FROM pg_database WHERE datname='chai'" | grep -q 1
then
    echo "Database 'chai' already exists"
else
    echo "Database 'chai' does not exist, creating..."
    # BUGFIX: without -e in the shell options this psql failure was silently
    # ignored; abort explicitly so alembic doesn't run against a missing db
    psql "$CHAI_DATABASE_ADMIN_URL" -f init-script.sql -a || { echo "Database creation failed"; exit 1; }
fi

# Run migrations and load data (uses 'chai' database)
echo "Current database version: $(alembic current)"
alembic upgrade head || { echo "Migration failed"; exit 1; }

echo "Loading initial values into the database..."
# check this too — the pipeline depends on the seeded lookup tables
psql "$CHAI_DATABASE_URL" -f load-values.sql -a || { echo "Loading initial values failed"; exit 1; }

echo "Database setup and initialization complete"
================================================
FILE: alembic/script.py.mako
================================================
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}
================================================
FILE: alembic/versions/20241028_1217-base_migration.py
================================================
"""base migration
Revision ID: 238d591d5310
Revises:
Create Date: 2024-10-28 12:17:43.762965
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "238d591d5310"
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.create_table(
"depends_on_types",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("name", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_depends_on_types")),
)
op.create_index(
op.f("ix_depends_on_types_name"), "depends_on_types", ["name"], unique=True
)
op.create_table(
"licenses",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("name", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_licenses")),
)
op.create_index(op.f("ix_licenses_name"), "licenses", ["name"], unique=True)
op.create_table(
"sources",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("type", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_sources")),
sa.UniqueConstraint("type", name=op.f("uq_sources_type")),
)
op.create_table(
"url_types",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("name", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_url_types")),
sa.UniqueConstraint("name", name=op.f("uq_url_types_name")),
)
op.create_table(
"package_managers",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("source_id", sa.UUID(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["source_id"],
["sources.id"],
name=op.f("fk_package_managers_source_id_sources"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_package_managers")),
sa.UniqueConstraint("source_id", name=op.f("uq_package_managers_source_id")),
)
op.create_table(
"urls",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("url", sa.String(), nullable=False),
sa.Column("url_type_id", sa.UUID(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["url_type_id"],
["url_types.id"],
name=op.f("fk_urls_url_type_id_url_types"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_urls")),
sa.UniqueConstraint("url_type_id", "url", name="uq_url_type_url"),
)
op.create_index(op.f("ix_urls_url"), "urls", ["url"], unique=False)
op.create_index(op.f("ix_urls_url_type_id"), "urls", ["url_type_id"], unique=False)
op.create_table(
"users",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("username", sa.String(), nullable=False),
sa.Column("source_id", sa.UUID(), nullable=False),
sa.Column("import_id", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["source_id"], ["sources.id"], name=op.f("fk_users_source_id_sources")
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_users")),
sa.UniqueConstraint("source_id", "username", name="uq_source_username"),
)
op.create_index(op.f("ix_users_import_id"), "users", ["import_id"], unique=False)
op.create_index(op.f("ix_users_source_id"), "users", ["source_id"], unique=False)
op.create_index(op.f("ix_users_username"), "users", ["username"], unique=False)
op.create_table(
"load_history",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("package_manager_id", sa.UUID(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["package_manager_id"],
["package_managers.id"],
name=op.f("fk_load_history_package_manager_id_package_managers"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_load_history")),
)
op.create_table(
"packages",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("derived_id", sa.String(), nullable=False),
sa.Column("name", sa.String(), nullable=False),
sa.Column("package_manager_id", sa.UUID(), nullable=False),
sa.Column("import_id", sa.String(), nullable=False),
sa.Column("readme", sa.String(), nullable=True),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["package_manager_id"],
["package_managers.id"],
name=op.f("fk_packages_package_manager_id_package_managers"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_packages")),
sa.UniqueConstraint("derived_id", name=op.f("uq_packages_derived_id")),
sa.UniqueConstraint(
"package_manager_id", "import_id", name="uq_package_manager_import_id"
),
)
op.create_index(
op.f("ix_packages_import_id"), "packages", ["import_id"], unique=False
)
op.create_index(op.f("ix_packages_name"), "packages", ["name"], unique=False)
op.create_index(
op.f("ix_packages_package_manager_id"),
"packages",
["package_manager_id"],
unique=False,
)
op.create_table(
"package_urls",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("package_id", sa.UUID(), nullable=False),
sa.Column("url_id", sa.UUID(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["package_id"],
["packages.id"],
name=op.f("fk_package_urls_package_id_packages"),
),
sa.ForeignKeyConstraint(
["url_id"], ["urls.id"], name=op.f("fk_package_urls_url_id_urls")
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_package_urls")),
sa.UniqueConstraint("package_id", "url_id", name="uq_package_url"),
)
op.create_index(
op.f("ix_package_urls_package_id"), "package_urls", ["package_id"], unique=False
)
op.create_index(
op.f("ix_package_urls_url_id"), "package_urls", ["url_id"], unique=False
)
op.create_table(
"user_packages",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("user_id", sa.UUID(), nullable=False),
sa.Column("package_id", sa.UUID(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["package_id"],
["packages.id"],
name=op.f("fk_user_packages_package_id_packages"),
),
sa.ForeignKeyConstraint(
["user_id"], ["users.id"], name=op.f("fk_user_packages_user_id_users")
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_user_packages")),
sa.UniqueConstraint("user_id", "package_id", name="uq_user_package"),
)
op.create_index(
op.f("ix_user_packages_package_id"),
"user_packages",
["package_id"],
unique=False,
)
op.create_index(
op.f("ix_user_packages_user_id"), "user_packages", ["user_id"], unique=False
)
op.create_table(
"versions",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("package_id", sa.UUID(), nullable=False),
sa.Column("version", sa.String(), nullable=False),
sa.Column("import_id", sa.String(), nullable=False),
sa.Column("size", sa.Integer(), nullable=True),
sa.Column("published_at", sa.DateTime(), nullable=True),
sa.Column("license_id", sa.UUID(), nullable=True),
sa.Column("downloads", sa.Integer(), nullable=True),
sa.Column("checksum", sa.String(), nullable=True),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["license_id"],
["licenses.id"],
name=op.f("fk_versions_license_id_licenses"),
),
sa.ForeignKeyConstraint(
["package_id"],
["packages.id"],
name=op.f("fk_versions_package_id_packages"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_versions")),
sa.UniqueConstraint("package_id", "version", name="uq_package_version"),
)
op.create_index(
op.f("ix_versions_downloads"), "versions", ["downloads"], unique=False
)
op.create_index(
op.f("ix_versions_import_id"), "versions", ["import_id"], unique=False
)
op.create_index(
op.f("ix_versions_license_id"), "versions", ["license_id"], unique=False
)
op.create_index(
op.f("ix_versions_package_id"), "versions", ["package_id"], unique=False
)
op.create_index(
op.f("ix_versions_published_at"), "versions", ["published_at"], unique=False
)
op.create_index(op.f("ix_versions_size"), "versions", ["size"], unique=False)
op.create_index(op.f("ix_versions_version"), "versions", ["version"], unique=False)
op.create_table(
"dependencies",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("version_id", sa.UUID(), nullable=False),
sa.Column("dependency_id", sa.UUID(), nullable=False),
sa.Column("dependency_type_id", sa.UUID(), nullable=True),
sa.Column("semver_range", sa.String(), nullable=True),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["dependency_id"],
["packages.id"],
name=op.f("fk_dependencies_dependency_id_packages"),
),
sa.ForeignKeyConstraint(
["dependency_type_id"],
["depends_on_types.id"],
name=op.f("fk_dependencies_dependency_type_id_depends_on_types"),
),
sa.ForeignKeyConstraint(
["version_id"],
["versions.id"],
name=op.f("fk_dependencies_version_id_versions"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_dependencies")),
sa.UniqueConstraint(
"version_id",
"dependency_id",
"dependency_type_id",
name="uq_version_dependency_type",
),
)
op.create_index(
op.f("ix_dependencies_dependency_id"),
"dependencies",
["dependency_id"],
unique=False,
)
op.create_index(
op.f("ix_dependencies_dependency_type_id"),
"dependencies",
["dependency_type_id"],
unique=False,
)
op.create_index(
op.f("ix_dependencies_version_id"), "dependencies", ["version_id"], unique=False
)
op.create_table(
"user_versions",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("user_id", sa.UUID(), nullable=False),
sa.Column("version_id", sa.UUID(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["user_id"], ["users.id"], name=op.f("fk_user_versions_user_id_users")
),
sa.ForeignKeyConstraint(
["version_id"],
["versions.id"],
name=op.f("fk_user_versions_version_id_versions"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_user_versions")),
sa.UniqueConstraint("user_id", "version_id", name="uq_user_version"),
)
op.create_index(
op.f("ix_user_versions_user_id"), "user_versions", ["user_id"], unique=False
)
op.create_index(
op.f("ix_user_versions_version_id"),
"user_versions",
["version_id"],
unique=False,
)
# ### end Alembic commands ###
def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_index(op.f("ix_user_versions_version_id"), table_name="user_versions")
op.drop_index(op.f("ix_user_versions_user_id"), table_name="user_versions")
op.drop_table("user_versions")
op.drop_index(op.f("ix_dependencies_version_id"), table_name="dependencies")
op.drop_index(op.f("ix_dependencies_dependency_type_id"), table_name="dependencies")
op.drop_index(op.f("ix_dependencies_dependency_id"), table_name="dependencies")
op.drop_table("dependencies")
op.drop_index(op.f("ix_versions_version"), table_name="versions")
op.drop_index(op.f("ix_versions_size"), table_name="versions")
op.drop_index(op.f("ix_versions_published_at"), table_name="versions")
op.drop_index(op.f("ix_versions_package_id"), table_name="versions")
op.drop_index(op.f("ix_versions_license_id"), table_name="versions")
op.drop_index(op.f("ix_versions_import_id"), table_name="versions")
op.drop_index(op.f("ix_versions_downloads"), table_name="versions")
op.drop_table("versions")
op.drop_index(op.f("ix_user_packages_user_id"), table_name="user_packages")
op.drop_index(op.f("ix_user_packages_package_id"), table_name="user_packages")
op.drop_table("user_packages")
op.drop_index(op.f("ix_package_urls_url_id"), table_name="package_urls")
op.drop_index(op.f("ix_package_urls_package_id"), table_name="package_urls")
op.drop_table("package_urls")
op.drop_index(op.f("ix_packages_package_manager_id"), table_name="packages")
op.drop_index(op.f("ix_packages_name"), table_name="packages")
op.drop_index(op.f("ix_packages_import_id"), table_name="packages")
op.drop_table("packages")
op.drop_table("load_history")
op.drop_index(op.f("ix_users_username"), table_name="users")
op.drop_index(op.f("ix_users_source_id"), table_name="users")
op.drop_index(op.f("ix_users_import_id"), table_name="users")
op.drop_table("users")
op.drop_index(op.f("ix_urls_url_type_id"), table_name="urls")
op.drop_index(op.f("ix_urls_url"), table_name="urls")
op.drop_table("urls")
op.drop_table("package_managers")
op.drop_table("url_types")
op.drop_table("sources")
op.drop_index(op.f("ix_licenses_name"), table_name="licenses")
op.drop_table("licenses")
op.drop_index(op.f("ix_depends_on_types_name"), table_name="depends_on_types")
op.drop_table("depends_on_types")
# ### end Alembic commands ###
================================================
FILE: alembic/versions/20250312_0045-add_legacy_dependency_table.py
================================================
"""add-legacy-dependency-table
Revision ID: 89af630dc946
Revises: 238d591d5310
Create Date: 2025-03-12 00:45:35.727521
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "89af630dc946"
down_revision: str | None = "238d591d5310"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
op.create_table(
"legacy_dependencies",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("package_id", sa.UUID(), nullable=False),
sa.Column("dependency_id", sa.UUID(), nullable=False),
sa.Column("dependency_type_id", sa.UUID(), nullable=False),
sa.Column("semver_range", sa.String(), nullable=True),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["dependency_id"],
["packages.id"],
name=op.f("fk_legacy_dependencies_dependency_id_packages"),
),
sa.ForeignKeyConstraint(
["dependency_type_id"],
["depends_on_types.id"],
name=op.f("fk_legacy_dependencies_dependency_type_id_depends_on_types"),
),
sa.ForeignKeyConstraint(
["package_id"],
["packages.id"],
name=op.f("fk_legacy_dependencies_package_id_packages"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_legacy_dependencies")),
sa.UniqueConstraint(
"package_id", "dependency_id", name="uq_package_dependency"
),
)
op.create_index(
op.f("ix_legacy_dependencies_dependency_id"),
"legacy_dependencies",
["dependency_id"],
unique=False,
)
op.create_index(
op.f("ix_legacy_dependencies_dependency_type_id"),
"legacy_dependencies",
["dependency_type_id"],
unique=False,
)
op.create_index(
op.f("ix_legacy_dependencies_package_id"),
"legacy_dependencies",
["package_id"],
unique=False,
)
def downgrade() -> None:
op.drop_index(
op.f("ix_legacy_dependencies_package_id"), table_name="legacy_dependencies"
)
op.drop_index(
op.f("ix_legacy_dependencies_dependency_type_id"),
table_name="legacy_dependencies",
)
op.drop_index(
op.f("ix_legacy_dependencies_dependency_id"), table_name="legacy_dependencies"
)
op.drop_table("legacy_dependencies")
================================================
FILE: alembic/versions/20250312_2244-canons.py
================================================
"""canons
Revision ID: e7632ae1aff7
Revises: 89af630dc946
Create Date: 2025-03-12 22:44:45.272179
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "e7632ae1aff7"
down_revision: str | None = "89af630dc946"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
op.create_table(
"canons",
sa.Column("id", sa.UUID(), nullable=False),
sa.Column("url", sa.String(), nullable=False),
sa.Column("name", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_canons")),
)
op.create_index(op.f("ix_canons_name"), "canons", ["name"], unique=False)
op.create_index(op.f("ix_canons_url"), "canons", ["url"], unique=True)
op.create_table(
"canon_packages",
sa.Column("id", sa.UUID(), nullable=False),
sa.Column("canon_id", sa.UUID(), nullable=False),
sa.Column("package_id", sa.UUID(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["canon_id"], ["canons.id"], name=op.f("fk_canon_packages_canon_id_canons")
),
sa.ForeignKeyConstraint(
["package_id"],
["packages.id"],
name=op.f("fk_canon_packages_package_id_packages"),
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_canon_packages")),
)
op.create_index(
op.f("ix_canon_packages_canon_id"), "canon_packages", ["canon_id"], unique=False
)
op.create_index(
op.f("ix_canon_packages_package_id"),
"canon_packages",
["package_id"],
unique=False,
)
def downgrade() -> None:
op.drop_index(op.f("ix_canon_packages_package_id"), table_name="canon_packages")
op.drop_index(op.f("ix_canon_packages_canon_id"), table_name="canon_packages")
op.drop_table("canon_packages")
op.drop_index(op.f("ix_canons_url"), table_name="canons")
op.drop_index(op.f("ix_canons_name"), table_name="canons")
op.drop_table("canons")
================================================
FILE: alembic/versions/20250416_0223-add_ranks.py
================================================
"""add-ranks
Revision ID: 26e124131bf8
Revises: e7632ae1aff7
Create Date: 2025-04-16 02:23:33.665773
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "26e124131bf8"
down_revision: str | None = "e7632ae1aff7"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
op.create_table(
"tea_rank_runs",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("run", sa.Integer(), nullable=False),
sa.Column("split_ratio", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_tea_rank_runs")),
)
op.create_table(
"tea_ranks",
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("uuid_generate_v4()"),
nullable=False,
),
sa.Column("tea_rank_run", sa.Integer(), nullable=False),
sa.Column("canon_id", sa.UUID(), nullable=False),
sa.Column("rank", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["canon_id"], ["canons.id"], name=op.f("fk_tea_ranks_canon_id_canons")
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_tea_ranks")),
)
op.create_index(
op.f("ix_tea_ranks_canon_id"), "tea_ranks", ["canon_id"], unique=False
)
op.create_index(
op.f("ix_tea_ranks_tea_rank_run"), "tea_ranks", ["tea_rank_run"], unique=False
)
def downgrade() -> None:
op.drop_index(op.f("ix_tea_ranks_tea_rank_run"), table_name="tea_ranks")
op.drop_index(op.f("ix_tea_ranks_canon_id"), table_name="tea_ranks")
op.drop_table("tea_ranks")
op.drop_table("tea_rank_runs")
================================================
FILE: alembic/versions/20250422_0940-add_unique_package_to_canon_packages.py
================================================
"""add-unique-package-to-canon-packages
Revision ID: a41236bd2340
Revises: 26e124131bf8
Create Date: 2025-04-22 09:40:22.901637
"""
from collections.abc import Sequence
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "a41236bd2340"
down_revision: str | None = "26e124131bf8"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
op.drop_index("ix_canon_packages_package_id", table_name="canon_packages")
op.create_index(
op.f("ix_canon_packages_package_id"),
"canon_packages",
["package_id"],
unique=True,
)
def downgrade() -> None:
op.drop_index(op.f("ix_canon_packages_package_id"), table_name="canon_packages")
op.create_index(
"ix_canon_packages_package_id", "canon_packages", ["package_id"], unique=False
)
================================================
FILE: alembic/versions/20250508_1752-add_trgm_indexes.py
================================================
"""add_trgm_indexes
Revision ID: 7392d4d74ce2
Revises: a41236bd2340
Create Date: 2025-05-08 17:52:40.417822
"""
from collections.abc import Sequence
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "7392d4d74ce2"
down_revision: str | None = "a41236bd2340"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
# Drop the existing indexes
op.drop_index("ix_canons_name", table_name="canons")
op.drop_index("ix_urls_url", table_name="urls")
# Create trigram indexes
# NOTE: this was added manually to this script (not auto-generated)
op.create_index(
"ix_urls_url_trgm",
"urls",
["url"],
unique=False,
postgresql_using="gin",
postgresql_ops={"url": "gin_trgm_ops"},
)
op.create_index(
"ix_canons_name_trgm",
"canons",
["name"],
unique=False,
postgresql_using="gin",
postgresql_ops={"name": "gin_trgm_ops"},
)
def downgrade() -> None:
# Drop the trigram indexes
# NOTE: this was added manually to this script (not auto-generated)
op.drop_index("ix_urls_url_trgm", table_name="urls")
op.drop_index("ix_canons_name_trgm", table_name="canons")
# Recreate the existing indexes (auto-generated)
op.create_index("ix_urls_url", "urls", ["url"], unique=False)
op.create_index("ix_canons_name", "canons", ["name"], unique=False)
================================================
FILE: alembic/versions/20250529_2341-rename_canons_table_and_recreate.py
================================================
"""rename_canons_table_and_recreate
Revision ID: 542d79f30fc9
Revises: 7392d4d74ce2
Create Date: 2025-05-29 23:41:38.465987
"""
from collections.abc import Sequence
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import UUID
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "542d79f30fc9"
down_revision: str | None = "7392d4d74ce2"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
"""
Rename existing canons table and create new one with proper url_id FK
"""
# Step 1: Rename existing table to preserve data as backup
op.rename_table("canons", "canons_old")
# Step 2: Drop FK constraints that pointed to old table (from other tables)
op.drop_constraint(
"fk_canon_packages_canon_id_canons", "canon_packages", type_="foreignkey"
)
op.drop_constraint("fk_tea_ranks_canon_id_canons", "tea_ranks", type_="foreignkey")
# Step 3: Drop indexes and constraints from old table to avoid naming conflicts
op.drop_constraint("pk_canons", "canons_old", type_="primary")
op.drop_index("ix_canons_url", table_name="canons_old")
op.drop_index("ix_canons_name_trgm", table_name="canons_old")
# Step 4: Create new canons table with proper schema
op.create_table(
"canons",
sa.Column(
"id",
UUID(as_uuid=True),
primary_key=True,
server_default=sa.func.uuid_generate_v4(),
),
sa.Column(
"url_id", UUID(as_uuid=True), nullable=False, index=True, unique=True
),
sa.Column("name", sa.String(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
),
sa.Column(
"updated_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
),
# Constraints
sa.ForeignKeyConstraint(["url_id"], ["urls.id"], name="fk_canons_url_id_urls"),
sa.UniqueConstraint("url_id", name="uq_canons_url_id"),
)
# Step 5: Create indexes
op.create_index(
"ix_canons_name_trgm",
"canons",
["name"],
postgresql_using="gin",
postgresql_ops={"name": "gin_trgm_ops"},
)
# Note: FK constraints to this table will be recreated in a separate migration
# after data population, since this table starts empty
def downgrade() -> None:
"""
Restore original canons table with all its original indexes and constraints
"""
# FK constraints were dropped in upgrade and not recreated, so no need to drop them here
# Drop new table
op.drop_table("canons")
# Restore old table
op.rename_table("canons_old", "canons")
# Recreate all original constraints and indexes on restored table
op.create_primary_key("pk_canons", "canons", ["id"])
op.create_index("ix_canons_url", "canons", ["url"], unique=True)
op.create_index(
"ix_canons_name_trgm",
"canons",
["name"],
postgresql_using="gin",
postgresql_ops={"name": "gin_trgm_ops"},
)
# Recreate FK constraints from other tables pointing to canons
op.create_foreign_key(
"fk_canon_packages_canon_id_canons",
"canon_packages",
"canons",
["canon_id"],
["id"],
)
op.create_foreign_key(
"fk_tea_ranks_canon_id_canons", "tea_ranks", "canons", ["canon_id"], ["id"]
)
================================================
FILE: alembic/versions/20250529_2345-recreate_canon_foreign_keys.py
================================================
"""recreate_canon_foreign_keys
Revision ID: 3de32bb99a71
Revises: 542d79f30fc9
Create Date: 2025-05-29 23:45:12.372951
"""
from collections.abc import Sequence
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "3de32bb99a71"
down_revision: str | None = "542d79f30fc9"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
"""
Recreate FK constraints pointing to canons table after data population
Run this AFTER your canonicalization script has populated the canons table
"""
# First, clean up any orphaned records in referencing tables
# (Optional: uncomment if you want to auto-clean orphaned data)
# op.execute("""
# DELETE FROM canon_packages
# WHERE canon_id NOT IN (SELECT id FROM canons)
# """)
# op.execute("""
# DELETE FROM tea_ranks
# WHERE canon_id NOT IN (SELECT id FROM canons)
# """)
# Recreate FK constraints
op.create_foreign_key(
"fk_canon_packages_canon_id_canons",
"canon_packages",
"canons",
["canon_id"],
["id"],
)
op.create_foreign_key(
"fk_tea_ranks_canon_id_canons", "tea_ranks", "canons", ["canon_id"], ["id"]
)
def downgrade() -> None:
    """
    Drop FK constraints pointing to canons table
    """
    # Mirror of upgrade(): remove the same two foreign keys.
    for constraint_name, referencing_table in (
        ("fk_canon_packages_canon_id_canons", "canon_packages"),
        ("fk_tea_ranks_canon_id_canons", "tea_ranks"),
    ):
        op.drop_constraint(constraint_name, referencing_table, type_="foreignkey")
================================================
FILE: api/.dockerignore
================================================
/target
.git
.gitignore
README.md
================================================
FILE: api/.gitignore
================================================
/target
**/*.rs.bk
Cargo.lock
.env
================================================
FILE: api/Cargo.toml
================================================
[package]
name = "chai-api"
version = "1.3.0"
edition = "2021"
authors = ["Jacob Heider <jacob@pkgx.dev>"]
description = "A simple REST API for the CHAI database"
readme = "README.md"
license = "MIT"
repository = "https://github.com/teaxyz/chai-oss"
[dependencies]
uuid = { version = "1.11.0", features = ["serde", "v4"] }
actix-web = "4.3"
dotenv = "0.15"
tokio = { version = "1", features = ["full"] }
log = "0.4"
env_logger = "0.10"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
chrono = { version = "0.4", features = ["serde"] }
tokio-postgres = { version = "0.7", features = [
"with-serde_json-1",
"with-chrono-0_4",
"with-uuid-1",
] }
deadpool-postgres = "0.10.0"
url = "2.5.2"
dashmap = "6.1.0"
================================================
FILE: api/Dockerfile
================================================
FROM --platform=linux/amd64 lukemathwalker/cargo-chef:latest-rust-1.82.0 as chef
WORKDIR /app
FROM chef as planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
FROM chef as builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json
COPY . .
RUN cargo build --release
FROM debian:bookworm-slim as runtime
WORKDIR /app
RUN apt-get update && apt-get install -y curl openssl ca-certificates && rm -rf /var/lib/apt/lists/*
COPY --from=builder /app/target/release/chai-api /usr/local/bin
ENV DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai
EXPOSE 8080
CMD ["chai-api"]
================================================
FILE: api/README.md
================================================
# CHAI API
CHAI API is a REST API service for accessing the CHAI database, which contains package
manager data.
## Features
- List all tables in the database
- Fetch paginated data from any table
- Heartbeat endpoint for health checks
- Search deduplicated packages by name
## Requirements
- Rust 1.67 or later
- PostgreSQL database
## API Endpoints
### Health Check
```
GET /heartbeat
```
Returns the health status of the API and database connection.
**Response (Success)**
```txt
OK - Database connection is healthy
```
**Response (Failure - Database query failed):**
```txt
Database query failed
```
**Response (Failure - Database connection failed):**
```txt
Failed to get database connection
```
### List Tables
```
GET /tables
```
Returns a paginated list of all available tables in the database.
**Query Parameters**
- `page` (optional): Page number (default: 1)
- `limit` (optional): Number of items per page (default: 200)
**Response**
```json
{
"data": [
"legacy_dependencies",
"versions",
"canons_old",
"tea_rank_runs",
"canons",
"licenses",
"canon_packages",
"users",
"load_history",
"tea_ranks",
"alembic_version",
"sources",
"package_managers",
"url_types",
"urls",
"packages",
"package_urls",
"user_packages",
"dependencies",
"depends_on_types",
"user_versions",
"canon_packages_old",
"tea_rank_old"
],
"limit": 200,
"page": 1,
"total_count": 23,
"total_pages": 1
}
```
### Get Table Data
```
GET /tables/{table}
```
Returns paginated data from the specified table.
**Path Parameters**
- `table`: Name of the table to query (see available tables in List Tables response)
**Query Parameters**
- `page` (optional): Page number (default: 1)
- `limit` (optional): Number of items per page (default: 200)
**Response**
```json
{
"table": "packages",
"total_count": 166459,
"page": 1,
"limit": 2,
"total_pages": 83230,
"columns": [
...
],
"data": [
{
"created_at": "2024-12-27 08:04:03.991832",
"derived_id": "...",
"id": "...",
"import_id": "...",
"name": "...",
"package_manager_id": "...",
"readme": "...",
"updated_at": "2024-12-27 08:04:03.991832"
},
...
]
}
```
### Get Table Row By ID
```
GET /tables/{table}/{id}
```
Returns a specific row from the table by its UUID.
**Path Parameters**
- `table`: Name of the table to query
- `id`: UUID of the row to fetch
**Response**
```json
{
"created_at": "2024-12-27 08:04:03.991832",
"derived_id": "...",
"id": "...",
"import_id": "...",
"name": "...",
"package_manager_id": "...",
"readme": "...",
"updated_at": "2024-12-27 08:04:03.991832"
}
```
### Get Project
```
GET /project/{id}
```
Returns detailed information about a specific canon by its canonical ID.
**Path Parameters**
- `id`: UUID of the project (canon) to fetch
**Response**
```json
{
"projectId": "550e8400-e29b-41d4-a716-446655440000",
"homepage": "https://example.com",
"name": "example-project",
"source": "https://github.com/example/project",
"teaRank": "150",
"teaRankCalculatedAt": "2024-12-27T08:04:03.991832",
"packageManagers": ["homebrew", "crates"]
}
```
**Response (Not Found)**
```json
{
"error": "No row found with id '550e8400-e29b-41d4-a716-446655440000' in table canons"
}
```
### Get Projects Batch
```
POST /project/batch
```
Returns detailed information about multiple projects by their canonical IDs.
**Request Body**
```json
{
"projectIds": ["uuid1", "uuid2", "..."]
}
```
**Parameters**
- `projectIds`: Array of project UUIDs to fetch (required, must be non-empty)
**Example**
```
POST /project/batch
```
**Example Request**
```bash
curl -X POST http://localhost:8080/project/batch \
-H "Content-Type: application/json" \
-d '{
"projectIds": [
"550e8400-e29b-41d4-a716-446655440000",
"6ba7b810-9dad-11d1-80b4-00c04fd430c8"
]
}'
```
**Response**
```json
[
{
"projectId": "550e8400-e29b-41d4-a716-446655440000",
"homepage": "https://example.com",
"name": "example-project",
"source": "https://github.com/example/project",
"teaRank": "150",
"teaRankCalculatedAt": "2024-12-27T08:04:03.991832",
"packageManagers": ["homebrew", "crates"]
},
{
"projectId": "6ba7b810-9dad-11d1-80b4-00c04fd430c8",
"homepage": "https://another-example.com",
"name": "another-project",
"source": "https://github.com/another/project",
"teaRank": "75",
"teaRankCalculatedAt": "2024-12-26T10:15:22.123456",
"packageManagers": ["debian", "pkgx"]
}
]
```
**Response (Invalid UUIDs)**
```json
{
"error": "Invalid UUID format in project IDs"
}
```
### Search Projects
```
GET /project/search/{name}
```
Searches for projects by name using case-insensitive partial matching. Results are
ordered by name length and limited to 10 items.
**Path Parameters**
- `name`: Project name to search for (partial matches supported)
**Example**
```
GET /project/search/python
```
**Response**
```json
[
{
"projectId": "550e8400-e29b-41d4-a716-446655440000",
"homepage": "https://reactjs.org",
"name": "react",
"source": "https://github.com/facebook/react",
"packageManagers": ["homebrew", "npm"]
},
{
"projectId": "6ba7b810-9dad-11d1-80b4-00c04fd430c8",
"homepage": "https://reactrouter.com",
"name": "react-router",
"source": "https://github.com/remix-run/react-router",
"packageManagers": ["npm"]
}
]
```
**Response (Empty Search)**
```json
{
"error": "Search name cannot be empty"
}
```
### Leaderboard
```
POST /leaderboard
```
Returns detailed information about specified projects, ordered by tea rank in descending
order. This endpoint allows filtering by project IDs and limiting the number of results.
**Request Body**
```json
{
"projectIds": ["uuid1", "uuid2", "..."],
"limit": 10
}
```
**Parameters**
- `projectIds`: Optional array of project UUIDs to rank (max 1000); omit to get the global top projects
- `limit`: Maximum number of results to return (required; values are clamped to the range 1-1000)
**Example Request**
```bash
curl -X POST http://localhost:8080/leaderboard \
-H "Content-Type: application/json" \
-d '{
"projectIds": [
"1e233f1b-2b49-4ada-9953-1763785fba2c",
"2c24aa45-4fe2-4f2b-ae58-09d4b9a4ad28"
],
"limit": 2
}'
```
**Response**
```json
[
{
"projectId": "1e233f1b-2b49-4ada-9953-1763785fba2c",
"homepage": "https://example.com",
"name": "example-project",
"source": "https://github.com/example/project",
"teaRank": "150",
"teaRankCalculatedAt": "2024-12-27T08:04:03.991832",
"packageManagers": ["homebrew", "crates"]
},
{
"projectId": "2c24aa45-4fe2-4f2b-ae58-09d4b9a4ad28",
"homepage": "https://another-example.com",
"name": "another-project",
"source": "https://github.com/another/project",
"teaRank": "75",
"teaRankCalculatedAt": "2024-12-26T10:15:22.123456",
"packageManagers": ["debian", "pkgx"]
}
]
```
**Response (Validation Errors)**
```json
{
"error": "At least one project ID is required"
}
```
```json
{
"error": "Too many project IDs (maximum 100 allowed)"
}
```
```json
{
"error": "Invalid limit 150: must be between 1 and 100"
}
```
## Available Tables
The database contains the following tables:
| Table Name | Description |
| ---------------- | ------------------------------------------------ |
| alembic_version | Store the current version of alembic |
| dependencies | Package dependencies |
| depends_on_types | Types of package dependencies |
| licenses | Package licenses |
| load_history | Load history |
| package_managers | Package manager information |
| package_urls | Relationship of packages to URLs |
| packages | Package metadata |
| sources | Package manager sources (homebrew, crates, etc.) |
| url_types | Types of URLs (homepage, repository, etc.) |
| urls | Actual URLs |
| user_packages | User-package relationships |
| user_versions | User-version relationships |
| users | User (package owner) information |
| versions | Package versions |
By default, the API will be available at `http://localhost:8080`.
## Deployment
The CHAI API is deployed using AWS services with the following stack:
- **Amazon ECR (Elastic Container Registry)** - Container image storage
- **Amazon ECS (Elastic Container Service)** - Container orchestration
- **ECS Service** - Manages running tasks and load balancing
- **ECS Task Definition** - Defines container configuration
### Prerequisites
- AWS CLI configured with appropriate permissions
- Docker installed locally
- Access to the AWS account and ECR repository
### Building and Pushing Docker Image
1. **Get ECR login credentials:**
```bash
aws ecr get-login-password --region <your-region> | docker login --username AWS --password-stdin <account-id>.dkr.ecr.<your-region>.amazonaws.com
```
2. **Build the Docker image:**
```bash
docker build -t chai-api .
```
3. **Tag the image for ECR:**
```bash
docker tag chai-api:latest <account-id>.dkr.ecr.<your-region>.amazonaws.com/chai-api:latest
```
4. **Push the image to ECR:**
```bash
docker push <account-id>.dkr.ecr.<your-region>.amazonaws.com/chai-api:latest
```
> **Note:** Replace `<account-id>` and `<your-region>` with your AWS account ID and region. You can find the exact commands in your ECR repository console under "View push commands".
### Updating Existing ECS Service
If updating the ECS service, you first need to Build and Push the docker image. Then:
```bash
aws ecs update-service --cluster chai-<environment> --service <environment>-chai-api --force-new-deployment
```
### Environment Variables
Ensure the following environment variables are configured in your task definition:
- `DATABASE_URL`: PostgreSQL connection string
- `HOST`: Host to bind to (default: "0.0.0.0")
- `PORT`: Port to listen on (default: "8080")
### Useful AWS Documentation
- [Amazon ECR User Guide](https://docs.aws.amazon.com/ecr/)
- [Amazon ECS Developer Guide](https://docs.aws.amazon.com/ecs/)
- [ECS Task Definitions](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_definitions.html)
- [ECS Services](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs_services.html)
- [AWS CLI ECS Commands](https://docs.aws.amazon.com/cli/latest/reference/ecs/)
## Tasks
### Format
```bash
cargo fmt --all --
```
### Build
```bash
cargo build --release
```
### Validate
```bash
cargo clippy --all-targets --all-features -- -D warnings
```
### Run
Env: DATABASE_URL=postgresql://postgres:s3cr3t@localhost:5435/chai
```bash
target/release/chai-api
```
================================================
FILE: api/src/app_state.rs
================================================
use dashmap::DashMap;
use deadpool_postgres::Pool;
use serde_json::Value;
use std::sync::Arc;
use std::time::{Duration, Instant};
use uuid::Uuid;
// Cached project entries live this long before being refetched from the DB.
const TTL: Duration = Duration::from_secs(3600); // 1 hour

/// One cached project JSON payload plus the instant it was inserted.
#[derive(Clone)]
pub struct ProjectCacheEntry {
    /// The serialized project row; `Arc` makes clones cheap.
    pub data: Arc<Value>,
    /// Insertion time, compared against `TTL` for expiry.
    pub created_at: Instant,
}

impl ProjectCacheEntry {
    /// Wraps a freshly queried project value, stamped with the current time.
    pub fn new(data: Value) -> Self {
        Self {
            data: Arc::new(data),
            created_at: Instant::now(),
        }
    }

    /// True once the entry has outlived `TTL` and should be refetched.
    pub fn is_expired(&self) -> bool {
        self.created_at.elapsed() > TTL
    }
}

/// Shared application state handed to every actix handler via `web::Data`.
pub struct AppState {
    /// Postgres connection pool.
    pub pool: Pool,
    /// Table names discovered at startup (used by handlers to whitelist
    /// table-name interpolation into SQL).
    pub tables: Arc<Vec<String>>,
    /// TTL cache of project JSON keyed by canon UUID.
    pub project_cache: Arc<DashMap<Uuid, ProjectCacheEntry>>,
}
================================================
FILE: api/src/db.rs
================================================
use deadpool_postgres::{Config, Pool, Runtime};
use std::env;
use std::sync::Arc;
use tokio_postgres::{Client, NoTls};
use url::Url;
/// Builds a deadpool-postgres pool from the `DATABASE_URL` environment variable.
///
/// Panics (via `expect`) when the variable is missing or unparsable — this
/// runs once at startup, so failing fast is intentional.
pub async fn create_pool() -> Pool {
    let database_url = env::var("DATABASE_URL").expect("DATABASE_URL must be set");
    let db_url = Url::parse(&database_url).expect("Invalid database URL");
    let mut config = Config::new();
    config.host = db_url.host_str().map(ToOwned::to_owned);
    config.port = db_url.port();
    // NOTE(review): Url::username()/password() return the raw (still
    // percent-encoded) text; credentials containing URL-special characters
    // would need decoding here — confirm against deployed credentials.
    config.user = Some(db_url.username().to_owned());
    config.password = db_url.password().map(ToOwned::to_owned);
    // Url::path() keeps the leading '/', so strip it to get the database name.
    config.dbname = db_url.path().strip_prefix('/').map(ToOwned::to_owned);
    config
        .create_pool(Some(Runtime::Tokio1), NoTls)
        .expect("Failed to create pool")
}
/// Returns the names of all tables in the `public` schema.
///
/// Panics if the catalog query fails; this is only called at startup.
pub async fn get_tables(client: &Client) -> Vec<String> {
    let catalog_sql =
        "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'";
    let records = client
        .query(catalog_sql, &[])
        .await
        .expect("Failed to fetch tables");
    let mut names = Vec::with_capacity(records.len());
    for record in records {
        names.push(record.get::<_, String>("table_name"));
    }
    names
}
/// Creates the connection pool and eagerly loads the table-name list that
/// handlers use as a whitelist.
///
/// Panics if no connection can be checked out at startup.
pub async fn initialize_db() -> (Pool, Arc<Vec<String>>) {
    let pool = create_pool().await;
    let client = pool.get().await.expect("Failed to get client from pool");
    let tables = Arc::new(get_tables(&client).await);
    (pool, tables)
}
================================================
FILE: api/src/handlers.rs
================================================
use actix_web::{get, post, web, HttpResponse, Responder};
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::sync::Arc;
use tokio_postgres::error::SqlState;
use uuid::Uuid;
use crate::app_state::AppState;
use crate::utils::{get_cached_projects, get_column_names, rows_to_json, Pagination};
// Hard cap on the leaderboard limit and on the number of project IDs accepted.
const RESPONSE_LIMIT: i64 = 1000;

/// Query-string paging parameters (`?page=&limit=`); both optional.
#[derive(Deserialize)]
pub struct PaginationParams {
    pub page: Option<i64>,
    pub limit: Option<i64>,
}

/// Response envelope for paginated table dumps (`GET /tables/{table}`).
#[derive(Serialize)]
struct PaginatedResponse {
    table: String,
    total_count: i64,
    page: i64,
    limit: i64,
    total_pages: i64,
    columns: Vec<String>,
    data: Vec<Value>,
}

/// Body of `POST /leaderboard`; omitting `projectIds` requests the global top list.
#[derive(Deserialize)]
pub struct LeaderboardRequest {
    #[serde(rename = "projectIds")]
    pub project_ids: Option<Vec<Uuid>>,
    pub limit: i64,
}

/// Body of `POST /project/batch`.
#[derive(Deserialize)]
pub struct ProjectBatchRequest {
    #[serde(rename = "projectIds")]
    pub project_ids: Vec<Uuid>,
}
/// Returns `Some(404 response)` when `table` is not in the startup whitelist,
/// `None` when it is known.
///
/// This guard is what keeps the `format!`-built SQL in the table handlers
/// safe: only names discovered from `information_schema` ever reach a query.
pub fn check_table_exists(table: &str, tables: &[String]) -> Option<HttpResponse> {
    // `iter().any` compares &str against &String directly, avoiding the
    // temporary String allocation the old `contains(&table.to_string())` made.
    if tables.iter().any(|t| t == table) {
        return None;
    }
    Some(HttpResponse::NotFound().json(json!({
        "error": format!("Table '{}' not found", table),
        "valid_tables": tables,
        "help": "Refer to the API documentation for valid table names."
    })))
}
/// GET /tables — paginated list of the table names cached at startup.
#[get("/tables")]
pub async fn list_tables(
    query: web::Query<PaginationParams>,
    data: web::Data<AppState>,
) -> impl Responder {
    let total_count = data.tables.len() as i64;
    let pagination = Pagination::new(query, total_count);
    // Clamp BOTH bounds: previously only `end` was clamped, so a page past
    // the end produced start > end and the slice below panicked.
    let start = (pagination.offset as usize).min(data.tables.len());
    let end = (start + pagination.limit as usize).min(data.tables.len());
    let paginated_tables = &data.tables[start..end];
    HttpResponse::Ok().json(json!({
        "total_count": total_count,
        "page": pagination.page,
        "limit": pagination.limit,
        "total_pages": pagination.total_pages,
        "data": paginated_tables,
    }))
}
/// GET /heartbeat — verifies that a pooled connection can run a trivial query.
#[get("/heartbeat")]
pub async fn heartbeat(data: web::Data<AppState>) -> impl Responder {
    // Check out a connection; failure here means the pool/DB is unreachable.
    let client = match data.pool.get().await {
        Ok(conn) => conn,
        Err(e) => {
            log::error!("Failed to get database connection: {e}");
            return HttpResponse::InternalServerError().body("Failed to get database connection");
        }
    };
    // Round-trip a trivial query to prove the connection is actually usable.
    if let Err(e) = client.query_one("SELECT 1", &[]).await {
        log::error!("Database query failed: {e}");
        return HttpResponse::InternalServerError().body("Database query failed");
    }
    HttpResponse::Ok().body("OK - Database connection is healthy")
}
/// GET /tables/{table} — paginated dump of an arbitrary whitelisted table.
///
/// Responds 404 for unknown tables, 500 on query/pool failure.
#[get("/tables/{table}")]
pub async fn get_table(
    path: web::Path<String>,
    query: web::Query<PaginationParams>,
    data: web::Data<AppState>,
) -> impl Responder {
    let table = path.into_inner();
    // Whitelist check: only table names discovered at startup may reach the
    // format!-interpolated SQL below, which is what prevents SQL injection.
    if let Some(response) = check_table_exists(&table, &data.tables) {
        return response;
    }
    let count_query = format!("SELECT COUNT(*) FROM {table}");
    match data.pool.get().await {
        Ok(client) => match client.query_one(&count_query, &[]).await {
            Ok(count_row) => {
                let total_count: i64 = count_row.get(0);
                let pagination = Pagination::new(query, total_count);
                // NOTE(review): no ORDER BY, so row order across pages is not
                // guaranteed stable between requests — confirm acceptable.
                let data_query = format!("SELECT * FROM {table} LIMIT $1 OFFSET $2");
                match client
                    .query(&data_query, &[&pagination.limit, &pagination.offset])
                    .await
                {
                    Ok(rows) => {
                        let columns = get_column_names(&rows);
                        let data = rows_to_json(&rows);
                        let response = PaginatedResponse {
                            table,
                            total_count,
                            page: pagination.page,
                            limit: pagination.limit,
                            total_pages: pagination.total_pages,
                            columns,
                            data,
                        };
                        HttpResponse::Ok().json(response)
                    }
                    Err(e) => {
                        log::error!("Database query error: {e}");
                        HttpResponse::InternalServerError().json(json!({
                            "error": "An error occurred while querying the database"
                        }))
                    }
                }
            }
            Err(e) => {
                log::error!("Database count query error: {e}");
                HttpResponse::InternalServerError().json(json!({
                    "error": "An error occurred while counting rows in the database"
                }))
            }
        },
        Err(e) => {
            log::error!("Failed to get database connection: {e}");
            HttpResponse::InternalServerError().body("Failed to get database connection")
        }
    }
}
/// GET /tables/{table}/{id} — fetch a single row by UUID primary key.
///
/// Responds 404 when the table is unknown or no row matches, 500 otherwise.
#[get("/tables/{table}/{id}")]
pub async fn get_table_row(
    path: web::Path<(String, Uuid)>,
    data: web::Data<AppState>,
) -> impl Responder {
    let (table_name, id) = path.into_inner();
    // Whitelist check makes the format!-interpolated table name safe.
    if let Some(response) = check_table_exists(&table_name, &data.tables) {
        return response;
    }
    let query = format!("SELECT * FROM {table_name} WHERE id = $1");
    match data.pool.get().await {
        // `query_opt` reports "no rows" as Ok(None). The previous `query_one`
        // version surfaced zero rows as a client-side error with no server
        // SQLSTATE, so its NO_DATA_FOUND check never matched and a missing
        // row returned 500 instead of the intended 404.
        Ok(client) => match client.query_opt(&query, &[&id]).await {
            Ok(Some(row)) => {
                let json = rows_to_json(&[row]);
                let value = json.first().unwrap();
                HttpResponse::Ok().json(value)
            }
            Ok(None) => HttpResponse::NotFound().json(json!({
                "error": format!("No row found with id '{}' in table '{}'", id, table_name)
            })),
            Err(e) => {
                if e.as_db_error()
                    .is_some_and(|db_err| db_err.code() == &SqlState::UNDEFINED_TABLE)
                {
                    HttpResponse::NotFound().json(json!({
                        "error": format!("Table '{}' not found", table_name)
                    }))
                } else {
                    HttpResponse::InternalServerError().json(json!({
                        "error": format!("Database error: {}", e)
                    }))
                }
            }
        },
        Err(e) => {
            log::error!("Failed to get database connection: {e}");
            HttpResponse::InternalServerError().body("Failed to get database connection")
        }
    }
}
/// GET /project/{id} — detailed view of one canon: homepage, latest tea rank,
/// package managers, and dependency/dependent counts.
#[get("/project/{id}")]
pub async fn get_project(path: web::Path<Uuid>, data: web::Data<AppState>) -> impl Responder {
    // Check if the table exists
    let id = path.into_inner();
    // Construct the query.
    // `base` gathers per-canon facts (latest rank via LATERAL, aggregated
    // package-manager types, dependency counts); the outer SELECT joins in a
    // deterministic 'source' URL and DISTINCT ON (b.id) keeps one row.
    let query = r#"
    WITH base AS MATERIALIZED (
        SELECT
            c.id,
            u_homepage.url AS homepage,
            c.name,
            COALESCE(tr_latest.rank, '0') AS "teaRank",
            tr_latest.created_at AS "teaRankCalculatedAt",
            (
                SELECT ARRAY_AGG(DISTINCT s.type)
                FROM canon_packages cp2
                JOIN packages p2 ON cp2.package_id = p2.id
                JOIN package_managers pm2 ON p2.package_manager_id = pm2.id
                JOIN sources s ON pm2.source_id = s.id
                WHERE cp2.canon_id = c.id
            ) AS "packageManagers",
            (
                SELECT COUNT(*)::bigint
                FROM legacy_dependencies ld
                JOIN canon_packages cp_out ON cp_out.package_id = ld.package_id
                WHERE cp_out.canon_id = c.id
            ) AS "dependenciesCount",
            (
                SELECT COUNT(*)::bigint
                FROM legacy_dependencies ld
                JOIN canon_packages cp_in ON cp_in.package_id = ld.dependency_id
                WHERE cp_in.canon_id = c.id
            ) AS "dependentsCount"
        FROM canons c
        JOIN urls u_homepage ON c.url_id = u_homepage.id
        LEFT JOIN LATERAL (
            SELECT tr.rank, tr.created_at
            FROM tea_ranks tr
            WHERE tr.canon_id = c.id
            ORDER BY tr.created_at DESC
            LIMIT 1
        ) tr_latest ON TRUE
        WHERE c.id = $1
    )
    SELECT DISTINCT ON (b.id)
        b.id AS "projectId",
        b.homepage,
        b.name,
        u_source.url AS source,
        b."teaRank",
        b."teaRankCalculatedAt",
        b."packageManagers",
        b."dependenciesCount",
        b."dependentsCount"
    FROM base b
    JOIN canon_packages cp ON cp.canon_id = b.id
    JOIN package_urls pu ON pu.package_id = cp.package_id
    JOIN urls u_source ON pu.url_id = u_source.id
    JOIN url_types ut ON ut.id = u_source.url_type_id
    WHERE ut.name = 'source'
    ORDER BY b.id, b."teaRankCalculatedAt" DESC, u_source.url;"#;
    match data.pool.get().await {
        Ok(client) => match client.query_one(query, &[&id]).await {
            Ok(row) => {
                let json = rows_to_json(&[row]);
                let value = json.first().unwrap();
                HttpResponse::Ok().json(value)
            }
            Err(e) => {
                // NOTE(review): query_one with zero rows yields a client-side
                // error that has no server SQLSTATE, so this NO_DATA_FOUND
                // branch likely never fires and missing ids return 500 —
                // confirm and consider query_opt (as done in get_table_row).
                if e.as_db_error()
                    .is_some_and(|e| e.code() == &SqlState::NO_DATA_FOUND)
                {
                    HttpResponse::NotFound().json(json!({
                        "error": format!("No row found with id '{:?}' in table canons", id)
                    }))
                } else {
                    HttpResponse::InternalServerError().json(json!({
                        "error": format!("Database error: {}", e)
                    }))
                }
            }
        },
        Err(e) => {
            log::error!("Failed to get database connection: {e}");
            HttpResponse::InternalServerError().body("Failed to get database connection")
        }
    }
}
/// POST /project/batch — project details for a list of canon UUIDs.
///
/// Returns 400 for an empty list. NOTE(review): the README documents a
/// maximum of 100 IDs but no upper bound is enforced here — confirm intent.
#[post("/project/batch")]
pub async fn list_projects_by_id(
    req: web::Json<ProjectBatchRequest>,
    data: web::Data<AppState>,
) -> impl Responder {
    if req.project_ids.is_empty() {
        return HttpResponse::BadRequest().json(json!({
            "error": "No project IDs provided"
        }));
    }
    // Construct the query.
    // DISTINCT ON (c.id) with ORDER BY c.id, tr.created_at DESC keeps, per
    // canon, the row with the most recent tea rank and a deterministic
    // source URL.
    let query = r#"
    SELECT DISTINCT ON (c.id)
        c.id AS "projectId",
        u_homepage.url AS homepage,
        c.name,
        u_source.url AS source,
        COALESCE(tr.rank,'0') AS "teaRank",
        tr.created_at AS "teaRankCalculatedAt",
        (
            SELECT ARRAY_AGG(DISTINCT s.type)
            FROM canon_packages cp2
            JOIN packages p2 ON cp2.package_id = p2.id
            JOIN package_managers pm2 ON p2.package_manager_id = pm2.id
            JOIN sources s ON pm2.source_id = s.id
            WHERE cp2.canon_id = c.id
        ) AS "packageManagers"
    FROM canons c
    JOIN urls u_homepage ON u_homepage.id = c.url_id
    JOIN canon_packages cp ON cp.canon_id = c.id
    JOIN package_urls pu ON pu.package_id = cp.package_id
    JOIN urls u_source ON pu.url_id = u_source.id
    JOIN url_types ut ON ut.id = u_source.url_type_id
    LEFT JOIN tea_ranks tr ON tr.canon_id = c.id
    WHERE c.id = ANY($1::uuid[]) AND ut.name = 'source'
    ORDER BY c.id, tr.created_at DESC, u_source.url;"#;
    match data.pool.get().await {
        Ok(client) => match client.query(query, &[&req.project_ids]).await {
            Ok(rows) => {
                let json = rows_to_json(&rows);
                HttpResponse::Ok().json(json)
            }
            Err(e) => {
                log::error!("Database query error: {e}");
                HttpResponse::InternalServerError().json(json!({
                    "error": format!("Database error: {}", e)
                }))
            }
        },
        Err(e) => {
            log::error!("Failed to get database connection: {e}");
            HttpResponse::InternalServerError().body("Failed to get database connection")
        }
    }
}
/// GET /project/search/{name} — case-insensitive substring search over canon
/// names; shortest names first, capped at 10 results.
#[get("/project/search/{name}")]
pub async fn list_projects_by_name(
    path: web::Path<String>,
    data: web::Data<AppState>,
) -> impl Responder {
    let name = path.into_inner();
    if name.trim().is_empty() {
        return HttpResponse::BadRequest().json(json!({
            "error": "Search name cannot be empty"
        }));
    }
    // Wrap in %...% for ILIKE substring matching; passed as a bind parameter,
    // so the user-supplied name is never interpolated into SQL.
    let wildcard = format!("%{name}%");
    // Construct the query.
    // Inner DISTINCT ON (c.id) collapses to one row per canon; the outer
    // ORDER BY LENGTH(name) prefers the closest (shortest) matches.
    let query = r#"
    SELECT *
    FROM (
        SELECT DISTINCT ON (c.id)
            c.id AS "projectId",
            u_homepage.url AS homepage,
            c.name,
            u_source.url AS source,
            (
                SELECT ARRAY_AGG(DISTINCT s.type)
                FROM canon_packages cp2
                JOIN packages p2 ON cp2.package_id = p2.id
                JOIN package_managers pm2 ON p2.package_manager_id = pm2.id
                JOIN sources s ON pm2.source_id = s.id
                WHERE cp2.canon_id = c.id
            ) AS "packageManagers"
        FROM canons c
        JOIN urls u_homepage ON c.url_id = u_homepage.id
        JOIN canon_packages cp ON cp.canon_id = c.id
        JOIN package_urls pu ON pu.package_id = cp.package_id
        JOIN urls u_source ON pu.url_id = u_source.id
        JOIN url_types ut_source ON ut_source.id = u_source.url_type_id
        WHERE ut_source.name = 'source' AND (c.name ILIKE $1)
        ORDER BY c.id
    ) sub
    ORDER BY LENGTH(name), name
    LIMIT 10;"#;
    match data.pool.get().await {
        Ok(client) => match client.query(query, &[&wildcard]).await {
            Ok(rows) => {
                let json = rows_to_json(&rows);
                HttpResponse::Ok().json(json)
            }
            Err(e) => {
                log::error!("Database query error: {e}");
                HttpResponse::InternalServerError().json(json!({
                    "error": format!("Database error: {e}")
                }))
            }
        },
        Err(e) => {
            log::error!("Failed to get database connection: {e}");
            HttpResponse::InternalServerError().body("Failed to get database connection")
        }
    }
}
/// POST /leaderboard — the requested projects ordered by teaRank descending.
///
/// With no `projectIds` in the body, falls back to the global top list from
/// the latest tea-rank run. Per-project results are served from the 1-hour
/// TTL cache where possible; only cache misses hit the database.
#[post("/leaderboard")]
pub async fn get_leaderboard(
    req: web::Json<LeaderboardRequest>,
    data: web::Data<AppState>,
) -> impl Responder {
    // Clamp rather than reject out-of-range limits.
    let limit = req.limit.clamp(1, RESPONSE_LIMIT);
    let Some(project_ids) = req.project_ids.as_deref() else {
        return get_top_projects(data, limit).await;
    };
    if project_ids.len() > RESPONSE_LIMIT as usize {
        return HttpResponse::BadRequest().json(json!({
            "error": format!("Too many project IDs (maximum {} allowed)", RESPONSE_LIMIT)
        }));
    }
    // Get cached projects and identify missing ones
    let (cached_projects, missing_ids) =
        get_cached_projects(data.project_cache.clone(), project_ids);
    // If we have all projects cached, return them sorted
    if missing_ids.is_empty() {
        return sort_truncate_and_return(cached_projects, limit);
    }
    // Query for missing projects.
    // NOTE(review): the DB filters out rank <= 0 projects, but cached
    // entries are not re-filtered the same way — confirm the asymmetry.
    let query = r#"
    SELECT *
    FROM (
        SELECT DISTINCT ON (c.id)
            c.id AS "projectId",
            u_homepage.url AS homepage,
            c.name,
            u_source.url AS source,
            COALESCE(tr.rank,'0') AS "teaRank",
            tr.created_at AS "teaRankCalculatedAt",
            (
                SELECT ARRAY_AGG(DISTINCT s.type)
                FROM canon_packages cp2
                JOIN packages p2 ON cp2.package_id = p2.id
                JOIN package_managers pm2 ON p2.package_manager_id = pm2.id
                JOIN sources s ON pm2.source_id = s.id
                WHERE cp2.canon_id = c.id
            ) AS "packageManagers"
        FROM canons c
        JOIN urls u_homepage ON c.url_id = u_homepage.id
        JOIN canon_packages cp ON cp.canon_id = c.id
        JOIN package_urls pu ON pu.package_id = cp.package_id
        JOIN urls u_source ON pu.url_id = u_source.id
        JOIN url_types ut_source ON ut_source.id = u_source.url_type_id
        LEFT JOIN tea_ranks tr ON tr.canon_id = c.id
        WHERE
            c.id = ANY($1::uuid[])
            AND ut_source.name = 'source'
            AND CAST(tr.rank AS NUMERIC) > 0
        ORDER BY c.id, tr.created_at DESC, u_source.url
    ) sub
    ORDER BY CAST("teaRank" AS NUMERIC) DESC NULLS LAST
    LIMIT $2"#;
    match data.pool.get().await {
        Ok(client) => match client.query(query, &[&missing_ids, &limit]).await {
            Ok(rows) => {
                let fresh_projects = rows_to_json(&rows);
                // Cache the fresh projects so later requests skip the DB.
                for project in &fresh_projects {
                    if let Some(project_id) = project.get("projectId").and_then(|v| v.as_str()) {
                        if let Ok(uuid) = Uuid::parse_str(project_id) {
                            data.project_cache.insert(
                                uuid,
                                crate::app_state::ProjectCacheEntry::new(project.clone()),
                            );
                        } else {
                            log::warn!("Failed to parse project ID as UUID: {}", project_id);
                        }
                    } else {
                        log::warn!("No projectId found in project: {:?}", project);
                    }
                }
                // Combine cached and fresh projects - keep Arc<Value> for cached ones
                let mut all_projects: Vec<Arc<Value>> = cached_projects;
                // Convert fresh projects to Arc<Value> to match the type
                let fresh_arcs: Vec<Arc<Value>> =
                    fresh_projects.into_iter().map(Arc::new).collect();
                all_projects.extend(fresh_arcs);
                sort_truncate_and_return(all_projects, limit)
            }
            Err(e) => {
                log::error!("Database query error: {e}");
                HttpResponse::InternalServerError().json(json!({
                    "error": format!("Database error: {}", e)
                }))
            }
        },
        Err(e) => {
            log::error!("Failed to get database connection: {e}");
            HttpResponse::InternalServerError().body("Failed to get database connection")
        }
    }
}
/// Sorts `projects` by `teaRank` descending, truncates to `limit`, and
/// serializes them as the HTTP 200 response body.
fn sort_truncate_and_return(projects: Vec<Arc<Value>>, limit: i64) -> actix_web::HttpResponse {
    let mut projects = projects;
    // teaRank is serialized as a string. Parse it as f64 — the SQL casts
    // rank AS NUMERIC, so values may be fractional; the old parse::<i64>()
    // silently turned "150.5" into 0 and mis-sorted such projects.
    let rank_of = |v: &Value| -> f64 {
        v.get("teaRank")
            .and_then(|v| v.as_str())
            .and_then(|s| s.parse::<f64>().ok())
            .unwrap_or(0.0)
    };
    // Descending order; total_cmp provides a total order (NaN-safe).
    projects.sort_by(|a, b| rank_of(b).total_cmp(&rank_of(a)));
    // Apply limit
    projects.truncate(limit as usize);
    // Convert to Vec<Value> only for the final response - Arc<Value> doesn't implement Serialize
    let final_projects: Vec<Value> = projects
        .into_iter()
        .map(|arc_val| (*arc_val).clone())
        .collect();
    actix_web::HttpResponse::Ok().json(final_projects)
}
/// Fallback for POST /leaderboard without `projectIds`: the top-ranked
/// projects from the most recent tea-rank run.
async fn get_top_projects(data: web::Data<AppState>, limit: i64) -> HttpResponse {
    // get client
    let Ok(client) = data.pool.get().await else {
        return HttpResponse::InternalServerError().body("Failed to get database connection");
    };
    // get latest run id
    let run_query = r#"SELECT MAX(run) from tea_rank_runs"#;
    let Ok(run_row) = client.query_one(run_query, &[]).await else {
        return HttpResponse::InternalServerError().body("Failed to get latest run");
    };
    // MAX(...) over an empty table is SQL NULL; decoding it as a bare i32
    // panics, so read an Option and return an empty leaderboard instead.
    let run: Option<i32> = run_row.get(0);
    let Some(run) = run else {
        return HttpResponse::Ok().json(json!([]));
    };
    // get top projects (1-RESPONSE_LIMIT)
    let top_ranks_query = r#"SELECT
        canon_id as "projectId",
        name,
        rank as "teaRank",
        (
            SELECT ARRAY_AGG(DISTINCT s.type)
            FROM canon_packages cp2
            JOIN packages p2 ON cp2.package_id = p2.id
            JOIN package_managers pm2 ON p2.package_manager_id = pm2.id
            JOIN sources s ON pm2.source_id = s.id
            WHERE cp2.canon_id = canon_id
        ) AS "packageManagers"
    FROM
        tea_ranks
    JOIN canons ON canon_id = canons.id
    WHERE
        tea_rank_run = $1
    ORDER BY
        rank DESC
    LIMIT $2"#;
    let Ok(top_ranks) = client
        .query(top_ranks_query, &[&run, &limit.clamp(1, RESPONSE_LIMIT)])
        .await
    else {
        return HttpResponse::InternalServerError().json(json!({
            "error": "Failed to fetch top ranks"
        }));
    };
    let json = rows_to_json(&top_ranks);
    HttpResponse::Ok().json(json)
}
================================================
FILE: api/src/logging.rs
================================================
use env_logger::Env;
/// Initializes env_logger; `RUST_LOG` overrides the default "info" level.
pub fn setup_logger() {
    env_logger::init_from_env(Env::default().default_filter_or("info"));
}

/// Thin factory for the actix access-log middleware.
pub struct Logger;

impl Logger {
    /// Access-log format: client addr, request line, status, bytes sent,
    /// referer, user agent, and request handling time in seconds.
    pub fn default() -> actix_web::middleware::Logger {
        actix_web::middleware::Logger::new("%a '%r' %s %b '%{Referer}i' '%{User-Agent}i' %T")
    }
}
================================================
FILE: api/src/main.rs
================================================
mod app_state;
mod db;
mod handlers;
mod logging;
mod utils;
use actix_web::{web, App, HttpServer};
use dashmap::DashMap;
use dotenv::dotenv;
use std::env;
use std::sync::Arc;
use crate::app_state::AppState;
use crate::handlers::{
get_leaderboard, get_project, get_table, get_table_row, heartbeat, list_projects_by_id,
list_projects_by_name, list_tables,
};
use crate::logging::setup_logger;
/// Entrypoint: loads `.env`, configures logging, discovers the table
/// whitelist, and serves the HTTP API.
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    dotenv().ok();
    setup_logger();
    // Bind address defaults to 0.0.0.0:8080; overridable via HOST/PORT.
    let host = env::var("HOST").unwrap_or_else(|_| "0.0.0.0".to_string());
    let port = env::var("PORT").unwrap_or_else(|_| "8080".to_string());
    let bind_address = format!("{host}:{port}");
    let (pool, tables) = db::initialize_db().await;
    // Cache for project data to reduce database load on leaderboard routes
    let project_cache = Arc::new(DashMap::new());
    log::info!("Available tables: {tables:?}");
    log::info!("Starting server at http://{bind_address}");
    // The closure runs once per worker; pool/tables/cache are shared via
    // clones of the same underlying handles.
    HttpServer::new(move || {
        App::new()
            .wrap(logging::Logger::default())
            .app_data(web::Data::new(AppState {
                pool: pool.clone(),
                tables: Arc::clone(&tables),
                project_cache: Arc::clone(&project_cache),
            }))
            // HEALTH
            .service(heartbeat)
            // SIMPLE CRUD OPERATIONS
            .service(list_tables)
            .service(get_table)
            .service(get_table_row)
            // BUSINESS LOGIC
            .service(get_leaderboard)
            .service(get_project)
            .service(list_projects_by_id)
            .service(list_projects_by_name)
    })
    .bind(&bind_address)?
    .run()
    .await
}
================================================
FILE: api/src/utils.rs
================================================
use actix_web::web::Query;
use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
use dashmap::DashMap;
use serde_json::{json, Value};
use std::sync::Arc;
use tokio_postgres::{types::Type, Row};
use uuid::Uuid;
use crate::{app_state::ProjectCacheEntry, handlers::PaginationParams};
/// Column names of the first row, or an empty vec when there are no rows.
pub fn get_column_names(rows: &[Row]) -> Vec<String> {
    rows.first().map_or_else(Vec::new, |row| {
        row.columns()
            .iter()
            .map(|col| col.name().to_string())
            .collect()
    })
}
/// Collapse a `Result<Option<T>, E>` into JSON: `Ok(Some(v))` serializes `v`;
/// `Ok(None)` and `Err(_)` both map to `Value::Null`.
pub fn convert_optional_to_json<T, E>(result: Result<Option<T>, E>) -> Value
where
    T: serde::Serialize,
{
    if let Ok(Some(inner)) = result {
        json!(inner)
    } else {
        Value::Null
    }
}
/// Serialize Postgres rows into JSON objects, one map per row.
///
/// Each column is decoded according to its declared Postgres type; SQL NULLs
/// (and any value that fails to decode) become `Value::Null` via
/// `convert_optional_to_json`.
pub fn rows_to_json(rows: &[Row]) -> Vec<Value> {
    rows.iter()
        .map(|row| {
            let mut map = serde_json::Map::new();
            for (i, column) in row.columns().iter().enumerate() {
                let value: Value = match *column.type_() {
                    // integer / float / bool scalars
                    Type::INT2 => convert_optional_to_json(row.try_get::<_, Option<i16>>(i)),
                    Type::INT4 => convert_optional_to_json(row.try_get::<_, Option<i32>>(i)),
                    Type::INT8 => convert_optional_to_json(row.try_get::<_, Option<i64>>(i)),
                    Type::FLOAT4 => convert_optional_to_json(row.try_get::<_, Option<f32>>(i)),
                    Type::FLOAT8 => convert_optional_to_json(row.try_get::<_, Option<f64>>(i)),
                    Type::BOOL => convert_optional_to_json(row.try_get::<_, Option<bool>>(i)),
                    // character types all decode as String
                    Type::VARCHAR | Type::TEXT | Type::BPCHAR => {
                        convert_optional_to_json(row.try_get::<_, Option<String>>(i))
                    }
                    // temporal types via chrono
                    Type::TIMESTAMP => {
                        convert_optional_to_json(row.try_get::<_, Option<NaiveDateTime>>(i))
                    }
                    Type::TIMESTAMPTZ => {
                        convert_optional_to_json(row.try_get::<_, Option<DateTime<Utc>>>(i))
                    }
                    Type::DATE => convert_optional_to_json(row.try_get::<_, Option<NaiveDate>>(i)),
                    // json/jsonb pass through untouched
                    Type::JSON | Type::JSONB => {
                        convert_optional_to_json(row.try_get::<_, Option<serde_json::Value>>(i))
                    }
                    Type::UUID => convert_optional_to_json(row.try_get::<_, Option<Uuid>>(i)),
                    Type::TEXT_ARRAY | Type::VARCHAR_ARRAY => {
                        convert_optional_to_json(row.try_get::<_, Option<Vec<String>>>(i))
                    }
                    _ => {
                        // For unsupported types, try to convert to string
                        convert_optional_to_json(row.try_get::<_, Option<String>>(i))
                    }
                };
                map.insert(column.name().to_string(), value);
            }
            Value::Object(map)
        })
        .collect()
}
/// Pagination state derived from query parameters and a total row count.
pub struct Pagination {
    pub page: i64,
    pub limit: i64,
    pub offset: i64,
    pub total_pages: i64,
}
impl Pagination {
    /// Build pagination from query params and the total row count.
    ///
    /// `limit` defaults to 200 and is clamped to 1..=1000; `page` defaults to 1
    /// and is clamped to the valid page range; `offset` is the SQL offset for
    /// the chosen page.
    ///
    /// Bug fix: when `total_count` was 0, `total_pages` was 0 and
    /// `page.clamp(1, 0)` panicked (`Ord::clamp` requires min <= max),
    /// turning any request against an empty table into a crash. We now floor
    /// `total_pages` at 1 so an empty table yields an empty first page.
    pub fn new(query: Query<PaginationParams>, total_count: i64) -> Self {
        let limit = query.limit.unwrap_or(200).clamp(1, 1000);
        // ceiling division; at least 1 so the clamp below can never panic
        let total_pages = ((total_count as f64 / limit as f64).ceil() as i64).max(1);
        let page = query.page.unwrap_or(1).clamp(1, total_pages);
        let offset = (page - 1) * limit;
        Self {
            page,
            limit,
            offset,
            total_pages,
        }
    }
}
/// Split `project_ids` into projects already present (and unexpired) in the
/// cache and the ids that still need a database lookup.
pub fn get_cached_projects(
    cache: Arc<DashMap<Uuid, ProjectCacheEntry>>,
    project_ids: &[Uuid],
) -> (Vec<Arc<Value>>, Vec<Uuid>) {
    let mut hits = Vec::new();
    let mut misses = Vec::new();
    for &id in project_ids {
        match cache.get(&id) {
            // fresh cache entry: reuse its data
            Some(entry) if !entry.is_expired() => hits.push(entry.data.clone()),
            // absent or expired: fall through to a DB fetch
            _ => misses.push(id),
        }
    }
    (hits, misses)
}
================================================
FILE: core/README.md
================================================
# Core Tools for CHAI Python Loaders
This directory contains a set of core tools and utilities to facilitate loading the CHAI
database with package manager data, using python helpers. These tools provide a common
foundation for fetching, transforming, and loading data from various package managers
into the database.
In general, the flow of an indexer is:
1. Fetch data from source
2. Fetch data from CHAI
3. Do a giant diff
4. Create new entries, updated entries for each package model in the db
The best example is [Homebrew's](../package_managers/homebrew/main.py).
## Key Components
### 1. [Config](config.py)
Entrypoint for all loaders, generally has all the information needed for the pipeline
to start. Includes:
- Execution flags:
- `FETCH` determines whether we request the data from source
- `TEST` enables a test mode, to test specific portions of the pipeline
- `NO_CACHE` to determine whether we save the intermediate pipeline files
- Package Manager flags
- `pm_id` gets the package manager id from the db, that we'd run the pipeline for
- `source` is the data source for that package manager. `SOURCES` defines the map.
The next 4 configuration classes retrieve the IDs for url types (homepage, documentation,
etc.), dependency types (build, runtime, etc.), user types (crates user, github user),
and all the package manager IDs as well.
### 2. [Database](db.py)
The DB class offers a set of methods for interacting with the database, including:
- Running queries to build a cache for the current state of the graph for a package
manager
- Batching utilities
- Some load functions
### 3. [Fetcher](fetcher.py)
The Fetcher class provides functionality for downloading and extracting data from
package manager sources. It supports:
- Downloading tarball / GZIP / Git files
- Extracting contents to a specified directory
- Maintaining a "latest" symlink so we always know where to look
### 4. [Logger](logger.py)
A custom logging utility that provides consistent logging across all loaders.
### 5. [Models](models/__init__.py)
SQLAlchemy models representing the database schema, including:
- Package, Version, User, License, DependsOn, and other relevant tables
> [!NOTE]
>
> This is currently used to actually generate the migrations as well
### 6. [Scheduler](scheduler.py)
A scheduling utility that allows loaders to run at specified intervals.
### 7. [Transformer](transformer.py)
The Transformer class provides a base for creating package manager-specific transformers.
It includes:
- Methods for locating and reading input files
- Placeholder methods for transforming data into the required format
## Usage
To create a new loader for a package manager:
1. Create a new directory under `package_managers/` for your package manager.
1. Implement a fetcher that inherits from the base Fetcher, that is able to fetch
the raw data from the package manager's source.
1. Implement a custom Transformer class that inherits from the base Transformer, that
figures out how to map the raw data provided by the package managers into the data
model described in the [models](models/__init__.py) module.
1. Load the cache for data currently in CHAI for that package manager
1. Implement a diff to compare them
1. Pass diff objects (lists of new / updated data points) to `db.ingest`
1. Orchestrate via a `main.py`.
Example usage can be found in the [crates](../package_managers/crates) loader.
# TODOs
- [ ] `Diff` currently has separate implementations for Homebrew and Crates, and could
be centralized - open to help here!
================================================
FILE: core/config.py
================================================
from enum import Enum
from sqlalchemy import UUID
from core.db import ConfigDB
from core.logger import Logger
from core.utils import env_vars
logger = Logger("config")
class PackageManager(Enum):
    """Supported package managers; each value matches a `sources.type` name."""
    CRATES = "crates"
    HOMEBREW = "homebrew"
    DEBIAN = "debian"
    NPM = "npm"
    PKGX = "pkgx"
# Pipeline execution flags, read once at import time.
# NOTE(review): env_vars' return type isn't visible from this file — if it
# returns the raw string, a value of "false" would still be truthy; confirm
# it coerces to bool in core/utils.py.
TEST = env_vars("TEST", "false")
FETCH = env_vars("FETCH", "true")
NO_CACHE = env_vars("NO_CACHE", "true")
# Canonical upstream data source for each package manager.
SOURCES = {
    PackageManager.CRATES: "https://static.crates.io/db-dump.tar.gz",
    PackageManager.HOMEBREW: "https://formulae.brew.sh/api/formula.json",
    # for debian, sources contains the urls, packages is tied to the linux distribution
    PackageManager.DEBIAN: [
        "https://ftp.debian.org/debian/dists/stable/main/binary-amd64/Packages.gz",
        "https://ftp.debian.org/debian/dists/stable/main/source/Sources.gz",
    ],
    PackageManager.NPM: "https://registry.npmjs.org/-/all", # fake for now
    PackageManager.PKGX: "https://github.com/pkgxdev/pantry.git",
}
# The configuration classes URLTypes, DependencyTypes, UserTypes, and
# PackageManagers query the DB to get the respective IDs. If the values don't
# exist in the database, they raise an AttributeError (None has no attribute
# "id") at startup.
class ExecConf:
    """Snapshot of the process-level execution flags (TEST / FETCH / NO_CACHE)."""

    test: bool
    fetch: bool
    no_cache: bool

    def __init__(self) -> None:
        # copy the module-level flags so the config object is self-contained
        self.test, self.fetch, self.no_cache = TEST, FETCH, NO_CACHE

    def __str__(self):
        return f"ExecConf(test={self.test},fetch={self.fetch},no_cache={self.no_cache})"
class PMConf:
    """Identity and data source of the package manager being indexed."""
    package_manager: PackageManager
    pm_id: str
    source: str | list[str]
    def __init__(self, pm: PackageManager, db: ConfigDB):
        self.package_manager = pm
        # raises ValueError if the package manager row does not exist
        self.pm_id = db.select_package_manager_by_name(pm.value).id
        self.source = SOURCES[pm]
    def __str__(self):
        return f"PMConf(pm_id={self.pm_id},source={self.source})"
class URLTypes:
    """Resolves the ids of the four URL kinds CHAI tracks."""

    homepage: UUID
    repository: UUID
    documentation: UUID
    source: UUID

    def __init__(self, db: ConfigDB):
        self.load_url_types(db)

    def load_url_types(self, db: ConfigDB) -> None:
        # one lookup per URL kind; a missing row surfaces as AttributeError
        for kind in ("homepage", "repository", "documentation", "source"):
            setattr(self, kind, db.select_url_types_by_name(kind).id)

    def __str__(self) -> str:
        return f"URLs(homepage={self.homepage},repo={self.repository},docs={self.documentation},src={self.source})"
class UserTypes:
    """Resolves the source ids used to attribute users (crates, github)."""
    crates: UUID
    github: UUID
    def __init__(self, db: ConfigDB):
        self.crates = db.select_source_by_name("crates").id
        self.github = db.select_source_by_name("github").id
    def __str__(self) -> str:
        return f"UserTypes(crates={self.crates},github={self.github})"
class DependencyTypes:
    """Resolves the id of every dependency type CHAI distinguishes."""

    build: UUID
    development: UUID
    runtime: UUID
    test: UUID
    optional: UUID
    recommended: UUID

    def __init__(self, db: ConfigDB):
        # one lookup per type name; a missing row surfaces as AttributeError
        names = ("build", "development", "runtime", "test", "optional", "recommended")
        for dep_type in names:
            setattr(self, dep_type, db.select_dependency_type_by_name(dep_type).id)

    def __str__(self) -> str:
        return f"DependencyTypes(build={self.build},development={self.development},runtime={self.runtime},test={self.test},optional={self.optional},recommended={self.recommended})"
class PackageManagers:
    """Resolves the package-manager id for every supported manager."""

    crates: UUID
    homebrew: UUID
    debian: UUID
    npm: UUID
    pkgx: UUID

    def __init__(self, db: ConfigDB):
        # one DB lookup per manager; missing rows raise ValueError in the DB layer
        for manager in ("crates", "homebrew", "debian", "npm", "pkgx"):
            setattr(self, manager, db.select_package_manager_by_name(manager).id)
class Config:
    """Aggregate of everything a loader needs at startup.

    Construction queries the database several times to resolve type ids; a
    missing seed row surfaces immediately as AttributeError or ValueError.
    """
    exec_config: ExecConf
    pm_config: PMConf
    url_types: URLTypes
    user_types: UserTypes
    dependency_types: DependencyTypes
    package_managers: PackageManagers
    def __init__(self, pm: PackageManager) -> None:
        # one ConfigDB connection shared by all the lookups below
        db = ConfigDB()
        self.exec_config = ExecConf()
        self.pm_config = PMConf(pm, db)
        self.url_types = URLTypes(db)
        self.user_types = UserTypes(db)
        self.dependency_types = DependencyTypes(db)
        self.package_managers = PackageManagers(db)
    def __str__(self):
        return f"Config(exec_config={self.exec_config}, pm_config={self.pm_config}, url_types={self.url_types}, user_types={self.user_types}, dependency_types={self.dependency_types}, package_managers={self.package_managers})"
================================================
FILE: core/db.py
================================================
import os
from collections import defaultdict
from datetime import datetime
from typing import Any
from uuid import UUID
from sqlalchemy import Insert, Result, Update, create_engine, select, update
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import Session, sessionmaker
from core.logger import Logger
from core.models import (
URL,
BaseModel,
DependsOnType,
LegacyDependency,
LoadHistory,
Package,
PackageManager,
PackageURL,
Source,
URLType,
)
from core.structs import CurrentGraph, CurrentURLs, URLKey
# Postgres connection string; must be provided via the environment
CHAI_DATABASE_URL = os.getenv("CHAI_DATABASE_URL")
# rows per statement in DB.batch / DB.load
DEFAULT_BATCH_SIZE = 10000
class DB:
    """Shared database layer for CHAI loaders.

    Wraps a SQLAlchemy engine + sessionmaker and provides graph/URL caching
    queries, batching helpers, and the main `ingest` entrypoint.
    """
    def __init__(self, logger_name: str):
        # NOTE(review): CHAI_DATABASE_URL is None when the env var is unset,
        # which would make create_engine fail here — confirm deployments set it
        self.logger = Logger(logger_name)
        self.engine = create_engine(CHAI_DATABASE_URL)
        self.session = sessionmaker(self.engine)
        self.logger.debug("connected")
        # single timestamp for the lifetime of this DB object
        self.now: datetime = datetime.now()
    def insert_load_history(self, package_manager_id: str):
        """Record that a load ran for the given package manager."""
        with self.session() as session:
            session.add(LoadHistory(package_manager_id=package_manager_id))
            session.commit()
    def print_statement(self, stmt):
        """Log a SQLAlchemy statement compiled to literal PostgreSQL text."""
        dialect = postgresql.dialect()
        compiled_stmt = stmt.compile(
            dialect=dialect, compile_kwargs={"literal_binds": True}
        )
        self.logger.log(str(compiled_stmt))
    def close(self):
        """Dispose of the engine's connection pool."""
        self.logger.debug("closing")
        self.engine.dispose()
    def search_names(
        self, package_names: list[str], package_managers: list[UUID]
    ) -> list[str]:
        """Return Homepage URLs for packages with these names, restricted to
        the given package managers; names without a homepage URL are skipped."""
        with self.session() as session:
            results = (
                session.query(Package, URL)
                .join(PackageURL, PackageURL.package_id == Package.id)
                .join(URL, PackageURL.url_id == URL.id)
                .join(URLType, URL.url_type_id == URLType.id)
                .filter(URLType.name == "homepage")
                .filter(Package.name.in_(package_names))
                .filter(Package.package_manager_id.in_(package_managers))
                .all()
            )
            # build a mapping
            name_to_url = {result.Package.name: result.URL.url for result in results}
            # return in the order preserved by the input (bc its relevant)
            # and account for the fact that some names might not have a URL
            return [
                name_to_url.get(name) for name in package_names if name in name_to_url
            ]
    def current_graph(self, package_manager_id: UUID) -> CurrentGraph:
        """Get the packages and dependencies for a specific package manager"""
        # defaultdict factory is effectively unused: every key is assigned below
        package_map: dict[str, Package] = defaultdict(Package)
        dependencies: dict[UUID, set[LegacyDependency]] = defaultdict(set)
        # outer join so packages with no dependencies are still cached
        stmt = (
            select(Package, LegacyDependency)
            .select_from(Package)
            .join(
                LegacyDependency,
                onclause=Package.id == LegacyDependency.package_id,
                isouter=True,
            )
            .where(Package.package_manager_id == package_manager_id)
        )
        with self.session() as session:
            result: Result[tuple[Package, LegacyDependency]] = session.execute(stmt)
            for pkg, dep in result:
                # add to the package map, by import_id, which is usually name
                package_map[pkg.import_id] = pkg
                # and add to the dependencies map as well
                if dep: # check because it's an outer join, so might be None
                    dependencies[pkg.id].add(dep)
        self.logger.debug(f"Cached {len(package_map)} packages")
        return CurrentGraph(package_map, dependencies)
    def _build_current_urls(
        self, result: Result[tuple[Package, PackageURL, URL]]
    ) -> CurrentURLs:
        """Build the CurrentURLs result based on a query of Package, PackageURL, URL"""
        # URLs are keyed by (url, url_type_id), so the same address can appear
        # once per URL type
        url_map: dict[URLKey, URL] = {}
        package_urls: dict[UUID, set[PackageURL]] = {}
        for pkg, pkg_url, url in result:
            url_key = URLKey(url.url, url.url_type_id)
            url_map[url_key] = url
            # since it's a left join, we need to check if pkg is None
            if pkg is not None:
                if pkg.id not in package_urls:
                    package_urls[pkg.id] = set()
                package_urls[pkg.id].add(pkg_url)
        self.logger.debug(f"Cached {len(url_map)} URLs")
        self.logger.debug(f"Cached {len(package_urls)} package URLs")
        return CurrentURLs(url_map=url_map, package_urls=package_urls)
    def current_urls(self, urls: set[str]) -> CurrentURLs:
        """Get the Package URL Relationships for a given set of URLs"""
        # left joins so URLs not linked to any package are still returned
        stmt = (
            select(Package, PackageURL, URL)
            .select_from(URL)
            .join(PackageURL, PackageURL.url_id == URL.id, isouter=True)
            .join(Package, Package.id == PackageURL.package_id, isouter=True)
            .where(URL.url.in_(urls))
        )
        with self.session() as session:
            result: Result[tuple[Package, PackageURL, URL]] = session.execute(stmt)
            return self._build_current_urls(result)
    def all_current_urls(self) -> CurrentURLs:
        """Get all the URLs and the Packages they are tied to from CHAI"""
        stmt = (
            select(Package, PackageURL, URL)
            .select_from(URL)
            .join(PackageURL, PackageURL.url_id == URL.id, isouter=True)
            .join(Package, Package.id == PackageURL.package_id, isouter=True)
        )
        with self.session() as session:
            result: Result[tuple[Package, PackageURL, URL]] = session.execute(stmt)
            return self._build_current_urls(result)
    def load(
        self, session: Session, data: list[BaseModel], stmt: Insert | Update
    ) -> None:
        """Smart batching utility: serialize `data` via to_dict_v2 and run it
        through `stmt` in batches; no-op on an empty list."""
        if data:
            values: list[dict[str, str | UUID | datetime]] = [
                obj.to_dict_v2() for obj in data
            ]
            self.batch(session, stmt, values, DEFAULT_BATCH_SIZE)
    def batch(
        self,
        session: Session,
        stmt: Insert | Update,
        values: list[dict[str, str | UUID | datetime]],
        batch_size: int = DEFAULT_BATCH_SIZE,
    ):
        """
        Utility to batch insert or update, but doesn't commit!
        Inputs:
        - session: the sessionmaker object, so create it before you use it
        - stmt: the insert or update statement to construct, without the values
        - values: the values to insert or update - generally using to_dict_v2()
        - batch_size: the batch size, defaults to 10000
        """
        for i in range(0, len(values), batch_size):
            batch = values[i : i + batch_size]
            self.logger.log(
                f"Processing batch {i // batch_size + 1}/{(len(values) - 1) // batch_size + 1} ({len(batch)})"
            )
            value_stmt = stmt.values(batch)
            session.execute(value_stmt)
    def ingest(
        self,
        new_packages: list[Package],
        new_urls: list[URL],
        new_package_urls: list[PackageURL],
        new_deps: list[LegacyDependency],
        removed_deps: list[LegacyDependency],
        updated_packages: list[dict[str, UUID | str | datetime]],
        updated_package_urls: list[dict[str, UUID | datetime]],
    ) -> None:
        """
        Ingests a list of new, updated, and deleted objects from the database.
        It flushes after each insert, to ensure that the database is in a valid
        state prior to the next batch of ingestions.
        TODO: if pkey is set in the values provided, then sqlalchemy will use
        psycopg2.executemany(...), which is quicker, but still the slowest of all
        execution options provided by psycopg2. The best one is execute_values, which
        is **only** available for inserts, and can be used as follows:
        looks like sqlalchemy^2 has a native support for insertmanyvalues, but
        **I think** we need to pass the data in as a list[dict] instead of objects.
        See: https://docs.sqlalchemy.org/en/20/core/connections.html#engine-insertmanyvalues
        Inputs:
        - new_packages: a list of new Package objects
        - new_urls: a list of new URL objects
        - new_package_urls: a list of new PackageURL objects
        - new_deps: a list of new LegacyDependency objects
        - removed_deps: a list of removed LegacyDependency objects
        - updated_packages: dicts for bulk Package updates (pkey + changed cols)
        - updated_package_urls: dicts for bulk PackageURL updates
        """
        self.logger.log("-" * 100)
        self.logger.log("Going to load")
        self.logger.log(f"New packages: {len(new_packages)}")
        self.logger.log(f"New URLs: {len(new_urls)}")
        self.logger.log(f"New package URLs: {len(new_package_urls)}")
        self.logger.log(f"Updated packages: {len(updated_packages)}")
        self.logger.log(f"Updated package URLs: {len(updated_package_urls)}")
        self.logger.log(f"New dependencies: {len(new_deps)}")
        self.logger.log(f"Removed dependencies: {len(removed_deps)}")
        self.logger.log("-" * 100)
        with self.session() as session:
            try:
                # 1. Add all new objects with granular flushes
                self.execute(session, new_packages, "add", "new packages")
                self.execute(session, new_urls, "add", "new urls")
                self.execute(session, new_package_urls, "add", "new package urls")
                self.execute(session, removed_deps, "delete", "removed dependencies")
                self.execute(session, new_deps, "add", "new dependencies")
                # 2. Perform updates (these will now operate on a flushed state)
                if updated_packages:
                    session.execute(update(Package), updated_packages)
                if updated_package_urls:
                    session.execute(update(PackageURL), updated_package_urls)
                # 3. Commit all changes
                session.commit()
                self.logger.log("✅ Successfully ingested")
            except Exception as e:
                self.logger.error(f"Error during batched ingest: {e}")
                session.rollback()
                raise e
    def execute(self, session: Session, data: list[Any], method: str, log: str) -> None:
        """Bulk-add or bulk-delete `data`, then flush; `log` labels the batch."""
        if method not in ["add", "delete"]:
            raise ValueError(f"db.execute({method}) is unknown")
        if data:
            match method:
                case "add":
                    session.add_all(data)
                case "delete":
                    self.remove_all(session, data)
                case _:
                    # unreachable: method was validated above
                    pass
            session.flush()
            self.logger.log(f"✅ {len(data):,} {log}")
    def remove_all(self, session: Session, data: list[Any]) -> None:
        """Mark every item in `data` for deletion on the given session."""
        for item in data:
            session.delete(item)
class ConfigDB(DB):
    """Read-only lookups for seed/config rows (types, sources, managers)."""
    def __init__(self):
        super().__init__("ConfigDB")
    def select_package_manager_by_name(self, package_manager: str) -> PackageManager:
        """Look up a package manager via its Source.type; raises ValueError if absent."""
        with self.session() as session:
            result = (
                session.query(PackageManager)
                .join(Source, PackageManager.source_id == Source.id)
                .filter(Source.type == package_manager)
                .first()
            )
            if result:
                return result
            raise ValueError(f"Package manager {package_manager} not found")
    def select_url_types_by_name(self, name: str) -> URLType:
        # NOTE(review): returns None when the row is missing — callers then hit
        # AttributeError on `.id` (see the comment in core/config.py)
        with self.session() as session:
            return session.query(URLType).filter(URLType.name == name).first()
    def select_source_by_name(self, name: str) -> Source:
        # same caveat: may return None for a missing row
        with self.session() as session:
            return session.query(Source).filter(Source.type == name).first()
    def select_dependency_type_by_name(self, name: str) -> DependsOnType:
        # same caveat: may return None for a missing row
        with self.session() as session:
            return (
                session.query(DependsOnType).filter(DependsOnType.name == name).first()
            )
if __name__ == "__main__":
    # Ad-hoc smoke test for search_names.
    # Bug fix: search_names takes two required arguments (names and a list of
    # package-manager ids); the previous call passed only the names list and
    # raised TypeError before reaching the database.
    db = ConfigDB()
    homebrew_id = db.select_package_manager_by_name("homebrew").id
    print(db.search_names(["elfutils.org", "elfutils"], [homebrew_id]))
================================================
FILE: core/fetcher.py
================================================
import gzip
import json
import os
import tarfile
from dataclasses import dataclass
from datetime import datetime
from io import BytesIO
from shutil import rmtree
from typing import Any
import git
from requests import get
from core.logger import Logger
@dataclass
class Data:
    """One fetched artifact, written to data/<name>/<date>/<file_path>/<file_name>."""
    file_path: str  # directory relative to the dated output root
    file_name: str
    content: Any # json or bytes
class Fetcher:
    """Base fetcher: downloads a source and writes dated output under data/<name>,
    maintaining a `latest` symlink to the newest dated directory."""
    def __init__(self, name: str, source: str, no_cache: bool, test: bool):
        self.name = name
        self.source = source
        # all output lives under data/<name>/<YYYY-MM-DD>
        self.output = f"data/{name}"
        self.logger = Logger(f"{name}_fetcher")
        self.no_cache = no_cache
        self.test = test
    def write(self, files: list[Data]):
        """generic write function for some collection of files"""
        # prep the file location
        now = datetime.now().strftime("%Y-%m-%d")
        root_path = f"{self.output}/{now}"
        # write
        # it can be anything - json, tarball, etc.
        for item in files:
            self.logger.debug(f"writing {item.file_path}/{item.file_name}")
            file_path = item.file_path
            file_name = item.file_name
            file_content = item.content
            full_path = os.path.join(root_path, file_path)
            # make sure the path exists
            os.makedirs(full_path, exist_ok=True)
            with open(os.path.join(full_path, file_name), "wb") as f:
                if isinstance(file_content, list | dict):
                    # Convert JSON-serializable objects to string
                    file_content = json.dumps(file_content)
                # Ensure content is bytes before writing
                if isinstance(file_content, str):
                    file_content = file_content.encode("utf-8")
                f.write(file_content)
        # update the latest symlink
        self.update_symlink(now)
    def update_symlink(self, latest_path: str):
        """Point <output>/latest at the newest dated directory."""
        latest_symlink = f"{self.output}/latest"
        # NOTE(review): only the symlink case is handled — if a regular file or
        # directory named "latest" exists, os.symlink below raises
        # FileExistsError; confirm that can't happen in practice
        if os.path.islink(latest_symlink):
            self.logger.debug(f"removing existing symlink {latest_symlink}")
            os.remove(latest_symlink)
        self.logger.debug(f"creating symlink {latest_symlink} -> {latest_path}")
        os.symlink(latest_path, latest_symlink)
    def fetch(self) -> bytes:
        """Download the raw source bytes; raises on missing source or HTTP errors."""
        if not self.source:
            raise ValueError("source is not set")
        response = get(self.source)
        try:
            response.raise_for_status()
        except Exception as e:
            self.logger.error(f"error fetching {self.source}: {e}")
            raise e
        return response.content
    def cleanup(self):
        """Remove and recreate the output directory when no_cache is set."""
        if self.no_cache:
            # TODO: it's deleting everything here
            rmtree(self.output, ignore_errors=True)
            os.makedirs(self.output, exist_ok=True)
class TarballFetcher(Fetcher):
    """Fetcher for gzipped tarball sources; unpacks every regular file."""

    def __init__(self, name: str, source: str, no_cache: bool, test: bool):
        super().__init__(name, source, no_cache, test)

    def fetch(self) -> list[Data]:
        raw = super().fetch()
        buffer = BytesIO(raw)
        buffer.seek(0)
        extracted = []
        with tarfile.open(fileobj=buffer, mode="r:gz") as archive:
            for entry in archive.getmembers():
                # skip directories, links, and other special members
                if not entry.isfile():
                    continue
                payload = archive.extractfile(entry).read()
                # split the archive path into directory part and basename
                path, _, name = entry.name.rpartition("/")
                self.logger.debug(f"file_path/file_name: {path}/{name}")
                extracted.append(Data(path, name, payload))
        return extracted
# GZip compresses only one file, so file_path and file_name are not used
class GZipFetcher(Fetcher):
    """Fetcher for single-file gzip sources; emits exactly one Data entry."""

    def __init__(
        self,
        name: str,
        source: str,
        no_cache: bool,
        test: bool,
        file_path: str,
        file_name: str,
    ):
        super().__init__(name, source, no_cache, test)
        self.file_path = file_path
        self.file_name = file_name

    def fetch(self) -> list[Data]:
        raw = super().fetch()
        # decompress and round-trip through utf-8, which validates the payload
        # is well-formed text before it gets written out
        text = gzip.decompress(raw).decode("utf-8")
        return [Data(self.file_path, self.file_name, text.encode("utf-8"))]
class GitFetcher(Fetcher):
    """Fetcher that shallow-clones a git repository instead of downloading a file."""
    def __init__(self, name: str, source: str, no_cache: bool, test: bool):
        super().__init__(name, source, no_cache, test)
    def fetch(self) -> str:
        """Clone the source repo into a dated directory and return its path."""
        # assume that source is a git repo whose main branch needs to be cloned
        # we'll first prep the output directory, then clone, then update the symlinks
        # NOTE: this is what the main Fetcher does, but slightly modified for this case
        now = datetime.now().strftime("%Y-%m-%d")
        root_dir = f"{self.output}/{now}"
        os.makedirs(root_dir, exist_ok=True)
        # now, clone the repo here
        self.logger.debug(f"Cloning {self.source} into {root_dir}...")
        # depth=1: shallow clone, history isn't needed
        _ = git.Repo.clone_from(self.source, root_dir, depth=1, branch="main")
        self.logger.debug("Repository cloned successfully.")
        # update the symlinks
        self.update_symlink(now)
        return root_dir
================================================
FILE: core/logger.py
================================================
import sys
import time
import traceback
from core.utils import env_vars
# Verbosity toggle from the DEBUG env var.
# NOTE(review): env_vars' return type isn't visible here — if it returns the
# raw string, "false" would still be truthy and force VERBOSE mode; confirm
# core.utils.env_vars coerces to bool.
DEBUG = env_vars("DEBUG", "false")
def as_minutes(seconds: float) -> float:
    """Convert a duration in seconds to minutes."""
    minutes = seconds / 60
    return minutes
class Logger:
    """Tiny stdout logger with elapsed-time prefixes and three verbosity modes."""
    SILENT = 0
    NORMAL = 1
    VERBOSE = 2
    def __init__(
        self, name: str, mode: int = NORMAL, start: float | None = None
    ) -> None:
        self.name = name
        # reuse a caller-provided start time so related loggers share a clock
        self.start = start or time.time()
        # DEBUG env var forces VERBOSE regardless of the requested mode
        self.mode = Logger.VERBOSE if DEBUG else mode
    def print(self, msg: str):
        # elapsed seconds since start, two decimals; flush for container logs
        print(f"{self.time_diff():.2f}: [{self.name}]: {msg}", flush=True)
    def error(self, message):
        # errors are printed in every mode
        self.print(f"[ERROR]: {message}")
    def log(self, message):
        if self.mode >= Logger.NORMAL:
            self.print(f"{message}")
    def debug(self, message):
        if self.mode >= Logger.VERBOSE:
            self.print(f"[DEBUG]: {message}")
    def warn(self, message):
        if self.mode >= Logger.NORMAL:
            self.print(f"[WARN]: {message}")
    def is_verbose(self):
        return self.mode >= Logger.VERBOSE
    def time_diff(self):
        return time.time() - self.start
    def exception(self):
        # prints the active exception regardless of mode
        exc_type, exc_value, exc_traceback = sys.exc_info()
        self.print(f"{exc_type.__name__}: {exc_value}")
        self.print("***** TRACEBACK *****")
        print(f"{''.join(traceback.format_tb(exc_traceback))}")
    def info(self, message):
        # stdlib-logging-compatible alias for log()
        self.log(message)
    def warning(self, message):
        # stdlib-logging-compatible alias for warn()
        self.warn(message)
================================================
FILE: core/models/__init__.py
================================================
# __init__.py
from __future__ import annotations
from datetime import datetime
from sqlalchemy import (
Column,
DateTime,
ForeignKey,
Index,
Integer,
MetaData,
String,
UniqueConstraint,
func,
)
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Mapped, declarative_base, relationship
# Deterministic constraint/index names so Alembic autogenerate produces
# stable migrations.
naming_convention = {
    "ix": "ix_%(column_0_label)s",
    "uq": "uq_%(table_name)s_%(column_0_name)s",
    "ck": "ck_%(table_name)s_%(constraint_name)s",
    "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
    "pk": "pk_%(table_name)s",
}
metadata = MetaData(naming_convention=naming_convention)
class BaseModel:
# we have UUIDs, strings, datetimes, ints, and floats
def to_dict_v2(self) -> dict[str, str | UUID | datetime | int | float]:
"""Return a dictionary of all non-None attributes."""
return {
attr: getattr(self, attr)
for attr in self.__table__.columns.keys() # noqa: SIM118
if getattr(self, attr) is not None
}
# Declarative base: every model below inherits to_dict_v2 from BaseModel
Base = declarative_base(metadata=metadata, cls=BaseModel)
class Package(Base):
    """A package as known to one manager; unique per (package_manager_id, import_id)."""
    __tablename__ = "packages"
    __table_args__ = (
        UniqueConstraint(
            "package_manager_id", "import_id", name="uq_package_manager_import_id"
        ),
    )
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    derived_id = Column(String, nullable=False, unique=True) # package_manager/name
    name = Column(String, nullable=False, index=True)
    package_manager_id = Column(
        UUID(as_uuid=True),
        ForeignKey("package_managers.id"),
        nullable=False,
        index=True,
    )
    # source-native identifier, which is usually the package name
    import_id = Column(String, nullable=False, index=True)
    readme = Column(String, nullable=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    def to_dict(self):
        """Insertable dict of caller-supplied fields (ids/timestamps omitted)."""
        return {
            "derived_id": self.derived_id,
            "name": self.name,
            "package_manager_id": self.package_manager_id,
            "import_id": self.import_id,
            "readme": self.readme,
        }
class PackageManager(Base):
    """A package manager; linked 1:1 to a row in `sources` (source_id is unique)."""
    __tablename__ = "package_managers"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    # unique=True enforces the 1:1 relationship with sources
    source_id = Column(
        UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False, unique=True
    )
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
class Version(Base):
    """A released version of a package; unique per (package_id, version)."""
    __tablename__ = "versions"
    __table_args__ = (
        UniqueConstraint("package_id", "version", name="uq_package_version"),
    )
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    package_id = Column(
        UUID(as_uuid=True), ForeignKey("packages.id"), nullable=False, index=True
    )
    version = Column(String, nullable=False, index=True)
    import_id = Column(String, nullable=False, index=True)
    # size, published_at, license_id, downloads, checksum
    # are nullable bc not all sources provide them
    size = Column(Integer, nullable=True, index=True)
    published_at = Column(DateTime, nullable=True, index=True)
    license_id = Column(
        UUID(as_uuid=True), ForeignKey("licenses.id"), nullable=True, index=True
    )
    # NOTE(review): Integer maps to 32-bit in Postgres — confirm download
    # counts cannot exceed ~2.1e9, else consider BigInteger
    downloads = Column(Integer, nullable=True, index=True)
    checksum = Column(String, nullable=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    package: Mapped[Package] = relationship()
    license: Mapped[License] = relationship()
    def to_dict(self):
        """Insertable dict of caller-supplied fields (ids/timestamps omitted)."""
        return {
            "package_id": self.package_id,
            "version": self.version,
            "import_id": self.import_id,
            "size": self.size,
            "published_at": self.published_at,
            "license_id": self.license_id,
            "downloads": self.downloads,
            "checksum": self.checksum,
        }
class License(Base):
    """A software license, deduplicated by name."""
    __tablename__ = "licenses"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    name = Column(String, nullable=False, unique=True, index=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
class DependsOn(Base):
    """A dependency edge: a specific Version depends on a Package."""

    __tablename__ = "dependencies"
    # the same (version, dependency, type) triple may appear only once
    __table_args__ = (
        UniqueConstraint(
            "version_id",
            "dependency_id",
            "dependency_type_id",
            name="uq_version_dependency_type",
        ),
    )
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    # the version that declares the dependency
    version_id = Column(
        UUID(as_uuid=True), ForeignKey("versions.id"), nullable=False, index=True
    )
    # the package being depended upon
    dependency_id = Column(
        UUID(as_uuid=True), ForeignKey("packages.id"), nullable=False, index=True
    )
    # ideally, these are non-nullable but diff package managers are picky about this
    dependency_type_id = Column(
        UUID(as_uuid=True), ForeignKey("depends_on_types.id"), nullable=True, index=True
    )
    # version range constraint; optional, since some sources don't provide semver
    semver_range = Column(String, nullable=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    version: Mapped[Version] = relationship()
    dependency: Mapped[Package] = relationship()
    dependency_type: Mapped[DependsOnType] = relationship()

    def to_dict(self):
        """Serializable subset of columns (dependency_type_id deliberately omitted)."""
        return {
            "version_id": self.version_id,
            "dependency_id": self.dependency_id,
            # "dependency_type_id": self.dependency_type_id,
            "semver_range": self.semver_range,
        }
class DependsOnType(Base):
    """Lookup table of dependency kinds (e.g. build, runtime); seeded by alembic."""

    __tablename__ = "depends_on_types"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    name = Column(String, nullable=False, unique=True, index=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
class LoadHistory(Base):
    """Audit record of data loads, one row per load per package manager."""

    __tablename__ = "load_history"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    # which package manager this load was for
    package_manager_id = Column(
        UUID(as_uuid=True), ForeignKey("package_managers.id"), nullable=False
    )
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
# authoritative source of truth for all our sources
class Source(Base):
    """Authoritative registry of data sources (e.g. package registries, GitHub)."""

    __tablename__ = "sources"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    # source identifier; unique per source
    type = Column(String, nullable=False, unique=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
# this is a collection of all the different type of URLs
class URL(Base):
    """A URL of some type (homepage, repository, ...); linked to packages via PackageURL."""

    __tablename__ = "urls"
    # the same url string may exist once per url type
    __table_args__ = (UniqueConstraint("url_type_id", "url", name="uq_url_type_url"),)
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    # NOTE(review): an Index assigned as a plain class attribute (referencing the
    # column by name string) is not attached to the table by SQLAlchemy's
    # declarative mapping; the trgm index appears to be created by the
    # 20250508 add_trgm_indexes alembic migration -- confirm this attribute
    # is intentional (documentation) rather than expected to emit DDL.
    url_trgm_idx = Index(
        "ix_urls_url_trgm",
        "url",
        postgresql_using="gin",
        postgresql_ops={"url": "gin_trgm_ops"},
    )
    url = Column(String, nullable=False)
    url_type_id = Column(
        UUID(as_uuid=True), ForeignKey("url_types.id"), nullable=False, index=True
    )
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )

    def to_dict(self):
        """Serializable subset of columns."""
        return {"url": self.url, "url_type_id": self.url_type_id}
# homepage, repository, documentation, etc.
class URLType(Base):
    """Lookup table of URL kinds (homepage, repository, documentation, ...)."""

    __tablename__ = "url_types"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    name = Column(String, nullable=False, unique=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
class User(Base):
    """A user/author as reported by a Source (e.g. a crates or GitHub account)."""

    __tablename__ = "users"
    # a source may report each of its user ids only once; usernames alone
    # are not unique (the same name can exist on different sources)
    __table_args__ = (
        UniqueConstraint("source_id", "import_id", name="uq_source_import_id"),
    )
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    username = Column(String, nullable=False, index=True)
    # which source this user record came from
    source_id = Column(
        UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False, index=True
    )
    # original identifier from the source system; unique only per source
    import_id = Column(String, nullable=False, unique=False, index=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )

    def to_dict(self):
        """Serializable subset of columns."""
        return {
            "username": self.username,
            "source_id": self.source_id,
            "import_id": self.import_id,
        }
class UserVersion(Base):
    """Join table linking a User to a Version."""

    __tablename__ = "user_versions"
    __table_args__ = (
        UniqueConstraint("user_id", "version_id", name="uq_user_version"),
    )
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    user_id = Column(
        UUID(as_uuid=True), ForeignKey("users.id"), nullable=False, index=True
    )
    version_id = Column(
        UUID(as_uuid=True), ForeignKey("versions.id"), nullable=False, index=True
    )
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )

    def to_dict(self):
        """Serializable subset of columns."""
        return {
            "user_id": self.user_id,
            "version_id": self.version_id,
        }
class UserPackage(Base):
    """Join table linking a User to a Package."""

    __tablename__ = "user_packages"
    __table_args__ = (
        UniqueConstraint("user_id", "package_id", name="uq_user_package"),
    )
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    user_id = Column(
        UUID(as_uuid=True), ForeignKey("users.id"), nullable=False, index=True
    )
    package_id = Column(
        UUID(as_uuid=True), ForeignKey("packages.id"), nullable=False, index=True
    )
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )

    def to_dict(self):
        """Serializable subset of columns."""
        return {
            "user_id": self.user_id,
            "package_id": self.package_id,
        }
class PackageURL(Base):
    """Join table linking a Package to a URL."""

    __tablename__ = "package_urls"
    __table_args__ = (UniqueConstraint("package_id", "url_id", name="uq_package_url"),)
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    package_id = Column(
        UUID(as_uuid=True), ForeignKey("packages.id"), nullable=False, index=True
    )
    url_id = Column(
        UUID(as_uuid=True), ForeignKey("urls.id"), nullable=False, index=True
    )
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )

    # TODO: deprecated
    def to_dict(self):
        """Serializable subset of columns."""
        return {
            "package_id": self.package_id,
            "url_id": self.url_id,
        }
class LegacyDependency(Base):
    """Package-to-package dependency edge (not version-specific, unlike DependsOn)."""

    __tablename__ = "legacy_dependencies"
    __table_args__ = (
        UniqueConstraint("package_id", "dependency_id", name="uq_package_dependency"),
    )
    # plain integer PK (autoincrementing), unlike the UUID PKs elsewhere
    id = Column(Integer, primary_key=True)
    package_id = Column(
        UUID(as_uuid=True), ForeignKey("packages.id"), nullable=False, index=True
    )
    dependency_id = Column(
        UUID(as_uuid=True), ForeignKey("packages.id"), nullable=False, index=True
    )
    # required here, unlike DependsOn.dependency_type_id which is nullable
    dependency_type_id = Column(
        UUID(as_uuid=True),
        ForeignKey("depends_on_types.id"),
        nullable=False,
        index=True,
    )
    semver_range = Column(String, nullable=True)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
class Canon(Base):
    """A canonical project identity, keyed one-to-one by its URL."""

    __tablename__ = "canons"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    # each URL identifies at most one canon
    url_id = Column(
        UUID(as_uuid=True),
        ForeignKey("urls.id"),
        nullable=False,
        index=True,
        unique=True,
    )
    # NOTE(review): as with URL.url_trgm_idx, a class-attribute Index by
    # column-name string is not attached by declarative mapping; the trgm
    # index itself comes from the alembic migration -- confirm intent.
    name_trgm_idx = Index(
        "ix_canons_name_trgm",
        "name",
        postgresql_using="gin",
        postgresql_ops={"name": "gin_trgm_ops"},
    )
    name = Column(String, nullable=False)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    url: Mapped[URL] = relationship()
class CanonPackage(Base):
    """Join table mapping packages to canons; each package belongs to at most one canon."""

    __tablename__ = "canon_packages"
    # NOTE(review): no default/server_default here, unlike other models --
    # callers must supply the id themselves; confirm this is intentional
    id = Column(UUID(as_uuid=True), primary_key=True)
    canon_id = Column(
        UUID(as_uuid=True), ForeignKey("canons.id"), nullable=False, index=True
    )
    # unique: a package can be mapped to only one canon
    package_id = Column(
        UUID(as_uuid=True),
        ForeignKey("packages.id"),
        nullable=False,
        index=True,
        unique=True,
    )
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
    updated_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
class TeaRankRun(Base):
    """Metadata for one execution of the ranker (no updated_at: rows are immutable)."""

    __tablename__ = "tea_rank_runs"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    # run counter; semantics defined by the ranker service
    run = Column(Integer, nullable=False)
    # stored as a string, presumably to avoid float precision issues -- confirm
    split_ratio = Column(String, nullable=False)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
class TeaRank(Base):
    """A rank value computed for a canon in a particular ranker run."""

    __tablename__ = "tea_ranks"
    id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        default=func.uuid_generate_v4(),
        server_default=func.uuid_generate_v4(),
    )
    # NOTE(review): plain Integer, not a ForeignKey to tea_rank_runs.id --
    # presumably matches TeaRankRun.run rather than its UUID pk; confirm
    tea_rank_run = Column(Integer, nullable=False, index=True)
    canon_id = Column(
        UUID(as_uuid=True), ForeignKey("canons.id"), nullable=False, index=True
    )
    # rank stored as a string, presumably for arbitrary precision -- confirm
    rank = Column(String, nullable=False)
    created_at = Column(
        DateTime, nullable=False, default=func.now(), server_default=func.now()
    )
================================================
FILE: core/requirements.txt
================================================
# This file was autogenerated by uv via the following command:
# uv pip compile --group indexers -o core/requirements.txt
alembic==1.13.2
# via chai (pyproject.toml:indexers)
certifi==2025.4.26
# via
# chai (pyproject.toml:indexers)
# requests
charset-normalizer==3.4.2
# via
# chai (pyproject.toml:indexers)
# requests
filelock==3.18.0
# via tldextract
gitdb==4.0.12
# via gitpython
gitpython==3.1.44
# via chai (pyproject.toml:indexers)
idna==3.10
# via
# chai (pyproject.toml:indexers)
# requests
# tldextract
mako==1.3.10
# via alembic
markupsafe==3.0.2
# via mako
permalint==0.1.15
# via chai (pyproject.toml:indexers)
psycopg2-binary==2.9.10
# via chai (pyproject.toml:indexers)
pyyaml==6.0.2
# via chai (pyproject.toml:indexers)
requests==2.32.4
# via
# chai (pyproject.toml:indexers)
# requests-file
# tldextract
requests-file==2.1.0
# via tldextract
ruff==0.11.13
# via permalint
schedule==1.2.2
# via chai (pyproject.toml:indexers)
smmap==5.0.2
# via gitdb
sqlalchemy==2.0.41
# via
# chai (pyproject.toml:indexers)
# alembic
tldextract==5.3.0
# via permalint
typing-extensions==4.14.0
# via
# alembic
# sqlalchemy
urllib3==2.4.0
# via
# chai (pyproject.toml:indexers)
# requests
================================================
FILE: core/scheduler.py
================================================
import time
from collections.abc import Callable
from os import getenv
from threading import Thread
import schedule
from core.logger import Logger
FREQUENCY = int(getenv("FREQUENCY", 24))
class Scheduler:
    """Runs a task on a fixed interval (in hours) using a background thread.

    Wraps the `schedule` library: `start` registers the job and spawns a
    daemon thread that polls pending jobs once per second; `stop` cancels
    the job and lets the thread exit; `run_now` invokes the task inline.
    """

    def __init__(self, name: str, frequency: int = FREQUENCY):
        self.name = name
        self.frequency = frequency
        self.logger = Logger(f"{name}_scheduler")
        self.job = None
        self.is_running = False

    def start(self, task: Callable, *args):
        """Schedule `task(*args)` every `self.frequency` hours and begin polling.

        Idempotent: calling start() while already running would otherwise
        register a duplicate job and spawn a second polling thread.
        """
        if self.is_running:
            self.logger.log(f"{self.name} scheduler is already running")
            return
        self.job = schedule.every(self.frequency).hours.do(task, *args)
        self.is_running = True
        self.logger.log(f"scheduled {self.name} to run every {self.frequency} hours")

        def run_schedule():
            # daemon thread: exits when is_running flips or the process ends
            while self.is_running:
                schedule.run_pending()
                time.sleep(1)

        Thread(target=run_schedule, daemon=True).start()

    def stop(self):
        """Cancel the scheduled job (if any) and signal the polling thread to exit."""
        if self.job:
            schedule.cancel_job(self.job)
            self.job = None  # drop the cancelled job so a later start() is clean
        self.is_running = False
        self.logger.log(f"stopped {self.name} scheduler")

    def run_now(self, task: Callable, *args):
        """Run `task(*args)` immediately and synchronously, bypassing the schedule."""
        self.logger.log(f"running {self.name} now")
        task(*args)
================================================
FILE: core/structs.py
================================================
from dataclasses import dataclass
from datetime import datetime
from uuid import UUID
from core.models import URL, LegacyDependency, Package, PackageURL
@dataclass
class CurrentGraph:
    """Snapshot of packages and their dependency edges currently in the DB."""

    # presumably keyed by the package's identifier string -- confirm against
    # the DB loader that constructs this
    package_map: dict[str, Package]
    # package UUID -> set of LegacyDependency rows for that package
    dependencies: dict[UUID, set[LegacyDependency]]
@dataclass(frozen=True)
class URLKey:
    """Hashable (url, url_type_id) pair; frozen so it can be used as a dict key."""

    url: str
    url_type_id: UUID
@dataclass
class CurrentURLs:
    """Snapshot of URL rows and package-URL links currently in the DB."""

    url_map: dict[URLKey, URL]  # URL and URL Type ID to URL object
    package_urls: dict[UUID, set[PackageURL]]  # Package ID to PackageURL rows
@dataclass
class Cache:
    """Combined in-memory cache: union of CurrentGraph and CurrentURLs fields."""

    # same key semantics as CurrentGraph.package_map
    package_map: dict[str, Package]
    # (url, url_type_id) -> URL row, as in CurrentURLs
    url_map: dict[URLKey, URL]
    # package UUID -> PackageURL rows
    package_urls: dict[UUID, set[PackageURL]]
    # package UUID -> LegacyDependency rows
    dependencies: dict[UUID, set[LegacyDependency]]
@dataclass
class DiffResult:
    """Outcome of diffing freshly fetched data against the cached DB state."""

    # rows to insert
    new_packages: list[Package]
    new_urls: dict[URLKey, URL]
    new_package_urls: list[PackageURL]
    # column-value dicts for bulk updates of existing rows
    updated_packages: list[dict[str, UUID | str | datetime]]
    updated_package_urls: list[dict[str, UUID | datetime]]
    # dependency edges to add / remove
    new_deps: list[LegacyDependency]
    removed_deps: list[LegacyDependency]
================================================
FILE: core/test.json
================================================
[
{
'id': UUID('b3133e5e-6d6b-458b-bd83-bf31032875a4'),
'package_id': UUID('7d6c7a3f-2c75-425f-8674-12efd7ce1ca4'),
'url_id': UUID('736acfdc-c3c2-4b53-ae6e-102fdd4f375a'),
'created_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947),
'updated_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947)
}, {'id': UUID('a274bb96-a443-46a7-86ed-71c6ee87a89b'), 'package_id': UUID('506f5abc-f385-4cbf-9fb1-cd34053397f4'), 'url_id': UUID('d0346cef-80b0-456c-8de3-eb1b95481bac'), 'created_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947), 'updated_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947)}, {'id': UUID('17fe8d3a-78d8-42f5-a9f6-7b7abaa37d53'), 'package_id': UUID('a08b41eb-723d-4800-929d-cb6c6d3aeca4'), 'url_id': UUID('334ec74b-dda3-4bb0-99c5-f39abc132f5a'), 'created_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947), 'updated_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947)}, {'id': UUID('5dd47edf-bc5d-43b5-9acd-d099ae9a22f0'), 'package_id': UUID('624c333c-e303-4d9b-a66e-c499bb3b4806'), 'url_id': UUID('6d866142-e2a9-4da0-96de-b5bfadc7cee9'), 'created_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947), 'updated_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947)}, {'id': UUID('c924c668-c6cb-4b6b-bac2-b588377a695d'), 'package_id': UUID('2d182e7a-1960-4376-8272-5ce401c369fd'), 'url_id': UUID('359015d5-8807-4cdc-b1c8-a4771b0069fe'), 'created_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947), 'updated_at': datetime.datetime(2025, 5, 19, 17, 5, 10, 255947)}]
================================================
FILE: core/transformer.py
================================================
import csv
import os
from permalint import normalize_url, possible_names
from sqlalchemy import UUID
from core.db import DB
from core.logger import Logger
# this is a temporary fix, but sometimes the raw files have weird characters
# and lots of data within certain fields
# this fix allows us to read the files with no hassles
csv.field_size_limit(10000000)
# the transformer class knows what files to open, and provide a generic wrapper
# for the data within the files
# each package manager will have its own transformer, that knows what data needs to be
# extracted for our data model
class Transformer:
    """Generic wrapper over a package manager's fetched raw data files.

    Knows where the fetched files live (data/<name>/latest) and offers
    helpers for locating files, reading their contents, canonicalizing
    URLs, and guessing package names from a URL. Each package manager
    builds its own transformer on top of this to extract model data.
    """

    def __init__(self, name: str):
        self.name = name
        self.input = f"data/{name}/latest"
        self.logger = Logger(f"{name}_transformer")
        # placeholders for the raw file paths each package manager fills in
        self.files: dict[str, str] = {
            key: "" for key in ("projects", "versions", "dependencies", "users", "urls")
        }
        self.url_types: dict[str, UUID] = {}

    def finder(self, file_name: str) -> str:
        """Walk the input directory tree and return the full path to file_name.

        Raises FileNotFoundError (after logging) when the file is absent.
        """
        base = os.path.realpath(self.input)
        for dirpath, _, filenames in os.walk(base):
            if file_name in filenames:
                return os.path.join(dirpath, file_name)
        # the walk finished without a match
        self.logger.error(f"{file_name} not found in {base}")
        raise FileNotFoundError(f"Missing {file_name} file")

    def open(self, file_name: str) -> str:
        """Locate file_name under the input directory and return its contents."""
        path = self.finder(file_name)
        with open(path) as fp:
            return fp.read()

    def canonicalize(self, url: str) -> str:
        """Normalize a URL into its canonical (permalint) form."""
        return normalize_url(url)

    def guess(self, db_client: DB, url: str, package_managers: list[UUID]) -> list[str]:
        """Return candidate package names in the DB that could match this URL."""
        candidates = possible_names(url)
        return db_client.search_names(candidates, package_managers)
================================================
FILE: core/utils.py
================================================
from os import getenv
from os.path import exists, join
from typing import Any
def safe_int(val: str) -> int | None:
if val == "":
return None
return int(val)
def build_query_params(
    items: list[dict[str, str]], cache: dict, attr: str
) -> list[str]:
    """Collect the distinct values of `attr` across `items` that are not
    already keys in `cache`.

    Used to build the minimal parameter set for a DB lookup: values already
    cached don't need to be queried again. Order of the result is not
    significant (it passes through a set to deduplicate).
    """
    return list({item[attr] for item in items if item[attr] not in cache})
# env vars could be true or 1, or anything else -- here's a centralized location to
# handle that
def env_vars(env_var: str, default: str) -> bool:
    """Interpret an environment variable as a boolean.

    "true" and "1" (case-insensitive) are truthy; anything else — including
    an unset variable that falls back to `default` — is False.
    """
    value = getenv(env_var, default).lower()
    return value in ("true", "1")
# convert keys to snake case
def convert_keys_to_snake_case(data: dict[str, Any]) -> dict[str, Any]:
    """Recursively converts dictionary keys from hyphen-case to snake_case.

    Dicts are rebuilt with `-` replaced by `_` in every key (values recursed);
    lists are recursed element-wise; any other value is returned unchanged.
    """
    if isinstance(data, dict):
        return {
            key.replace("-", "_"): convert_keys_to_snake_case(value)
            for key, value in data.items()
        }
    if isinstance(data, list):
        return [convert_keys_to_snake_case(item) for item in data]
    return data
def is_github_url(url: str) -> bool:
    """Assumes the url has been canonicalized by permalint"""
    prefix = "github.com/"
    return url[: len(prefix)] == prefix
def file_exists(*args) -> str:
    """Join the given path segments and return the path if it exists.

    Raises FileNotFoundError when the joined path does not exist.
    """
    path = join(*args)
    if exists(path):
        return path
    raise FileNotFoundError(f"{path} not found")
================================================
FILE: db/README.md
================================================
# CHAI Data Model
The CHAI data model is designed to represent the package manager data in a unified and
consistent form. The model's goal is _standardization_ - of the various complexities,
and idiosyncrasies of each individual package manager. We want to provide a standard way
for analysis, querying, and whatever your use case might be.
## Definitions
We use certain nomenclature throughout the codebase:
- `derived_id`: A unique identifier combining the package manager and package name. Like
`crates/serde`, or `homebrew/a2ps`, or `npm/lodash`.
- `import_id`: The original identifier from the source system. Like the `crate_id`
integers provided by crates, or the package name provided by Homebrew
# Core Entities
## Packages
The Package model is a fundamental unit in our system. Each package is uniquely
identified and associated with a specific package manager.
Key fields:
- `derived_id`
- `name`
- `package_manager_id`: Reference to the associated package manager.
- `import_id`: The original identifier from the source system.
- `readme`: Optional field for package documentation.
### Versions
Each version is a different release of a package, and **must** be associated with a
package.
Key fields:
- `package_id`: Reference to the associated package.
- `version`: The version string.
- `import_id`: The original identifier from the source system.
- `size`, `published_at`, `license_id`, `downloads`, `checksum`: Optional metadata
fields.
### Users
The User model represents individuals or entities associated with packages. This is not
necessarily always available, but if it is, it's interesting data.
Key fields:
- `username`: The user's name or identifier.
- `source_id`: Reference to the data source (e.g., GitHub, npm user, crates user, etc).
- `import_id`: The original identifier from the source system.
### URLs
The URL model is populated with all the URLs that are provided by the package manager
source data - this includes documentation, repository, source, issues, and other url
types as well. Each URL is associated with a URL type. The relationships between a URL
and a Package are captured in the PackageURL model.
Key fields:
- `url`: The URL.
- `url_type_id`: Reference to the type of URL. (e.g., homepage, repository, etc)
## Type Models
These models define categorizations and types used across the system. All these values
are loaded from the alembic service, specifically in the
[load-values.sql](../alembic/load-values.sql) script.
### URLType
Represents different types of URLs associated with packages.
Predefined types (from load-values.sql):
- `source`
- `homepage`
- `documentation`
- `repository`
### DependsOnType
Categorizes different types of dependencies between packages.
Predefined types (from load-values.sql):
- `build`
- `development`
- `runtime`
- `test`
- `optional`
- `recommended`
- `uses_from_macos` (Homebrew only)
### Source
Represents the authoritative sources of package data.
- `crates`
- `homebrew`
The below are not yet supported:
- `npm`
- `pypi`
- `rubygems`
- `github`
## Relationship Models
These models establish connections between core entities.
### DependsOn
In our data model, a specific release depends on a specific package. We include a field
`semver_range`, which would represent the range of dependency releases compatible with
that specific release.
> [!NOTE]
> Not all package managers provide semantic versions. Homebrew does not, for example.
> This is why `semver_range` is optional.
>
> On the other hand, the combination of `version_id`, `dependency_id`, and
> `dependency_type_id` must be unique. Note that `dependency_type_id` itself is
> nullable in the model, since not all package managers report a dependency type.
Key fields:
- `version_id`: The version that has the dependency.
- `dependency_id`: The package that is depended upon.
- `dependency_type_id`: The type of dependency.
- `semver_range`: The version range for the dependency (optional).
### UserVersion and UserPackage
These models associate users with specific versions and packages, respectively.
### PackageURL
Associates packages with their various URLs.
## Caveats
### `Source` and `PackageManager` Relationship
We've chosen to separate `Source` and `PackageManager` into distinct entities:
- `Source`: Represents data sources that can provide information about packages, users,
or both.
- `PackageManager`: Specifically represents sources that are package managers.
For example, 'crates' functions both as a package manager and as a source of user data.
By keeping these concepts separate, we can accurately represent such systems, and have
one point where we can modify any information about 'crates'.
## Additional Models
### License
Represents software licenses associated with package versions. Great place to start
contributions!
### LoadHistory
Tracks the history of data loads for each package manager, useful for auditing and
incremental updates.
================================================
FILE: db/queries.md
================================================
# Chai Data Exploration
```sql
-- Packages with the longest lifetime
SELECT p.name,
SUM(v.downloads) AS "downloads",
count(v.package_id) AS versions,
min(v.published_at) AS "first published",
max(v.published_at) AS "last published",
max(v.published_at) - min(v.published_at) AS lifetime
FROM packages AS p
JOIN versions v ON v.package_id = p.id
GROUP BY p.name
ORDER BY lifetime DESC limit 100;
-- Packages sorted by dependents
SELECT p.name, count(d.id) AS dependents
FROM packages AS p
JOIN dependencies AS d ON d.dependency_id = p.id
GROUP BY p.name
ORDER BY count(d.id) DESC LIMIT 100;
-- Packages sorted by dependents with lifetime
SELECT p.name,
count(d.id) AS dependents,
min(v.published_at) AS "first published",
max(v.published_at) AS "last published",
max(v.published_at) - min(v.published_at) AS lifetime
FROM packages AS p
JOIN dependencies AS d ON d.dependency_id = p.id
JOIN versions v ON v.package_id = p.id
GROUP BY p.name
ORDER BY count(d.id) DESC LIMIT 100;
-- Packages sorted by dependents with downloads
SELECT p.name,
count(d.id) AS dependents,
sum(v.downloads) AS downloads
FROM packages AS p
JOIN dependencies AS d ON d.dependency_id = p.id
JOIN versions v ON v.package_id = p.id
GROUP BY p.name
ORDER BY count(d.id) DESC LIMIT 100;
-- Packages with most dependents sorted by download/dependent ratio
SELECT name, dependents, downloads, (downloads / dependents) AS ratio FROM
(SELECT p.name,
        count(d.id) AS dependents,
        sum(v.downloads) AS downloads
 FROM packages AS p
 JOIN dependencies AS d ON d.dependency_id = p.id
 JOIN versions v ON v.package_id = p.id
 GROUP BY p.name
 ORDER BY count(d.id) DESC LIMIT 1000) AS top_packages -- Postgres requires an alias for FROM subqueries
ORDER BY ratio DESC;
```
================================================
FILE: docker-compose.yml
================================================
# Local development stack for CHAI: Postgres, schema migrations, the API,
# one indexer service per package manager, and the ranker.
services:
  # PostgreSQL backing store for everything else
  db:
    image: postgres
    shm_size: 256m
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=s3cr3t
    ports:
      - "5435:5432"
    volumes:
      # NOTE(review): the official postgres image keeps its cluster under
      # /var/lib/postgresql/data -- confirm this mount actually persists data
      - ./data/db/data:/var/lib/postgresql
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5

  # runs schema migrations and value seeding; other services wait for it
  alembic:
    build:
      context: .
      dockerfile: ./alembic/Dockerfile
    environment:
      - CHAI_DATABASE_URL=${CHAI_DATABASE_URL:-postgresql://postgres:s3cr3t@db:5432/chai}
      - CHAI_DATABASE_ADMIN_URL=${CHAI_DATABASE_ADMIN_URL:-postgresql://postgres:s3cr3t@db:5432/postgres}
      - PGPASSWORD=${PGPASSWORD:-s3cr3t}
    depends_on:
      db:
        condition: service_healthy
    working_dir: /alembic
    entrypoint: ["./run_migrations.sh"]

  # crates.io indexer
  crates:
    build:
      context: .
      dockerfile: ./package_managers/crates/Dockerfile
    environment:
      - CHAI_DATABASE_URL=${CHAI_DATABASE_URL:-postgresql://postgres:s3cr3t@db:5432/chai}
      - NO_CACHE=${NO_CACHE:-true}
      - PYTHONPATH=/
      - DEBUG=${DEBUG:-false}
      - TEST=${TEST:-false}
      - FETCH=${FETCH:-true}
      - FREQUENCY=${FREQUENCY:-24}
      - ENABLE_SCHEDULER=${ENABLE_SCHEDULER:-true}
    volumes:
      - ./data/crates:/data/crates
    depends_on:
      db:
        condition: service_healthy
      alembic:
        condition: service_completed_successfully

  # Homebrew indexer (note: runs hourly, unlike the other indexers' 24h default)
  homebrew:
    build:
      context: .
      dockerfile: ./package_managers/homebrew/Dockerfile
    environment:
      - CHAI_DATABASE_URL=${CHAI_DATABASE_URL:-postgresql://postgres:s3cr3t@db:5432/chai}
      - NO_CACHE=${NO_CACHE:-false}
      - DEBUG=${DEBUG:-false}
      - TEST=${TEST:-false}
      - FETCH=${FETCH:-true}
      - FREQUENCY=${FREQUENCY:-1}
      - ENABLE_SCHEDULER=${ENABLE_SCHEDULER:-true}
      - PYTHONPATH=/
    volumes:
      - ./data/homebrew:/data/homebrew
    depends_on:
      db:
        condition: service_healthy
      alembic:
        condition: service_completed_successfully

  # Rust HTTP API over the CHAI database
  api:
    build:
      context: ./api
      dockerfile: Dockerfile
    environment:
      - DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai
      - HOST=0.0.0.0
      - PORT=8080
    ports:
      - "8080:8080"
    depends_on:
      db:
        condition: service_healthy
      alembic:
        condition: service_completed_successfully
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/heartbeat"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 5s

  # Debian package indexer
  debian:
    build:
      context: .
      dockerfile: ./package_managers/debian/Dockerfile
    environment:
      - CHAI_DATABASE_URL=${CHAI_DATABASE_URL:-postgresql://postgres:s3cr3t@db:5432/chai}
      - NO_CACHE=${NO_CACHE:-false}
      - PYTHONPATH=/
      - DEBUG=${DEBUG:-false}
      - TEST=${TEST:-false}
      - FETCH=${FETCH:-true}
      - FREQUENCY=${FREQUENCY:-24}
      - ENABLE_SCHEDULER=${ENABLE_SCHEDULER:-true}
    volumes:
      - ./data/debian:/data/debian
    depends_on:
      db:
        condition: service_healthy
      alembic:
        condition: service_completed_successfully

  # pkgx indexer
  pkgx:
    build:
      context: .
      dockerfile: ./package_managers/pkgx/Dockerfile
    environment:
      - CHAI_DATABASE_URL=${CHAI_DATABASE_URL:-postgresql://postgres:s3cr3t@db:5432/chai}
      - NO_CACHE=${NO_CACHE:-false}
      - PYTHONPATH=/
      - DEBUG=${DEBUG:-false}
      - TEST=${TEST:-false}
      - FETCH=${FETCH:-true}
      - FREQUENCY=${FREQUENCY:-24}
      - ENABLE_SCHEDULER=${ENABLE_SCHEDULER:-true}
    volumes:
      - ./data/pkgx:/data/pkgx
    depends_on:
      db:
        condition: service_healthy
      alembic:
        condition: service_completed_successfully

  # tea-rank computation over the indexed graph; memory-capped at 8G
  ranker:
    build:
      context: .
      dockerfile: ./ranker/Dockerfile
    environment:
      - CHAI_DATABASE_URL=${CHAI_DATABASE_URL:-postgresql://postgres:s3cr3t@db:5432/chai}
      - PYTHONPATH=/
      - LOAD=${LOAD:-false}
      - DEBUG=${DEBUG:-false}
    depends_on:
      db:
        condition: service_healthy
      alembic:
        condition: service_completed_successfully
    deploy:
      resources:
        limits:
          memory: 8G
================================================
FILE: examples/sbom-meta/README.md
================================================
# SBOM-Meta
An example Chai application that displays package metadata for
[SBOMs](https://github.com/anchore/syft) (software bill of materials).
## Installation
1. Start the [Chai DB](https://github.com/teaxyz/chai-oss) with `docker compose up`.
2. Run `go install` or `go build` to generate a binary.
## Usage
Run `sbom-meta` in the root directory of any repository to get a list of
dependencies with metadata.
```bash
git clone git@github.com:starship/starship.git
cd starship
sbom-meta
```
You can sort any of the fields, ascending or descending:
```bash
sbom-meta --sort downloads,desc
sbom-meta --sort published,asc
```
Use the `--json` flag to output JSON:
```bash
sbom-meta --json | jq .[1].name
```
================================================
FILE: examples/sbom-meta/go.mod
================================================
module sbom-meta
go 1.23.2
require (
github.com/anchore/syft v1.14.0
github.com/caarlos0/env v3.5.0+incompatible
github.com/dustin/go-humanize v1.0.1
github.com/fatih/color v1.17.1-0.20241003070628-1c8d8706604e
github.com/jedib0t/go-pretty/v6 v6.6.0
github.com/jmoiron/sqlx v1.4.0
github.com/lib/pq v1.10.9
)
require (
dario.cat/mergo v1.0.1 // indirect
github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 // indirect
github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0 // indirect
github.com/BurntSushi/toml v1.4.0 // indirect
github.com/CycloneDX/cyclonedx-go v0.9.1 // indirect
github.com/DataDog/zstd v1.5.5 // indirect
github.com/Masterminds/goutils v1.1.1 // indirect
github.com/Masterminds/semver v1.5.0 // indirect
github.com/Masterminds/semver/v3 v3.3.0 // indirect
github.com/Masterminds/sprig/v3 v3.3.0 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/Microsoft/hcsshim v0.11.7 // indirect
github.com/ProtonMail/go-crypto v1.1.6 // indirect
github.com/acobaugh/osrelease v0.1.0 // indirect
github.com/adrg/xdg v0.5.0 // indirect
github.com/anchore/clio v0.0.0-20240522144804-d81e109008aa // indirect
github.com/anchore/fangs v0.0.0-20240903175602-e716ef12c23d // indirect
github.com/anchore/go-collections v0.0.0-20240216171411-9321230ce537 // indirect
github.com/anchore/go-logger v0.0.0-20230725134548-c21dafa1ec5a // indirect
github.com/anchore/go-macholibre v0.0.0-20220308212642-53e6d0aaf6fb // indirect
github.com/anchore/go-struct-converter v0.0.0-20221118182256-c68fdcfa2092 // indirect
github.com/anchore/go-version v1.2.2-0.20200701162849-18adb9c92b9b // indirect
github.com/anchore/packageurl-go v0.1.1-0.20240507183024-848e011fc24f // indirect
github.com/anchore/stereoscope v0.0.4-0.20241005180410-efa76446cc1c // indirect
github.com/andybalholm/brotli v1.0.4 // indirect
github.com/aquasecurity/go-pep440-version v0.0.0-20210121094942-22b2f8951d46 // indirect
github.com/aquasecurity/go-version v0.0.0-20210121072130-637058cfe492 // indirect
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
github.com/becheran/wildmatch-go v1.0.0 // indirect
github.com/bmatcuk/doublestar/v4 v4.6.1 // indirect
github.com/charmbracelet/lipgloss v0.13.0 // indirect
github.com/charmbracelet/x/ansi v0.2.3 // indirect
github.com/cloudflare/circl v1.6.1 // indirect
github.com/containerd/cgroups v1.1.0 // indirect
github.com/containerd/containerd v1.7.28 // indirect
github.com/containerd/containerd/api v1.8.0 // indirect
github.com/containerd/continuity v0.4.4 // indirect
github.com/containerd/errdefs v0.3.0 // indirect
github.com/containerd/fifo v1.1.0 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/containerd/platforms v0.2.1 // indirect
github.com/containerd/stargz-snapshotter/estargz v0.14.3 // indirect
github.com/containerd/ttrpc v1.2.7 // indirect
github.com/containerd/typeurl/v2 v2.1.1 // indirect
github.com/cyphar/filepath-securejoin v0.4.1 // indirect
github.com/deitch/magic v0.0.0-20230404182410-1ff89d7342da // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/cli v27.1.1+incompatible // indirect
github.com/docker/distribution v2.8.3+incompatible // indirect
github.com/docker/docker v27.3.1+incompatible // indirect
github.com/docker/docker-credential-helpers v0.7.0 // indirect
github.com/docker/go-connections v0.4.0 // indirect
github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect
github.com/docker/go-units v0.5.0 // indirect
github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect
github.com/edsrzf/mmap-go v1.1.0 // indirect
github.com/elliotchance/phpserialize v1.4.0 // indirect
github.com/emirpasic/gods v1.18.1 // indirect
github.com/facebookincubator/nvdtools v0.1.5 // indirect
github.com/felixge/fgprof v0.9.3 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.6 // indirect
github.com/github/go-spdx/v2 v2.3.2 // indirect
github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect
github.com/go-git/go-billy/v5 v5.6.2 // indirect
github.com/go-git/go-git/v5 v5.16.2 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-restruct/restruct v1.2.0-alpha // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/go-containerregistry v0.20.2 // indirect
github.com/google/licensecheck v0.3.1 // indirect
github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gookit/color v1.5.4 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/huandu/xstrings v1.5.0 // indirect
github.com/iancoleman/strcase v0.3.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
github.com/jinzhu/copier v0.4.0 // indirect
github.com/kastenhq/goversion v0.0.0-20230811215019-93b2f8823953 // indirect
github.com/kevinburke/ssh_config v1.2.0 // indirect
github.com/klauspost/compress v1.17.8 // indirect
github.com/klauspost/pgzip v1.2.5 // indirect
github.com/knqyf263/go-rpmdb v0.1.1 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.16 // indirect
github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d // indirect
github.com/mholt/archiver/v3 v3.5.1 // indirect
github.com/microsoft/go-rustaudit v0.0.0-20220730194248-4b17361d90a5 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/moby/docker-image-spec v1.3.1 // indirect
github.com/moby/locker v1.0.1 // indirect
github.com/moby/sys/mountinfo v0.7.2 // indirect
github.com/moby/sys/sequential v0.5.0 // indirect
github.com/moby/sys/signal v0.7.0 // indirect
github.com/moby/sys/user v0.3.0 // indirect
github.com/moby/sys/userns v0.1.0 // indirect
github.com/muesli/termenv v0.15.2 // indirect
github.com/nwaples/rardecode v1.1.0 // indirect
github.com/olekukonko/tablewriter v0.0.5 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.0 // indirect
github.com/opencontainers/runtime-spec v1.1.0 // indirect
github.com/opencontainers/selinux v1.11.0 // indirect
github.com/pborman/indent v1.2.1 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/pierrec/lz4/v4 v4.1.19 // indirect
github.com/pjbgf/sha1cd v0.3.2 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pkg/profile v1.7.0 // indirect
github.com/rivo/uniseg v0.4.7 // indirect
github.com/saferwall/pe v1.5.4 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/sassoftware/go-rpmutils v0.4.0 // indirect
github.com/scylladb/go-set v1.0.3-0.20200225121959-cc7b2070d91e // indirect
github.com/secDre4mer/pkcs7 v0.0.0-20240322103146-665324a4461d // indirect
github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect
github.com/shopspring/decimal v1.4.0 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/skeema/knownhosts v1.3.1 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spdx/tools-golang v0.5.5 // indirect
github.com/spf13/afero v1.11.0 // indirect
github.com/spf13/cast v1.7.0 // indirect
github.com/spf13/cobra v1.8.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/spf13/viper v1.19.0 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/sylabs/sif/v2 v2.17.1 // indirect
github.com/sylabs/squashfs v1.0.0 // indirect
github.com/therootcompany/xz v1.0.1 // indirect
github.com/ulikunitz/xz v0.5.12 // indirect
github.com/vbatts/go-mtree v0.5.4 // indirect
github.com/vbatts/tar-split v0.11.3 // indirect
github.com/vifraa/gopom v1.0.0 // indirect
github.com/wagoodman/go-partybus v0.0.0-20230516145632-8ccac152c651 // indirect
github.com/wagoodman/go-progress v0.0.0-20230925121702-07e42b3cdba0 // indirect
github.com/xanzy/ssh-agent v0.3.3 // indirect
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
go.opentelemetry.io/otel v1.24.0 // indirect
go.opentelemetry.io/otel/metric v1.24.0 // indirect
go.opentelemetry.io/otel/trace v1.24.0 // indirect
go.uber.org/atomic v1.9.0 // indirect
go.uber.org/multierr v1.9.0 // indirect
golang.org/x/crypto v0.40.0 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/mod v0.26.0 // indirect
golang.org/x/net v0.42.0 // indirect
golang.org/x/sync v0.16.0 // indirect
golang.org/x/sys v0.34.0 // indirect
golang.org/x/term v0.33.0 // indirect
golang.org/x/text v0.27.0 // indirect
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
google.golang.org/genproto v0.0.0-20240213162025-012b6fc9bca9 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda // indirect
google.golang.org/grpc v1.62.1 // indirect
google.golang.org/protobuf v1.35.2 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
================================================
FILE: examples/sbom-meta/go.sum
================================================
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU=
cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU=
cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY=
cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc=
cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0=
cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To=
cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4=
cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M=
cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc=
cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk=
cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs=
cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc=
cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY=
cloud.google.com/go v0.72.0/go.mod h1:M+5Vjvlc2wnp6tjzE102Dw08nGShTscUx2nZMufOKPI=
cloud.google.com/go v0.74.0/go.mod h1:VV1xSbzvo+9QJOxLDaJfTjx5e+MePCpCWwvftOeQmWk=
cloud.google.com/go v0.78.0/go.mod h1:QjdrLG0uq+YwhjoVOLsS1t7TW8fs36kLs4XO5R5ECHg=
cloud.google.com/go v0.79.0/go.mod h1:3bzgcEeQlzbuEAYu4mrWhKqWjmpprinYgKJLgKHnbb8=
cloud.google.com/go v0.81.0/go.mod h1:mk/AM35KwGk/Nm2YSeZbxXdrNK3KZOYHmLkOqC2V6E0=
cloud.google.com/go v0.83.0/go.mod h1:Z7MJUsANfY0pYPdw0lbnivPx4/vhy/e2FEkSkF7vAVY=
cloud.google.com/go v0.84.0/go.mod h1:RazrYuxIK6Kb7YrzzhPoLmCVzl7Sup4NrbKPg8KHSUM=
cloud.google.com/go v0.87.0/go.mod h1:TpDYlFy7vuLzZMMZ+B6iRiELaY7z/gJPaqbMx6mlWcY=
cloud.google.com/go v0.90.0/go.mod h1:kRX0mNRHe0e2rC6oNakvwQqzyDmg57xJ+SZU1eT2aDQ=
cloud.google.com/go v0.93.3/go.mod h1:8utlLll2EF5XMAV15woO4lSbWQlk8rer9aLOfLh7+YI=
cloud.google.com/go v0.94.1/go.mod h1:qAlAugsXlC+JWO+Bke5vCtc9ONxjQT3drlTTnAplMW4=
cloud.google.com/go v0.97.0/go.mod h1:GF7l59pYBVlXQIBLx3a761cZ41F9bBH3JUlihCt2Udc=
cloud.google.com/go v0.98.0/go.mod h1:ua6Ush4NALrHk5QXDWnjvZHN93OuF0HfuEPq9I1X0cM=
cloud.google.com/go v0.99.0/go.mod h1:w0Xx2nLzqWJPuozYQX+hFfCSI8WioryfRDzkoI/Y2ZA=
cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o=
cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE=
cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc=
cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg=
cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc=
cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ=
cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk=
cloud.google.com/go/firestore v1.6.1/go.mod h1:asNXNOzBdyVQmEU+ggO8UPodTkEVFW5Qx+rwHnAz+EY=
cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I=
cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw=
cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA=
cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU=
cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw=
cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos=
cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk=
cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs=
cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0=
dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s=
dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 h1:bvDV9vkmnHYOMsOr4WLk+Vo07yKIzd94sVoIqshQ4bU=
github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8=
github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0 h1:59MxjQVfjXsBpLy+dbd2/ELV5ofnUkUZBvWSC85sheA=
github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0/go.mod h1:OahwfttHWG6eJ0clwcfBAHoDI6X/LV/15hx/wlMZSrU=
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/toml v0.4.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0=
github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/CycloneDX/cyclonedx-go v0.9.1 h1:yffaWOZsv77oTJa/SdVZYdgAgFioCeycBUKkqS2qzQM=
github.com/CycloneDX/cyclonedx-go v0.9.1/go.mod h1:NE/EWvzELOFlG6+ljX/QeMlVt9VKcTwu8u0ccsACEsw=
github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
github.com/DataDog/zstd v1.5.5 h1:oWf5W7GtOLgp6bciQYDmhHHjdhYkALu6S/5Ni9ZgSvQ=
github.com/DataDog/zstd v1.5.5/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw=
github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI=
github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU=
github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y=
github.com/Masterminds/semver/v3 v3.3.0 h1:B8LGeaivUe71a5qox1ICM/JLl0NqZSW5CHyL+hmvYS0=
github.com/Masterminds/semver/v3 v3.3.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs=
github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0=
github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/Microsoft/hcsshim v0.11.7 h1:vl/nj3Bar/CvJSYo7gIQPyRWc9f3c6IeSNavBTSZNZQ=
github.com/Microsoft/hcsshim v0.11.7/go.mod h1:MV8xMfmECjl5HdO7U/3/hFVnkmSBjAjmA09d4bExKcU=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8=
github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q=
github.com/ProtonMail/go-crypto v1.1.6 h1:ZcV+Ropw6Qn0AX9brlQLAUXfqLBc7Bl+f/DmNxpLfdw=
github.com/ProtonMail/go-crypto v1.1.6/go.mod h1:rA3QumHc/FZ8pAHreoekgiAbzpNsfQAosU5td4SnOrE=
github.com/acobaugh/osrelease v0.1.0 h1:Yb59HQDGGNhCj4suHaFQQfBps5wyoKLSSX/J/+UifRE=
github.com/acobaugh/osrelease v0.1.0/go.mod h1:4bFEs0MtgHNHBrmHCt67gNisnabCRAlzdVasCEGHTWY=
github.com/adrg/xdg v0.5.0 h1:dDaZvhMXatArP1NPHhnfaQUqWBLBsmx1h1HXQdMoFCY=
github.com/adrg/xdg v0.5.0/go.mod h1:dDdY4M4DF9Rjy4kHPeNL+ilVF+p2lK8IdM9/rTSGcI4=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/anchore/clio v0.0.0-20240522144804-d81e109008aa h1:pwlAn4O9SBUnlgfa69YcqIynbUyobLVFYu8HxSoCffA=
github.com/anchore/clio v0.0.0-20240522144804-d81e109008aa/go.mod h1:nD3H5uIvjxlfmakOBgtyFQbk5Zjp3l538kxfpHPslzI=
github.com/anchore/fangs v0.0.0-20240903175602-e716ef12c23d h1:ZD4wdCBgJJzJybjTUIEiiupLF7B9H3WLuBTjspBO2Mc=
github.com/anchore/fangs v0.0.0-20240903175602-e716ef12c23d/go.mod h1:Xh4ObY3fmoMzOEVXwDtS1uK44JC7+nRD0n29/1KYFYg=
github.com/anchore/go-collections v0.0.0-20240216171411-9321230ce537 h1:GjNGuwK5jWjJMyVppBjYS54eOiiSNv4Ba869k4wh72Q=
github.com/anchore/go-collections v0.0.0-20240216171411-9321230ce537/go.mod h1:1aiktV46ATCkuVg0O573ZrH56BUawTECPETbZyBcqT8=
github.com/anchore/go-logger v0.0.0-20230725134548-c21dafa1ec5a h1:nJ2G8zWKASyVClGVgG7sfM5mwoZlZ2zYpIzN2OhjWkw=
github.com/anchore/go-logger v0.0.0-20230725134548-c21dafa1ec5a/go.mod h1:ubLFmlsv8/DFUQrZwY5syT5/8Er3ugSr4rDFwHsE3hg=
github.com/anchore/go-macholibre v0.0.0-20220308212642-53e6d0aaf6fb h1:iDMnx6LIjtjZ46C0akqveX83WFzhpTD3eqOthawb5vU=
github.com/anchore/go-macholibre v0.0.0-20220308212642-53e6d0aaf6fb/go.mod h1:DmTY2Mfcv38hsHbG78xMiTDdxFtkHpgYNVDPsF2TgHk=
github.com/anchore/go-struct-converter v0.0.0-20221118182256-c68fdcfa2092 h1:aM1rlcoLz8y5B2r4tTLMiVTrMtpfY0O8EScKJxaSaEc=
github.com/anchore/go-struct-converter v0.0.0-20221118182256-c68fdcfa2092/go.mod h1:rYqSE9HbjzpHTI74vwPvae4ZVYZd1lue2ta6xHPdblA=
github.com/anchore/go-testutils v0.0.0-20200925183923-d5f45b0d3c04 h1:VzprUTpc0vW0nnNKJfJieyH/TZ9UYAnTZs5/gHTdAe8=
github.com/anchore/go-testutils v0.0.0-20200925183923-d5f45b0d3c04/go.mod h1:6dK64g27Qi1qGQZ67gFmBFvEHScy0/C8qhQhNe5B5pQ=
github.com/anchore/go-version v1.2.2-0.20200701162849-18adb9c92b9b h1:e1bmaoJfZVsCYMrIZBpFxwV26CbsuoEh5muXD5I1Ods=
github.com/anchore/go-version v1.2.2-0.20200701162849-18adb9c92b9b/go.mod h1:Bkc+JYWjMCF8OyZ340IMSIi2Ebf3uwByOk6ho4wne1E=
github.com/anchore/packageurl-go v0.1.1-0.20240507183024-848e011fc24f h1:B/E9ixKNCasntpoch61NDaQyGPDXLEJlL+B9B/PbdbA=
github.com/anchore/packageurl-go v0.1.1-0.20240507183024-848e011fc24f/go.mod h1:Blo6OgJNiYF41ufcgHKkbCKF2MDOMlrqhXv/ij6ocR4=
github.com/anchore/stereoscope v0.0.4-0.20241005180410-efa76446cc1c h1:JXezMk8fF5ns4AgRGW49SGfoRgDjJHsDmcpNw272jkU=
github.com/anchore/stereoscope v0.0.4-0.20241005180410-efa76446cc1c/go.mod h1:GMupz2FoBhy5RTTmawU06c2pZxgVTceahLWiwJef2uI=
github.com/anchore/syft v1.14.0 h1:BeMmc3a9d/63O+nPM8QfV1Olh3r+pYf95JOqbfN4gQg=
github.com/anchore/syft v1.14.0/go.mod h1:8bN2W/Tr4Mmm42h2XB9LPiPOps+NzCFIaQOKLBGb2b8=
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8=
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/aquasecurity/go-pep440-version v0.0.0-20210121094942-22b2f8951d46 h1:vmXNl+HDfqqXgr0uY1UgK1GAhps8nbAAtqHNBcgyf+4=
github.com/aquasecurity/go-pep440-version v0.0.0-20210121094942-22b2f8951d46/go.mod h1:olhPNdiiAAMiSujemd1O/sc6GcyePr23f/6uGKtthNg=
github.co
gitextract_5cor20ar/
├── .dockerignore
├── .github/
│ ├── actions/
│ │ └── complain/
│ │ └── action.yml
│ └── workflows/
│ ├── chai-api.ci.yml
│ ├── ci.yml
│ └── deploy.yml
├── .gitignore
├── .python-version
├── LICENSE
├── README.md
├── alembic/
│ ├── .pkgx.yaml
│ ├── Dockerfile
│ ├── README.md
│ ├── alembic.ini
│ ├── env.py
│ ├── init-script.sql
│ ├── load-values.sql
│ ├── run_migrations.sh
│ ├── script.py.mako
│ └── versions/
│ ├── 20241028_1217-base_migration.py
│ ├── 20250312_0045-add_legacy_dependency_table.py
│ ├── 20250312_2244-canons.py
│ ├── 20250416_0223-add_ranks.py
│ ├── 20250422_0940-add_unique_package_to_canon_packages.py
│ ├── 20250508_1752-add_trgm_indexes.py
│ ├── 20250529_2341-rename_canons_table_and_recreate.py
│ └── 20250529_2345-recreate_canon_foreign_keys.py
├── api/
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Cargo.toml
│ ├── Dockerfile
│ ├── README.md
│ └── src/
│ ├── app_state.rs
│ ├── db.rs
│ ├── handlers.rs
│ ├── logging.rs
│ ├── main.rs
│ └── utils.rs
├── core/
│ ├── README.md
│ ├── config.py
│ ├── db.py
│ ├── fetcher.py
│ ├── logger.py
│ ├── models/
│ │ └── __init__.py
│ ├── requirements.txt
│ ├── scheduler.py
│ ├── structs.py
│ ├── test.json
│ ├── transformer.py
│ └── utils.py
├── db/
│ ├── README.md
│ └── queries.md
├── docker-compose.yml
├── examples/
│ ├── sbom-meta/
│ │ ├── README.md
│ │ ├── go.mod
│ │ ├── go.sum
│ │ └── main.go
│ └── visualizer/
│ ├── README.md
│ ├── main.py
│ └── monitor.py
├── package_managers/
│ ├── crates/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── db.py
│ │ ├── diff.py
│ │ ├── main.py
│ │ ├── structs.py
│ │ └── transformer.py
│ ├── debian/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── db.py
│ │ ├── debian_sources.py
│ │ ├── diff.py
│ │ ├── main.py
│ │ ├── parser.py
│ │ ├── scripts/
│ │ │ ├── investigate_sources.py
│ │ │ └── test_investigate_sources.py
│ │ └── structs.py
│ ├── homebrew/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── db.py
│ │ ├── diff.py
│ │ ├── formulae.py
│ │ ├── main.py
│ │ └── structs.py
│ └── pkgx/
│ ├── Dockerfile
│ ├── db.py
│ ├── diff.py
│ ├── loader.py
│ ├── main.py
│ ├── parser.py
│ └── url.py
├── pkgx.yaml
├── pyproject.toml
├── ranker/
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Dockerfile
│ ├── README.md
│ ├── config.py
│ ├── db.py
│ ├── dedupe.py
│ ├── main.py
│ ├── naming.py
│ ├── requirements.txt
│ ├── rx_graph.py
│ └── utils/
│ ├── analyze_ranks.py
│ └── parse_log.py
├── scripts/
│ ├── chai-legacy-loader/
│ │ ├── README.md
│ │ ├── add_package_fields.py
│ │ ├── batch_insert_package_urls.py
│ │ ├── batch_insert_urls.py
│ │ ├── copy_dependencies_no_thread.py
│ │ ├── pkgx.yaml
│ │ └── sql/
│ │ ├── dependencies.sql
│ │ ├── packages.sql
│ │ └── urls.sql
│ ├── npm-singleton/
│ │ ├── README.md
│ │ ├── pkgx.yaml
│ │ └── single.py
│ ├── package_to_package/
│ │ └── package_dependencies.py
│ └── upgrade_canons/
│ ├── .gitignore
│ ├── README.md
│ ├── create_deleted_canons.py
│ ├── db.py
│ ├── delete_non_canonical_urls.py
│ ├── main.py
│ ├── registered_projects.py
│ └── structs.py
└── tests/
├── README.md
├── conftest.py
├── package_managers/
│ ├── crates/
│ │ ├── conftest.py
│ │ └── test_crates_diff_deps.py
│ ├── debian/
│ │ ├── conftest.py
│ │ ├── test_debian_diff.py
│ │ ├── test_debian_parser.py
│ │ └── test_debian_sources.py
│ ├── homebrew/
│ │ ├── conftest.py
│ │ └── test_homebrew_diff_deps.py
│ └── pkgx/
│ ├── test_pkgx_diff.py
│ └── test_special_case.py
├── ranker/
│ ├── test_compute_canon_name.py
│ ├── test_dedupe.py
│ └── test_rx_graph.py
└── scripts/
└── upgrade_canons/
└── test_analyze_packages_needing_canonicalization.py
SYMBOL INDEX (617 symbols across 86 files)
FILE: alembic/env.py
function run_migrations_offline (line 26) | def run_migrations_offline() -> None:
function run_migrations_online (line 49) | def run_migrations_online() -> None:
FILE: alembic/versions/20241028_1217-base_migration.py
function upgrade (line 22) | def upgrade() -> None:
function downgrade (line 451) | def downgrade() -> None:
FILE: alembic/versions/20250312_0045-add_legacy_dependency_table.py
function upgrade (line 22) | def upgrade() -> None:
function downgrade (line 76) | def downgrade() -> None:
FILE: alembic/versions/20250312_2244-canons.py
function upgrade (line 22) | def upgrade() -> None:
function downgrade (line 70) | def downgrade() -> None:
FILE: alembic/versions/20250416_0223-add_ranks.py
function upgrade (line 22) | def upgrade() -> None:
function downgrade (line 65) | def downgrade() -> None:
FILE: alembic/versions/20250422_0940-add_unique_package_to_canon_packages.py
function upgrade (line 20) | def upgrade() -> None:
function downgrade (line 30) | def downgrade() -> None:
FILE: alembic/versions/20250508_1752-add_trgm_indexes.py
function upgrade (line 20) | def upgrade() -> None:
function downgrade (line 45) | def downgrade() -> None:
FILE: alembic/versions/20250529_2341-rename_canons_table_and_recreate.py
function upgrade (line 23) | def upgrade() -> None:
function downgrade (line 78) | def downgrade() -> None:
FILE: alembic/versions/20250529_2345-recreate_canon_foreign_keys.py
function upgrade (line 20) | def upgrade() -> None:
function downgrade (line 49) | def downgrade() -> None:
FILE: api/src/app_state.rs
constant TTL (line 8) | const TTL: Duration = Duration::from_secs(3600);
type ProjectCacheEntry (line 11) | pub struct ProjectCacheEntry {
method new (line 17) | pub fn new(data: Value) -> Self {
method is_expired (line 24) | pub fn is_expired(&self) -> bool {
type AppState (line 29) | pub struct AppState {
FILE: api/src/db.rs
function create_pool (line 7) | pub async fn create_pool() -> Pool {
function get_tables (line 23) | pub async fn get_tables(client: &Client) -> Vec<String> {
function initialize_db (line 37) | pub async fn initialize_db() -> (Pool, Arc<Vec<String>>) {
FILE: api/src/handlers.rs
constant RESPONSE_LIMIT (line 11) | const RESPONSE_LIMIT: i64 = 1000;
type PaginationParams (line 14) | pub struct PaginationParams {
type PaginatedResponse (line 20) | struct PaginatedResponse {
type LeaderboardRequest (line 31) | pub struct LeaderboardRequest {
type ProjectBatchRequest (line 38) | pub struct ProjectBatchRequest {
function check_table_exists (line 43) | pub fn check_table_exists(table: &str, tables: &[String]) -> Option<Http...
function list_tables (line 56) | pub async fn list_tables(
function heartbeat (line 78) | pub async fn heartbeat(data: web::Data<AppState>) -> impl Responder {
function get_table (line 95) | pub async fn get_table(
function get_table_row (line 154) | pub async fn get_table_row(
function get_project (line 202) | pub async fn get_project(path: web::Path<Uuid>, data: web::Data<AppState...
function list_projects_by_id (line 293) | pub async fn list_projects_by_id(
function list_projects_by_name (line 351) | pub async fn list_projects_by_name(
function get_leaderboard (line 415) | pub async fn get_leaderboard(
function sort_truncate_and_return (line 521) | fn sort_truncate_and_return(projects: Vec<Arc<Value>>, limit: i64) -> ac...
function get_top_projects (line 550) | async fn get_top_projects(data: web::Data<AppState>, limit: i64) -> Http...
FILE: api/src/logging.rs
function setup_logger (line 3) | pub fn setup_logger() {
type Logger (line 7) | pub struct Logger;
method default (line 10) | pub fn default() -> actix_web::middleware::Logger {
FILE: api/src/main.rs
function main (line 21) | async fn main() -> std::io::Result<()> {
FILE: api/src/utils.rs
function get_column_names (line 11) | pub fn get_column_names(rows: &[Row]) -> Vec<String> {
function convert_optional_to_json (line 22) | pub fn convert_optional_to_json<T, E>(result: Result<Option<T>, E>) -> V...
function rows_to_json (line 32) | pub fn rows_to_json(rows: &[Row]) -> Vec<Value> {
type Pagination (line 73) | pub struct Pagination {
method new (line 81) | pub fn new(query: Query<PaginationParams>, total_count: i64) -> Self {
function get_cached_projects (line 98) | pub fn get_cached_projects(
FILE: core/config.py
class PackageManager (line 12) | class PackageManager(Enum):
class ExecConf (line 40) | class ExecConf:
method __init__ (line 45) | def __init__(self) -> None:
method __str__ (line 50) | def __str__(self):
class PMConf (line 54) | class PMConf:
method __init__ (line 59) | def __init__(self, pm: PackageManager, db: ConfigDB):
method __str__ (line 64) | def __str__(self):
class URLTypes (line 68) | class URLTypes:
method __init__ (line 74) | def __init__(self, db: ConfigDB):
method load_url_types (line 77) | def load_url_types(self, db: ConfigDB) -> None:
method __str__ (line 83) | def __str__(self) -> str:
class UserTypes (line 87) | class UserTypes:
method __init__ (line 91) | def __init__(self, db: ConfigDB):
method __str__ (line 95) | def __str__(self) -> str:
class DependencyTypes (line 99) | class DependencyTypes:
method __init__ (line 107) | def __init__(self, db: ConfigDB):
method __str__ (line 115) | def __str__(self) -> str:
class PackageManagers (line 119) | class PackageManagers:
method __init__ (line 126) | def __init__(self, db: ConfigDB):
class Config (line 134) | class Config:
method __init__ (line 142) | def __init__(self, pm: PackageManager) -> None:
method __str__ (line 151) | def __str__(self):
FILE: core/db.py
class DB (line 30) | class DB:
method __init__ (line 31) | def __init__(self, logger_name: str):
method insert_load_history (line 38) | def insert_load_history(self, package_manager_id: str):
method print_statement (line 43) | def print_statement(self, stmt):
method close (line 50) | def close(self):
method search_names (line 54) | def search_names(
method current_graph (line 80) | def current_graph(self, package_manager_id: UUID) -> CurrentGraph:
method _build_current_urls (line 111) | def _build_current_urls(
method current_urls (line 133) | def current_urls(self, urls: set[str]) -> CurrentURLs:
method all_current_urls (line 147) | def all_current_urls(self) -> CurrentURLs:
method load (line 159) | def load(
method batch (line 169) | def batch(
method ingest (line 194) | def ingest(
method execute (line 264) | def execute(self, session: Session, data: list[Any], method: str, log:...
method remove_all (line 280) | def remove_all(self, session: Session, data: list[Any]) -> None:
class ConfigDB (line 285) | class ConfigDB(DB):
method __init__ (line 286) | def __init__(self):
method select_package_manager_by_name (line 289) | def select_package_manager_by_name(self, package_manager: str) -> Pack...
method select_url_types_by_name (line 303) | def select_url_types_by_name(self, name: str) -> URLType:
method select_source_by_name (line 307) | def select_source_by_name(self, name: str) -> Source:
method select_dependency_type_by_name (line 311) | def select_dependency_type_by_name(self, name: str) -> DependsOnType:
FILE: core/fetcher.py
class Data (line 18) | class Data:
class Fetcher (line 24) | class Fetcher:
method __init__ (line 25) | def __init__(self, name: str, source: str, no_cache: bool, test: bool):
method write (line 33) | def write(self, files: list[Data]):
method update_symlink (line 66) | def update_symlink(self, latest_path: str):
method fetch (line 75) | def fetch(self) -> bytes:
method cleanup (line 87) | def cleanup(self):
class TarballFetcher (line 94) | class TarballFetcher(Fetcher):
method __init__ (line 95) | def __init__(self, name: str, source: str, no_cache: bool, test: bool):
method fetch (line 98) | def fetch(self) -> list[Data]:
class GZipFetcher (line 119) | class GZipFetcher(Fetcher):
method __init__ (line 120) | def __init__(
method fetch (line 133) | def fetch(self) -> list[Data]:
class GitFetcher (line 143) | class GitFetcher(Fetcher):
method __init__ (line 144) | def __init__(self, name: str, source: str, no_cache: bool, test: bool):
method fetch (line 147) | def fetch(self) -> str:
FILE: core/logger.py
function as_minutes (line 10) | def as_minutes(seconds: float) -> float:
class Logger (line 14) | class Logger:
method __init__ (line 19) | def __init__(
method print (line 26) | def print(self, msg: str):
method error (line 29) | def error(self, message):
method log (line 32) | def log(self, message):
method debug (line 36) | def debug(self, message):
method warn (line 40) | def warn(self, message):
method is_verbose (line 44) | def is_verbose(self):
method time_diff (line 47) | def time_diff(self):
method exception (line 50) | def exception(self):
method info (line 56) | def info(self, message):
method warning (line 59) | def warning(self, message):
FILE: core/models/__init__.py
class BaseModel (line 30) | class BaseModel:
method to_dict_v2 (line 32) | def to_dict_v2(self) -> dict[str, str | UUID | datetime | int | float]:
class Package (line 44) | class Package(Base):
method to_dict (line 75) | def to_dict(self):
class PackageManager (line 85) | class PackageManager(Base):
class Version (line 104) | class Version(Base):
method to_dict (line 139) | def to_dict(self):
class License (line 152) | class License(Base):
class DependsOn (line 169) | class DependsOn(Base):
method to_dict (line 207) | def to_dict(self):
class DependsOnType (line 216) | class DependsOnType(Base):
class LoadHistory (line 233) | class LoadHistory(Base):
class Source (line 253) | class Source(Base):
class URL (line 271) | class URL(Base):
method to_dict (line 297) | def to_dict(self):
class URLType (line 302) | class URLType(Base):
class User (line 319) | class User(Base):
method to_dict (line 342) | def to_dict(self):
class UserVersion (line 350) | class UserVersion(Base):
method to_dict (line 374) | def to_dict(self):
class UserPackage (line 381) | class UserPackage(Base):
method to_dict (line 405) | def to_dict(self):
class PackageURL (line 412) | class PackageURL(Base):
method to_dict (line 435) | def to_dict(self):
class LegacyDependency (line 442) | class LegacyDependency(Base):
class Canon (line 469) | class Canon(Base):
class CanonPackage (line 501) | class CanonPackage(Base):
class TeaRankRun (line 522) | class TeaRankRun(Base):
class TeaRank (line 537) | class TeaRank(Base):
FILE: core/scheduler.py
class Scheduler (line 13) | class Scheduler:
method __init__ (line 14) | def __init__(self, name: str, frequency: int = FREQUENCY):
method start (line 21) | def start(self, task: Callable, *args):
method stop (line 33) | def stop(self):
method run_now (line 39) | def run_now(self, task: Callable, *args):
FILE: core/structs.py
class CurrentGraph (line 9) | class CurrentGraph:
class URLKey (line 15) | class URLKey:
class CurrentURLs (line 21) | class CurrentURLs:
class Cache (line 27) | class Cache:
class DiffResult (line 35) | class DiffResult:
FILE: core/transformer.py
class Transformer (line 20) | class Transformer:
method __init__ (line 21) | def __init__(self, name: str):
method finder (line 34) | def finder(self, file_name: str) -> str:
method open (line 44) | def open(self, file_name: str) -> str:
method canonicalize (line 49) | def canonicalize(self, url: str) -> str:
method guess (line 52) | def guess(self, db_client: DB, url: str, package_managers: list[UUID])...
FILE: core/utils.py
function safe_int (line 6) | def safe_int(val: str) -> int | None:
function build_query_params (line 13) | def build_query_params(
function env_vars (line 25) | def env_vars(env_var: str, default: str) -> bool:
function convert_keys_to_snake_case (line 31) | def convert_keys_to_snake_case(data: dict[str, Any]) -> dict[str, Any]:
function is_github_url (line 45) | def is_github_url(url: str) -> bool:
function file_exists (line 50) | def file_exists(*args) -> str:
FILE: examples/sbom-meta/main.go
type config (line 23) | type config struct
type packageMeta (line 30) | type packageMeta struct
constant packageMetaFullSQL (line 39) | packageMetaFullSQL = `
constant packageMetaSQL (line 56) | packageMetaSQL = `
function main (line 71) | func main() {
function printPackagesMeta (line 159) | func printPackagesMeta(pms []packageMeta) {
function formatTime (line 178) | func formatTime(val interface{}) string {
function formatNumber (line 185) | func formatNumber(val interface{}) string {
function dedupePackages (line 192) | func dedupePackages(pms []packageMeta) []packageMeta {
function usage (line 204) | func usage() {
FILE: examples/visualizer/main.py
class Package (line 15) | class Package:
method __init__ (line 21) | def __init__(self, id: str):
method __str__ (line 27) | def __str__(self):
class Graph (line 31) | class Graph(rx.PyDiGraph):
method __init__ (line 32) | def __init__(self, *args, **kwargs):
method _get_or_create_package (line 39) | def _get_or_create_package(self, pkg_id: str) -> Package:
method safely_add_node (line 46) | def safely_add_node(self, pkg_id: str) -> int:
method safely_add_nodes (line 55) | def safely_add_nodes(self, nodes: list[str]) -> list[int]:
method pagerank (line 58) | def pagerank(self) -> None:
method nameless_nodes (line 63) | def nameless_nodes(self) -> list[str]:
method max_depth (line 66) | def max_depth(self) -> int:
class DB (line 70) | class DB:
method __init__ (line 73) | def __init__(self):
method connect (line 89) | def connect(self) -> None:
method select_id (line 99) | def select_id(self, package: str) -> int:
method select_deps (line 103) | def select_deps(self, ids: list[str]) -> dict[str, dict[str, str | set...
method select_name (line 119) | def select_name(self, ids: list[str]) -> list[tuple[str, str]]:
function larger_query (line 124) | def larger_query(db: DB, root_package: str, max_depth: int) -> Graph:
function display (line 173) | def display(graph: Graph):
function draw (line 192) | def draw(graph: Graph, package: str, img_type: str = "svg"):
function latest (line 267) | def latest(db: DB, package: str, depth: int, img_type: str):
FILE: examples/visualizer/monitor.py
class Result (line 17) | class Result:
method __init__ (line 18) | def __init__(self, **kwargs):
method __str__ (line 22) | def __str__(self):
class MonitoredDB (line 31) | class MonitoredDB(DB):
method __init__ (line 34) | def __init__(self):
method _monitor_query (line 39) | def _monitor_query(self, func: Callable) -> Callable:
method connect (line 50) | def connect(self):
function run_monitored (line 58) | def run_monitored(func: Callable, package: str) -> Result:
function compare_implementations (line 73) | def compare_implementations(package: str, runs: int = 3) -> dict[str, li...
function compare_results (line 89) | def compare_results(results: dict[str, list[Result]], runs: int) -> None:
FILE: package_managers/crates/db.py
class CratesDB (line 20) | class CratesDB(DB):
method __init__ (line 21) | def __init__(self, config: Config):
method set_current_graph (line 26) | def set_current_graph(self) -> None:
method set_current_urls (line 29) | def set_current_urls(self, urls: set[str]) -> None:
method delete_packages_by_import_id (line 32) | def delete_packages_by_import_id(self, import_ids: set[int]) -> None:
method get_cargo_id_to_chai_id (line 161) | def get_cargo_id_to_chai_id(self) -> dict[str, UUID]:
FILE: package_managers/crates/diff.py
class Diff (line 11) | class Diff:
method __init__ (line 12) | def __init__(self, config: Config, caches: Cache):
method diff_pkg (line 18) | def diff_pkg(self, pkg: Crate) -> tuple[UUID, Package | None, dict | N...
method diff_url (line 59) | def diff_url(self, pkg: Crate, new_urls: dict[URLKey, URL]) -> dict[UU...
method diff_pkg_url (line 110) | def diff_pkg_url(
method diff_deps (line 161) | def diff_deps(
method _resolve_dep_type (line 312) | def _resolve_dep_type(self, dep_type: DependencyType) -> UUID:
FILE: package_managers/crates/main.py
function identify_deletions (line 18) | def identify_deletions(transformer: CratesTransformer, db: CratesDB) -> ...
function main (line 57) | def main(config: Config, db: CratesDB):
FILE: package_managers/crates/structs.py
class DependencyType (line 8) | class DependencyType(IntEnum):
method __str__ (line 25) | def __str__(self):
class CrateDependency (line 30) | class CrateDependency:
class CrateUser (line 38) | class CrateUser:
class CrateLatestVersion (line 46) | class CrateLatestVersion:
class Crate (line 61) | class Crate:
class CanonUpdatePayload (line 74) | class CanonUpdatePayload(TypedDict):
class CanonPackageUpdatePayload (line 82) | class CanonPackageUpdatePayload(TypedDict):
FILE: package_managers/crates/transformer.py
class CratesTransformer (line 16) | class CratesTransformer(Transformer):
method __init__ (line 17) | def __init__(self, config: Config):
method _open_csv (line 32) | def _open_csv(self, file_name: str) -> Generator[dict[str, str], None,...
method parse (line 49) | def parse(self) -> None:
method _load_latest_versions (line 156) | def _load_latest_versions(self) -> tuple[set[int], dict[int, int]]:
method _load_users (line 167) | def _load_users(self) -> dict[int, CrateUser]:
FILE: package_managers/debian/db.py
class DebianDB (line 8) | class DebianDB(DB):
method __init__ (line 9) | def __init__(self, logger_name: str, config: Config):
method set_current_graph (line 13) | def set_current_graph(self) -> None:
method set_current_urls (line 17) | def set_current_urls(self, urls: set[str]) -> None:
method ingest_wrapper (line 21) | def ingest_wrapper(self, diff_result: DiffResult) -> None:
FILE: package_managers/debian/debian_sources.py
function build_package_to_source_mapping (line 6) | def build_package_to_source_mapping(
function enrich_package_with_source (line 46) | def enrich_package_with_source(
FILE: package_managers/debian/diff.py
class DebianDiff (line 16) | class DebianDiff:
method __init__ (line 17) | def __init__(self, config: Config, caches: Cache, db: DebianDB, logger...
method diff_pkg (line 24) | def diff_pkg(
method diff_url (line 68) | def diff_url(
method diff_pkg_url (line 107) | def diff_pkg_url(
method diff_deps (line 144) | def diff_deps(
method _generate_chai_urls (line 280) | def _generate_chai_urls(self, debian_data: DebianData) -> list[URLKey]:
FILE: package_managers/debian/main.py
function fetch (line 26) | def fetch(config: Config, logger: Logger) -> tuple[GZipFetcher, GZipFetc...
function diff (line 65) | def diff(
function run_pipeline (line 148) | def run_pipeline(config: Config, db: DebianDB, logger: Logger):
function main (line 203) | def main(config: Config, db: DebianDB, logger: Logger):
FILE: package_managers/debian/parser.py
class DebianParser (line 10) | class DebianParser:
method __init__ (line 11) | def __init__(self, content: str):
method parse (line 15) | def parse(self) -> Iterator[DebianData]:
method handle_line (line 68) | def handle_line(self, obj: DebianData, line: str) -> None:
method mapper (line 72) | def mapper(self, obj: DebianData, key: str, value: str) -> None:
function handle_depends (line 193) | def handle_depends(dependency: str) -> Depends:
function handle_maintainer (line 219) | def handle_maintainer(value: str) -> Maintainer:
FILE: package_managers/debian/scripts/investigate_sources.py
function parse_sources_file (line 16) | def parse_sources_file(file_path: str) -> dict[str, set[str]]:
function parse_packages_file (line 94) | def parse_packages_file(file_path: str) -> dict[str, str | None]:
function investigate_mapping (line 144) | def investigate_mapping(sources_file: str, packages_file: str) -> None:
function main (line 217) | def main():
FILE: package_managers/debian/scripts/test_investigate_sources.py
function binutils (line 9) | def binutils():
function linux (line 21) | def linux():
function test_binutils (line 86) | def test_binutils(binutils):
function test_linux (line 105) | def test_linux(linux):
FILE: package_managers/debian/structs.py
class Maintainer (line 6) | class Maintainer:
class File (line 12) | class File:
class Depends (line 19) | class Depends:
class Tag (line 25) | class Tag:
class DebianData (line 33) | class DebianData:
FILE: package_managers/homebrew/db.py
class HomebrewDB (line 6) | class HomebrewDB(DB):
method __init__ (line 7) | def __init__(self, logger_name: str, config: Config):
method set_current_graph (line 12) | def set_current_graph(self) -> None:
method set_current_urls (line 17) | def set_current_urls(self, urls: set[str]) -> None:
FILE: package_managers/homebrew/diff.py
class Diff (line 11) | class Diff:
method __init__ (line 12) | def __init__(self, config: Config, caches: Cache):
method diff_pkg (line 18) | def diff_pkg(self, pkg: Actual) -> tuple[UUID, Package | None, dict | ...
method diff_url (line 60) | def diff_url(
method diff_pkg_url (line 104) | def diff_pkg_url(
method diff_deps (line 155) | def diff_deps(
FILE: package_managers/homebrew/formulae.py
class HomebrewFetcher (line 15) | class HomebrewFetcher(Fetcher):
method __init__ (line 16) | def __init__(self, config: Config):
method fetch (line 24) | def fetch(self) -> list[Actual]:
FILE: package_managers/homebrew/main.py
function main (line 15) | def main(config: Config, db: HomebrewDB) -> None:
FILE: package_managers/homebrew/structs.py
class Actual (line 5) | class Actual:
FILE: package_managers/pkgx/db.py
class PkgxDB (line 8) | class PkgxDB(DB):
method __init__ (line 9) | def __init__(self, logger_name: str, config: Config):
method set_current_graph (line 13) | def set_current_graph(self) -> None:
method set_current_urls (line 18) | def set_current_urls(self) -> None:
FILE: package_managers/pkgx/diff.py
class PkgxDiff (line 15) | class PkgxDiff:
method __init__ (line 16) | def __init__(self, config: Config, caches: Cache, db: DB, logger: Logg...
method diff_pkg (line 23) | def diff_pkg(
method diff_url (line 56) | def diff_url(
method diff_pkg_url (line 97) | def diff_pkg_url(
method diff_deps (line 134) | def diff_deps(
FILE: package_managers/pkgx/loader.py
class PkgxLoader (line 17) | class PkgxLoader(DB):
method __init__ (line 18) | def __init__(self, config: Config, data: dict[str, Cache]):
method load_packages (line 25) | def load_packages(self) -> None:
method load_dependencies (line 101) | def load_dependencies(self) -> None:
FILE: package_managers/pkgx/main.py
function fetch (line 26) | def fetch(config: Config) -> GitFetcher:
function run_pipeline (line 45) | def run_pipeline(config: Config, db: PkgxDB):
function main (line 135) | def main():
FILE: package_managers/pkgx/parser.py
class Distributable (line 23) | class Distributable:
class Version (line 32) | class Version:
class Dependency (line 46) | class Dependency:
class EnvironmentVariable (line 52) | class EnvironmentVariable:
class DependencyBlock (line 58) | class DependencyBlock:
class Build (line 64) | class Build:
class Test (line 72) | class Test:
class PkgxPackage (line 80) | class PkgxPackage:
class PkgxParser (line 94) | class PkgxParser:
method __init__ (line 95) | def __init__(self, repo_path: str):
method find_package_yamls (line 98) | def find_package_yamls(self) -> Iterator[tuple[Path, str]]:
method is_vendored (line 116) | def is_vendored(self, data: dict[str, Any]) -> bool:
method parse_package_yaml (line 124) | def parse_package_yaml(self, file_path: Path) -> PkgxPackage | None:
method parse_packages (line 151) | def parse_packages(self) -> Iterator[tuple[PkgxPackage, str]]:
method _parse_dependency_list (line 158) | def _parse_dependency_list(
method _parse_build_section (line 222) | def _parse_build_section(self, build_data: Any, file_path_str: str) ->...
method _parse_test_section (line 266) | def _parse_test_section(self, test_data: Any, file_path_str: str) -> T...
method _parse_versions_section (line 300) | def _parse_versions_section(
method _parse_distributable_section (line 319) | def _parse_distributable_section(
method map_package_yaml_to_pkgx_package (line 339) | def map_package_yaml_to_pkgx_package(
FILE: package_managers/pkgx/url.py
function canonicalize (line 16) | def canonicalize(url: str) -> str:
function guess (line 20) | def guess(db_client: DB, package_managers: list[UUID], url: str) -> list...
function ask_pkgx (line 26) | def ask_pkgx(import_id: str) -> str | None:
function special_case (line 40) | def special_case(import_id: str, logger: Logger) -> str | None:
function generate_chai_urls (line 83) | def generate_chai_urls(
FILE: ranker/config.py
class ConfigDB (line 20) | class ConfigDB(DB):
method __init__ (line 21) | def __init__(self):
method get_homepage_url_type_id (line 24) | def get_homepage_url_type_id(self) -> UUID:
method get_npm_pm_id (line 33) | def get_npm_pm_id(self) -> UUID:
method get_canons_with_source_types (line 36) | def get_canons_with_source_types(
method get_pm_id_by_name (line 53) | def get_pm_id_by_name(self, name: str | list[str]) -> UUID:
class TeaRankConfig (line 69) | class TeaRankConfig:
method __init__ (line 70) | def __init__(self, db: ConfigDB) -> None:
method map_favorites (line 82) | def map_favorites(self, package_managers: list[str]) -> None:
method personalize (line 97) | def personalize(
method __str__ (line 125) | def __str__(self) -> str:
class PMConfig (line 129) | class PMConfig:
method __init__ (line 130) | def __init__(self, db: ConfigDB) -> None:
method __str__ (line 139) | def __str__(self) -> str:
class URLTypes (line 145) | class URLTypes:
method __init__ (line 146) | def __init__(self, db: ConfigDB) -> None:
method __str__ (line 150) | def __str__(self) -> str:
class DedupeConfig (line 154) | class DedupeConfig:
method __init__ (line 155) | def __init__(self, db: ConfigDB) -> None:
method __str__ (line 159) | def __str__(self) -> str:
class Config (line 164) | class Config:
method __init__ (line 165) | def __init__(self, db: ConfigDB) -> None:
method __str__ (line 171) | def __str__(self) -> str:
function load_config (line 175) | def load_config() -> Config:
function load_dedupe_config (line 180) | def load_dedupe_config() -> DedupeConfig:
FILE: ranker/db.py
class GraphDB (line 23) | class GraphDB(DB):
method __init__ (line 24) | def __init__(self, legacy_pm_id: UUID, system_pm_ids: list[UUID]):
method is_canon_populated (line 29) | def is_canon_populated(self) -> bool:
method is_canon_package_populated (line 33) | def is_canon_package_populated(self) -> bool:
method get_all_canons (line 37) | def get_all_canons(self) -> dict[str, UUID]:
method get_packages_with_urls (line 43) | def get_packages_with_urls(self) -> list[tuple[UUID, str, str, str]]:
method load_canonical_packages (line 61) | def load_canonical_packages(self, data: list[Canon]) -> None:
method load_canonical_package_mappings (line 95) | def load_canonical_package_mappings(self, data: list[CanonPackage]) ->...
method get_packages (line 141) | def get_packages(self) -> list[tuple[UUID, UUID]]:
method get_dependencies (line 151) | def get_dependencies(self, package_id: UUID) -> list[tuple[UUID]]:
method get_package_to_canon_mapping (line 162) | def get_package_to_canon_mapping(self) -> dict[UUID, UUID]:
method get_legacy_dependencies (line 172) | def get_legacy_dependencies(self, package_id: UUID) -> list[tuple[UUID]]:
method load_tea_ranks (line 182) | def load_tea_ranks(self, data: list[TeaRank]) -> None:
method load_tea_rank_runs (line 188) | def load_tea_rank_runs(self, data: list[TeaRankRun]) -> None:
method get_current_tea_rank_run (line 194) | def get_current_tea_rank_run(self) -> TeaRankRun | None:
FILE: ranker/dedupe.py
class DedupeDB (line 20) | class DedupeDB(DB):
method __init__ (line 21) | def __init__(self, config: DedupeConfig):
method get_current_canons (line 25) | def get_current_canons(self) -> dict[UUID, Canon]:
method get_current_canon_packages (line 31) | def get_current_canon_packages(self) -> dict[UUID, dict[str, UUID]]:
method get_packages_with_homepages (line 40) | def get_packages_with_homepages(self) -> list[tuple[Package, URL]]:
method get_all_package_names (line 51) | def get_all_package_names(self) -> dict[UUID, str]:
method ingest (line 56) | def ingest(
method add_with_flush (line 78) | def add_with_flush(self, session: Session, rows: list[BaseModel]) -> N...
function get_latest_homepage_per_package (line 83) | def get_latest_homepage_per_package(
function build_canon_update_payload (line 114) | def build_canon_update_payload(
function build_canon_package_update_payload (line 121) | def build_canon_package_update_payload(
function process_deduplication_changes (line 142) | def process_deduplication_changes(
function main (line 255) | def main(config: DedupeConfig, db: DedupeDB):
FILE: ranker/main.py
class PackageInfo (line 28) | class PackageInfo:
function load_graph (line 33) | def load_graph(
function main (line 98) | def main(config: Config, db: GraphDB) -> None:
FILE: ranker/naming.py
function compute_canon_name (line 10) | def compute_canon_name(url: str, package_name: str, existing_name: str =...
function check_if_better (line 34) | def check_if_better(best_guess: str, package_name: str, existing_name: s...
function extract_repo_name_from_url (line 49) | def extract_repo_name_from_url(url: str) -> str:
function score_name (line 67) | def score_name(name: str, best_guess: str) -> int:
function get_effective_canon_name (line 93) | def get_effective_canon_name(
FILE: ranker/rx_graph.py
class PackageNode (line 17) | class PackageNode:
class CHAI (line 27) | class CHAI(rx.PyDiGraph):
method __init__ (line 28) | def __init__(self):
method add_node (line 33) | def add_node(self, node: PackageNode) -> int:
method add_edge (line 40) | def add_edge(self, u: int, v: int, edge_data: Any) -> None:
method generate_personalization (line 47) | def generate_personalization(
method pagerank (line 57) | def pagerank(
method distribute (line 66) | def distribute(
FILE: ranker/utils/analyze_ranks.py
function get_latest_rank_file (line 29) | def get_latest_rank_file() -> Path:
function get_rank_file (line 36) | def get_rank_file(filename: str | None = None) -> Path:
function load_rank_data (line 57) | def load_rank_data(file_path: Path) -> dict[str, float]:
function get_output_filename (line 63) | def get_output_filename(input_path: Path) -> Path:
function get_package_data (line 74) | def get_package_data(ranks: dict[str, float], db_session: Session) -> pd...
function parse_args (line 122) | def parse_args() -> argparse.Namespace:
function main (line 136) | def main() -> None:
FILE: ranker/utils/parse_log.py
function parse_log_line (line 20) | def parse_log_line(line: str) -> tuple[float, int]:
function calculate_metrics (line 39) | def calculate_metrics(log_lines: list[str]) -> tuple[float, float]:
function main (line 84) | def main():
FILE: scripts/chai-legacy-loader/add_package_fields.py
function validate_uuid (line 21) | def validate_uuid(uuid_string: str) -> None:
function process_csv (line 29) | def process_csv(input_file: str, output_file: str, package_manager_id: s...
FILE: scripts/chai-legacy-loader/batch_insert_package_urls.py
class ChaiPackageUrlsDB (line 19) | class ChaiPackageUrlsDB:
method __init__ (line 22) | def __init__(self, logger: Logger):
method load_package_id_cache (line 37) | def load_package_id_cache(self) -> dict[str, uuid.UUID]:
method load_url_id_cache_from_db (line 50) | def load_url_id_cache_from_db(
method batch_insert_package_urls (line 69) | def batch_insert_package_urls(
method close (line 103) | def close(self):
function load_url_id_cache_from_file (line 111) | def load_url_id_cache_from_file(
function process_package_url_associations (line 154) | def process_package_url_associations(
FILE: scripts/chai-legacy-loader/batch_insert_urls.py
class ChaiDB (line 20) | class ChaiDB:
method __init__ (line 23) | def __init__(self):
method batch_insert_urls (line 39) | def batch_insert_urls(
method close (line 89) | def close(self):
function process_urls_for_batch_insert (line 98) | def process_urls_for_batch_insert(
FILE: scripts/chai-legacy-loader/copy_dependencies_no_thread.py
class LegacyDB (line 25) | class LegacyDB:
method __init__ (line 28) | def __init__(self, input_package_manager: PackageManager):
method __del__ (line 39) | def __del__(self):
method get_sql_content (line 44) | def get_sql_content(self, filename: str) -> str:
method create_server_cursor (line 52) | def create_server_cursor(self, sql_file: str, cursor_name: str) -> None:
method fetch_batch (line 77) | def fetch_batch(self, cursor_name: str, batch_size: int) -> list[tuple]:
method close_cursor (line 86) | def close_cursor(self, cursor_name: str) -> None:
class ChaiDB (line 94) | class ChaiDB:
method __init__ (line 97) | def __init__(self, config: Config):
method _get_package_map (line 131) | def _get_package_map(self) -> dict[str, uuid.UUID]:
method _load_existing_dependencies (line 149) | def _load_existing_dependencies(self, batch_size: int = BATCH_SIZE) ->...
method init_copy_expert (line 179) | def init_copy_expert(self) -> None:
method add_rows_to_copy_expert (line 185) | def add_rows_to_copy_expert(self, rows: list[tuple]) -> int:
method add_rows_with_flush (line 220) | def add_rows_with_flush(self, rows: list[tuple], max_buffer_size=10000...
method complete_copy_expert (line 233) | def complete_copy_expert(self):
function main (line 255) | def main(
FILE: scripts/npm-singleton/single.py
class ChaiDB (line 16) | class ChaiDB(DB):
method __init__ (line 17) | def __init__(self):
method check_package_exists (line 20) | def check_package_exists(self, derived_id: str) -> bool:
method get_package_by_derived_id (line 27) | def get_package_by_derived_id(self, derived_id: str) -> Package:
method load (line 33) | def load(
function get_package_info (line 71) | def get_package_info(npm_package: str) -> tuple[bool, dict, str | None]:
function get_homepage (line 86) | def get_homepage(package_info: dict) -> tuple[bool, str | None]:
function get_repository_url (line 95) | def get_repository_url(package_info: dict) -> tuple[bool, str | None]:
function get_source_url (line 104) | def get_source_url(package_info: dict) -> tuple[bool, str | None]:
function canonicalize (line 117) | def canonicalize(url: str) -> str:
function get_latest_version (line 121) | def get_latest_version(package_info: dict) -> tuple[bool, str | None]:
function get_version_info (line 129) | def get_version_info(package_info: dict, version: str) -> tuple[bool, di...
function get_latest_version_dependencies (line 136) | def get_latest_version_dependencies(
function get_latest_version_dev_dependencies (line 152) | def get_latest_version_dev_dependencies(
function check_dependencies_on_chai (line 168) | def check_dependencies_on_chai(
function generate_url (line 189) | def generate_url(url_type_id: UUID, url: str) -> URL:
function generate_legacy_dependencies (line 193) | def generate_legacy_dependencies(
function print_status_report (line 219) | def print_status_report(
function process_package (line 309) | def process_package(package_name: str, dry_run: bool = False) -> bool:
FILE: scripts/package_to_package/package_dependencies.py
function preprocess_version_string (line 26) | def preprocess_version_string(version_str: str) -> str:
function get_latest_version_info (line 176) | def get_latest_version_info(versions: list[Version]) -> Version | None:
function insert_legacy_dependencies (line 232) | def insert_legacy_dependencies(
function process_package_dependencies (line 272) | def process_package_dependencies(config: Config, session: Session) -> None:
FILE: scripts/upgrade_canons/create_deleted_canons.py
function read_package_data_from_csv (line 11) | def read_package_data_from_csv(filename: str) -> list[tuple[str, UUID]]:
function process_deleted_package (line 33) | def process_deleted_package(
function write_failures_csv (line 122) | def write_failures_csv(
function main (line 133) | def main():
FILE: scripts/upgrade_canons/db.py
class DB (line 14) | class DB:
method __init__ (line 15) | def __init__(self):
method get_urls_by_type (line 23) | def get_urls_by_type(
method db_execute_values (line 59) | def db_execute_values(
method ingest (line 77) | def ingest(
method close (line 115) | def close(self):
method get_canons_by_url_ids (line 119) | def get_canons_by_url_ids(self, url_ids: list[UUID]) -> list[tuple[UUI...
FILE: scripts/upgrade_canons/delete_non_canonical_urls.py
function write_to_csv (line 14) | def write_to_csv(filename: str, headers: list[str], data: list[tuple]):
function get_all_urls (line 21) | def get_all_urls(db: DB) -> list[tuple[UUID, str]]:
function identify_non_canonical_urls (line 36) | def identify_non_canonical_urls(urls: list[tuple[UUID, str]]) -> list[UU...
function delete_urls_from_database (line 55) | def delete_urls_from_database(db: DB, url_ids: list[UUID], dry_run: bool...
function main (line 88) | def main(dry_run: bool = False):
FILE: scripts/upgrade_canons/main.py
function is_one_url_canonical (line 17) | def is_one_url_canonical(urls: list[str]) -> bool:
function generate_canonical_url (line 22) | def generate_canonical_url(urls: list[str]) -> str:
function generate_new_url (line 31) | def generate_new_url(url: str, url_type_id: UUID, now: datetime) -> URL:
function generate_new_package_url (line 36) | def generate_new_package_url(
function analyze_packages_needing_canonicalization (line 44) | def analyze_packages_needing_canonicalization(
function create_url_and_package_url_objects (line 77) | def create_url_and_package_url_objects(
function main (line 98) | def main(db: DB, url_type: str, url_type_id: UUID, dry_run: bool):
FILE: scripts/upgrade_canons/registered_projects.py
function read_canon_ids_from_stdin (line 11) | def read_canon_ids_from_stdin() -> list[UUID]:
function process_canon_id (line 24) | def process_canon_id(db: DB, canon_id: UUID, dry_run: bool) -> tuple[boo...
function write_failures_csv (line 106) | def write_failures_csv(
function main (line 117) | def main():
FILE: scripts/upgrade_canons/structs.py
class URL (line 8) | class URL:
class PackageURL (line 17) | class PackageURL:
FILE: tests/conftest.py
function mock_logger (line 28) | def mock_logger():
function mock_url_types (line 39) | def mock_url_types():
function mock_dependency_types (line 58) | def mock_dependency_types():
function mock_sources (line 79) | def mock_sources():
function mock_package_managers (line 95) | def mock_package_managers():
function mock_pm_config (line 113) | def mock_pm_config(mock_package_managers):
function mock_config (line 125) | def mock_config(
function mock_user_types (line 163) | def mock_user_types():
function sample_package_data (line 180) | def sample_package_data():
function mock_csv_reader (line 218) | def mock_csv_reader():
function pytest_configure (line 245) | def pytest_configure(config):
function mock_db (line 257) | def mock_db():
FILE: tests/package_managers/crates/conftest.py
function package_ids (line 16) | def package_ids():
function packages (line 22) | def packages(package_ids):
function diff_instance (line 45) | def diff_instance(mock_config):
function crate_with_dependencies (line 65) | def crate_with_dependencies():
FILE: tests/package_managers/crates/test_crates_diff_deps.py
class TestDiffDeps (line 17) | class TestDiffDeps:
method test_existing_dependency_no_changes (line 20) | def test_existing_dependency_no_changes(
method test_dependency_changed_type (line 59) | def test_dependency_changed_type(
method test_new_dependency (line 110) | def test_new_dependency(
method test_removed_dependency (line 144) | def test_removed_dependency(
method test_multiple_dependency_types_same_package (line 183) | def test_multiple_dependency_types_same_package(
method test_multiple_dependency_types_build_vs_dev (line 234) | def test_multiple_dependency_types_build_vs_dev(
FILE: tests/package_managers/debian/conftest.py
function create_debian_package (line 4) | def create_debian_package(
FILE: tests/package_managers/debian/test_debian_diff.py
class TestDebianDifferentialLoading (line 10) | class TestDebianDifferentialLoading:
method test_package_exists_url_update (line 13) | def test_package_exists_url_update(self, mock_config, mock_logger, moc...
method test_package_exists_dependency_change (line 79) | def test_package_exists_dependency_change(self, mock_config, mock_logg...
method test_completely_new_package (line 161) | def test_completely_new_package(self, mock_config, mock_logger, mock_db):
method test_no_changes_scenario (line 208) | def test_no_changes_scenario(self, mock_config, mock_logger, mock_db):
method test_package_description_update (line 243) | def test_package_description_update(self, mock_config, mock_logger, mo...
method test_missing_dependency_handling (line 280) | def test_missing_dependency_handling(self, mock_config, mock_logger, m...
method test_dependency_type_priority_no_change (line 310) | def test_dependency_type_priority_no_change(
method test_dependency_type_change_runtime_to_build (line 356) | def test_dependency_type_change_runtime_to_build(
method test_dependency_type_change_build_to_runtime (line 408) | def test_dependency_type_change_build_to_runtime(
method test_dependency_type_priority_new_package (line 458) | def test_dependency_type_priority_new_package(
method test_debian_specific_dependencies (line 498) | def test_debian_specific_dependencies(self, mock_config, mock_logger, ...
class TestDebianDiffFunction (line 536) | class TestDebianDiffFunction:
method test_duplicate_package_paragraphs (line 539) | def test_duplicate_package_paragraphs(self, mock_config, mock_logger, ...
FILE: tests/package_managers/debian/test_debian_parser.py
function simple_package (line 14) | def simple_package():
function simple_source (line 37) | def simple_source():
function multiline_binary (line 64) | def multiline_binary():
function build_depends (line 77) | def build_depends():
class TestDebianParser (line 86) | class TestDebianParser:
method test_build_depends (line 89) | def test_build_depends(self, build_depends):
method test_multiline_binary (line 104) | def test_multiline_binary(self, multiline_binary):
method test_parse_package_data (line 121) | def test_parse_package_data(self, simple_package):
method test_parse_source_data (line 157) | def test_parse_source_data(self, simple_source):
FILE: tests/package_managers/debian/test_debian_sources.py
class TestPackageSourceMapping (line 8) | class TestPackageSourceMapping:
method test_build_package_to_source_mapping_with_binary_list (line 11) | def test_build_package_to_source_mapping_with_binary_list(
method test_build_package_to_source_mapping_no_binary_list (line 48) | def test_build_package_to_source_mapping_no_binary_list(
method test_enrich_package_with_explicit_source (line 72) | def test_enrich_package_with_explicit_source(self, mock_logger):
method test_enrich_package_no_explicit_source (line 104) | def test_enrich_package_no_explicit_source(self, mock_logger):
method test_enrich_package_missing_source_warning (line 129) | def test_enrich_package_missing_source_warning(self, caplog, mock_logg...
method test_enrich_package_preserves_existing_fields (line 156) | def test_enrich_package_preserves_existing_fields(self, mock_logger):
FILE: tests/package_managers/homebrew/conftest.py
function package_ids (line 13) | def package_ids() -> dict[str, UUID]:
function packages (line 19) | def packages(package_ids) -> dict[str, Package]:
function diff_instance (line 58) | def diff_instance(mock_config):
function homebrew_formula (line 80) | def homebrew_formula():
FILE: tests/package_managers/homebrew/test_homebrew_diff_deps.py
class TestDiffDeps (line 19) | class TestDiffDeps:
method test_new_package_not_in_cache (line 22) | def test_new_package_not_in_cache(self, packages, diff_instance, homeb...
method test_existing_package_adding_dependency (line 45) | def test_existing_package_adding_dependency(
method test_existing_package_removing_dependency (line 99) | def test_existing_package_removing_dependency(
method test_existing_package_changing_dependency_type (line 147) | def test_existing_package_changing_dependency_type(
method test_existing_package_no_dependency_changes (line 195) | def test_existing_package_no_dependency_changes(
method test_existing_package_same_dependency_multiple_times_no_changes (line 231) | def test_existing_package_same_dependency_multiple_times_no_changes(
method test_existing_package_same_dependency_multiple_times_yes_changes (line 268) | def test_existing_package_same_dependency_multiple_times_yes_changes(
FILE: tests/package_managers/pkgx/test_pkgx_diff.py
function create_pkgx_package (line 18) | def create_pkgx_package(
class TestPkgxDifferentialLoading (line 70) | class TestPkgxDifferentialLoading:
method test_package_exists_url_update (line 73) | def test_package_exists_url_update(self, mock_config, mock_logger, moc...
method test_package_exists_dependency_change (line 150) | def test_package_exists_dependency_change(self, mock_config, mock_logg...
method test_completely_new_package (line 224) | def test_completely_new_package(self, mock_config, mock_logger, mock_db):
method test_no_changes_scenario (line 272) | def test_no_changes_scenario(self, mock_config, mock_logger, mock_db):
method test_missing_dependency_handling (line 305) | def test_missing_dependency_handling(self, mock_config, mock_logger, m...
method test_dependency_type_priority_no_change (line 333) | def test_dependency_type_priority_no_change(
method test_dependency_type_change_runtime_to_build (line 374) | def test_dependency_type_change_runtime_to_build(
method test_dependency_type_change_build_to_runtime (line 421) | def test_dependency_type_change_build_to_runtime(
method test_dependency_type_priority_new_package (line 466) | def test_dependency_type_priority_new_package(
method test_dependency_type_priority_with_test (line 501) | def test_dependency_type_priority_with_test(
FILE: tests/package_managers/pkgx/test_special_case.py
class TestSpecialCase (line 14) | class TestSpecialCase:
method test_special_case_crates_io (line 17) | def test_special_case_crates_io(self, mock_logger):
method test_special_case_x_org (line 24) | def test_special_case_x_org(self, mock_logger):
method test_special_case_pkgx_sh (line 29) | def test_special_case_pkgx_sh(self, mock_logger):
method test_special_case_no_slashes (line 36) | def test_special_case_no_slashes(self, mock_logger):
method test_special_case_double_slashes (line 40) | def test_special_case_double_slashes(self, mock_logger):
FILE: tests/ranker/test_compute_canon_name.py
function test_extract_repo_name_from_url (line 22) | def test_extract_repo_name_from_url(url, best_guess):
function test_score_name (line 37) | def test_score_name(name, best_guess, expected_score):
function test_check_if_better (line 54) | def test_check_if_better(name, best_guess, package_name, expected):
function test_compute_canon_name (line 83) | def test_compute_canon_name(url, package_name, existing_name, expected):
FILE: tests/ranker/test_dedupe.py
function ids (line 20) | def ids():
function test_packages (line 38) | def test_packages(ids):
function test_urls (line 72) | def test_urls(ids):
function mock_dedupe_config (line 104) | def mock_dedupe_config(ids):
function mock_db (line 113) | def mock_db():
function capture_ingest_calls (line 118) | def capture_ingest_calls(mock_db):
class TestDedupe (line 134) | class TestDedupe:
method test_new_canon_new_mapping (line 137) | def test_new_canon_new_mapping(
method test_new_canon_update_mapping (line 185) | def test_new_canon_update_mapping(
method test_no_changes_needed (line 250) | def test_no_changes_needed(
method test_update_existing_mapping (line 296) | def test_update_existing_mapping(
method test_create_new_mapping (line 368) | def test_create_new_mapping(
method test_multiple_packages_same_homepage_creates_single_canon (line 418) | def test_multiple_packages_same_homepage_creates_single_canon(
method test_empty_urls_no_deduplication (line 485) | def test_empty_urls_no_deduplication(
method test_canon_name_update_when_url_changes (line 544) | def test_canon_name_update_when_url_changes(
method test_canon_update_with_multiple_packages (line 614) | def test_canon_update_with_multiple_packages(
method test_skip_when_load_disabled (line 686) | def test_skip_when_load_disabled(self, mock_dedupe_config, mock_db):
FILE: tests/ranker/test_rx_graph.py
function large_chai_graph (line 26) | def large_chai_graph() -> tuple[CHAI, dict[uuid.UUID, Decimal]]:
FILE: tests/scripts/upgrade_canons/test_analyze_packages_needing_canonicalization.py
class TestAnalyzePackagesNeedingCanonicalization (line 11) | class TestAnalyzePackagesNeedingCanonicalization:
method setup_method (line 14) | def setup_method(self):
method test_case_1_should_create_canonical_url (line 23) | def test_case_1_should_create_canonical_url(
method test_case_2_canonical_exists_in_database (line 73) | def test_case_2_canonical_exists_in_database(
method test_case_3_canonical_already_planned (line 101) | def test_case_3_canonical_already_planned(self, mock_normalize, mock_i...
method test_case_4_package_already_has_canonical (line 136) | def test_case_4_package_already_has_canonical(self, mock_is_canonical):
method test_mixed_scenarios (line 167) | def test_mixed_scenarios(self, mock_normalize, mock_is_canonical):
method test_empty_inputs (line 220) | def test_empty_inputs(self):
method test_edge_case_empty_url_list (line 227) | def test_edge_case_empty_url_list(self, mock_normalize, mock_is_canoni...
Condensed preview — 142 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (869K chars).
[
{
"path": ".dockerignore",
"chars": 160,
"preview": "# directories\ndata/\n.venv/\ntests/\nscripts/\nlogs/\ndb/ \n\n# other files\n.gitignore\ndocker-compose.yml\n.DS_Store\n.git\nREADME"
},
{
"path": ".github/actions/complain/action.yml",
"chars": 1387,
"preview": "name: teaxyz/chai/complain\ndescription: creates an issue for any failing tests\n\ninputs:\n test_function:\n description"
},
{
"path": ".github/workflows/chai-api.ci.yml",
"chars": 2063,
"preview": "name: api.ci\n\non:\n push:\n branches: [main]\n paths:\n - \"api/**\"\n pull_request:\n paths:\n - \"api/**\"\n\n"
},
{
"path": ".github/workflows/ci.yml",
"chars": 1541,
"preview": "name: CI\n\non:\n workflow_dispatch:\n inputs:\n env:\n description: \"The environment to test against\"\n "
},
{
"path": ".github/workflows/deploy.yml",
"chars": 6165,
"preview": "name: \"Release Chai\"\nrun-name: Release Chai - ${{ inputs.env || 'auto' }} - ${{ inputs.ref || github.ref }}\n\non:\n push:"
},
{
"path": ".gitignore",
"chars": 3295,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": ".python-version",
"chars": 5,
"preview": "3.11\n"
},
{
"path": "LICENSE",
"chars": 1069,
"preview": "MIT License\n\nCopyright (c) 2024 tea protocol\n\nPermission is hereby granted, free of charge, to any person obtaining a co"
},
{
"path": "README.md",
"chars": 6367,
"preview": "# CHAI\n\nCHAI is an attempt at an open-source data pipeline for package managers. The\ngoal is to have a pipeline that can"
},
{
"path": "alembic/.pkgx.yaml",
"chars": 133,
"preview": "# this .pkgx.yaml file is only for alembic\n\ndependencies:\n postgresql.org: 16\n alembic.sqlalchemy.org: 1\n psycopg.org"
},
{
"path": "alembic/Dockerfile",
"chars": 302,
"preview": "FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim\nRUN apt update && apt -y install postgresql\nRUN uv pip install alembi"
},
{
"path": "alembic/README.md",
"chars": 2203,
"preview": "# CHAI Data Migrations\n\nThis directory contains the Alembic configuration and migration scripts for managing the\ndatabas"
},
{
"path": "alembic/alembic.ini",
"chars": 972,
"preview": "[alembic]\nscript_location = .\nfile_template = %%(year)d%%(month).2d%%(day).2d_%%(hour).2d%%(minute).2d-%%(slug)s\n\nprepen"
},
{
"path": "alembic/env.py",
"chars": 1930,
"preview": "import os\nfrom logging.config import fileConfig\n\nfrom sqlalchemy import engine_from_config, pool\n\nfrom alembic import co"
},
{
"path": "alembic/init-script.sql",
"chars": 159,
"preview": "CREATE DATABASE chai;\n\n\\c chai\n\nCREATE EXTENSION IF NOT EXISTS \"pgcrypto\";\nCREATE EXTENSION IF NOT EXISTS \"uuid-ossp\";\nC"
},
{
"path": "alembic/load-values.sql",
"chars": 720,
"preview": "-- url types\nINSERT INTO \"url_types\" (\"name\")\nVALUES ('source'), ('homepage'), ('documentation'), ('repository')\nON CONF"
},
{
"path": "alembic/run_migrations.sh",
"chars": 760,
"preview": "#!/bin/bash\n\nset -uo pipefail\n\n# This script sets up the database, runs migrations, and loads initial values\n\n# Check if"
},
{
"path": "alembic/script.py.mako",
"chars": 635,
"preview": "\"\"\"${message}\n\nRevision ID: ${up_revision}\nRevises: ${down_revision | comma,n}\nCreate Date: ${create_date}\n\n\"\"\"\nfrom typ"
},
{
"path": "alembic/versions/20241028_1217-base_migration.py",
"chars": 18643,
"preview": "\"\"\"base migration\n\nRevision ID: 238d591d5310\nRevises:\nCreate Date: 2024-10-28 12:17:43.762965\n\n\"\"\"\n\nfrom collections.abc"
},
{
"path": "alembic/versions/20250312_0045-add_legacy_dependency_table.py",
"chars": 2691,
"preview": "\"\"\"add-legacy-dependency-table\n\nRevision ID: 89af630dc946\nRevises: 238d591d5310\nCreate Date: 2025-03-12 00:45:35.727521\n"
},
{
"path": "alembic/versions/20250312_2244-canons.py",
"chars": 2563,
"preview": "\"\"\"canons\n\nRevision ID: e7632ae1aff7\nRevises: 89af630dc946\nCreate Date: 2025-03-12 22:44:45.272179\n\n\"\"\"\n\nfrom collection"
},
{
"path": "alembic/versions/20250416_0223-add_ranks.py",
"chars": 2106,
"preview": "\"\"\"add-ranks\n\nRevision ID: 26e124131bf8\nRevises: e7632ae1aff7\nCreate Date: 2025-04-16 02:23:33.665773\n\n\"\"\"\n\nfrom collect"
},
{
"path": "alembic/versions/20250422_0940-add_unique_package_to_canon_packages.py",
"chars": 881,
"preview": "\"\"\"add-unique-package-to-canon-packages\n\nRevision ID: a41236bd2340\nRevises: 26e124131bf8\nCreate Date: 2025-04-22 09:40:2"
},
{
"path": "alembic/versions/20250508_1752-add_trgm_indexes.py",
"chars": 1483,
"preview": "\"\"\"add_trgm_indexes\n\nRevision ID: 7392d4d74ce2\nRevises: a41236bd2340\nCreate Date: 2025-05-08 17:52:40.417822\n\n\"\"\"\n\nfrom "
},
{
"path": "alembic/versions/20250529_2341-rename_canons_table_and_recreate.py",
"chars": 3499,
"preview": "\"\"\"rename_canons_table_and_recreate\n\nRevision ID: 542d79f30fc9\nRevises: 7392d4d74ce2\nCreate Date: 2025-05-29 23:41:38.46"
},
{
"path": "alembic/versions/20250529_2345-recreate_canon_foreign_keys.py",
"chars": 1571,
"preview": "\"\"\"recreate_canon_foreign_keys\n\nRevision ID: 3de32bb99a71\nRevises: 542d79f30fc9\nCreate Date: 2025-05-29 23:45:12.372951\n"
},
{
"path": "api/.dockerignore",
"chars": 34,
"preview": "/target\n.git\n.gitignore\nREADME.md\n"
},
{
"path": "api/.gitignore",
"chars": 35,
"preview": "/target\n**/*.rs.bk\nCargo.lock\n.env\n"
},
{
"path": "api/Cargo.toml",
"chars": 733,
"preview": "[package]\nname = \"chai-api\"\nversion = \"1.3.0\"\nedition = \"2021\"\nauthors = [\"Jacob Heider <jacob@pkgx.dev>\"]\ndescription ="
},
{
"path": "api/Dockerfile",
"chars": 640,
"preview": "FROM --platform=linux/amd64 lukemathwalker/cargo-chef:latest-rust-1.82.0 as chef\nWORKDIR /app\n\nFROM chef as planner\nCOPY"
},
{
"path": "api/README.md",
"chars": 11239,
"preview": "# CHAI API\n\nCHAI API is a REST API service for accessing the CHAI database, which contains package\nmanager data.\n\n## Fea"
},
{
"path": "api/src/app_state.rs",
"chars": 710,
"preview": "use dashmap::DashMap;\nuse deadpool_postgres::Pool;\nuse serde_json::Value;\nuse std::sync::Arc;\nuse std::time::{Duration, "
},
{
"path": "api/src/db.rs",
"chars": 1381,
"preview": "use deadpool_postgres::{Config, Pool, Runtime};\nuse std::env;\nuse std::sync::Arc;\nuse tokio_postgres::{Client, NoTls};\nu"
},
{
"path": "api/src/handlers.rs",
"chars": 21571,
"preview": "use actix_web::{get, post, web, HttpResponse, Responder};\nuse serde::{Deserialize, Serialize};\nuse serde_json::{json, Va"
},
{
"path": "api/src/logging.rs",
"chars": 314,
"preview": "use env_logger::Env;\n\npub fn setup_logger() {\n env_logger::init_from_env(Env::default().default_filter_or(\"info\"));\n}"
},
{
"path": "api/src/main.rs",
"chars": 1726,
"preview": "mod app_state;\nmod db;\nmod handlers;\nmod logging;\nmod utils;\n\nuse actix_web::{web, App, HttpServer};\nuse dashmap::DashMa"
},
{
"path": "api/src/utils.rs",
"chars": 4133,
"preview": "use actix_web::web::Query;\nuse chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};\nuse dashmap::DashMap;\nuse serde_json::"
},
{
"path": "core/README.md",
"chars": 3586,
"preview": "# Core Tools for CHAI Python Loaders\n\nThis directory contains a set of core tools and utilities to facilitate loading th"
},
{
"path": "core/config.py",
"chars": 4966,
"preview": "from enum import Enum\n\nfrom sqlalchemy import UUID\n\nfrom core.db import ConfigDB\nfrom core.logger import Logger\nfrom cor"
},
{
"path": "core/db.py",
"chars": 12400,
"preview": "import os\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import Any\nfrom uuid import UUID"
},
{
"path": "core/fetcher.py",
"chars": 5308,
"preview": "import gzip\nimport json\nimport os\nimport tarfile\nfrom dataclasses import dataclass\nfrom datetime import datetime\nfrom io"
},
{
"path": "core/logger.py",
"chars": 1483,
"preview": "import sys\nimport time\nimport traceback\n\nfrom core.utils import env_vars\n\nDEBUG = env_vars(\"DEBUG\", \"false\")\n\n\ndef as_mi"
},
{
"path": "core/models/__init__.py",
"chars": 16822,
"preview": "# __init__.py\nfrom __future__ import annotations\n\nfrom datetime import datetime\n\nfrom sqlalchemy import (\n Column,\n "
},
{
"path": "core/requirements.txt",
"chars": 1372,
"preview": "# This file was autogenerated by uv via the following command:\n# uv pip compile --group indexers -o core/requirements"
},
{
"path": "core/scheduler.py",
"chars": 1157,
"preview": "import time\nfrom collections.abc import Callable\nfrom os import getenv\nfrom threading import Thread\n\nimport schedule\n\nfr"
},
{
"path": "core/structs.py",
"chars": 1052,
"preview": "from dataclasses import dataclass\nfrom datetime import datetime\nfrom uuid import UUID\n\nfrom core.models import URL, Lega"
},
{
"path": "core/test.json",
"chars": 1555,
"preview": "[\n {\n 'id': UUID('b3133e5e-6d6b-458b-bd83-bf31032875a4'), \n 'package_id': UUID('7d6c7a3f-2c75-425f-8674"
},
{
"path": "core/transformer.py",
"chars": 1828,
"preview": "import csv\nimport os\n\nfrom permalint import normalize_url, possible_names\nfrom sqlalchemy import UUID\n\nfrom core.db impo"
},
{
"path": "core/utils.py",
"chars": 1591,
"preview": "from os import getenv\nfrom os.path import exists, join\nfrom typing import Any\n\n\ndef safe_int(val: str) -> int | None:\n "
},
{
"path": "db/README.md",
"chars": 4894,
"preview": "# CHAI Data Model\n\nThe CHAI data model is designed to represent the package manager data in a unified and\nconsistent for"
},
{
"path": "db/queries.md",
"chars": 1663,
"preview": "# Chai Data Exploration\n\n```sql\n-- Packages with the longest lifetime\nSELECT p.name,\nSUM(v.downloads) AS \"downloads\",\nco"
},
{
"path": "docker-compose.yml",
"chars": 4217,
"preview": "services:\n db:\n image: postgres\n shm_size: 256m\n environment:\n - POSTGRES_USER=postgres\n - POSTGRES_"
},
{
"path": "examples/sbom-meta/README.md",
"chars": 718,
"preview": "# SBOM-Meta\n\nAn example Chai application that displays package metadata for\n[SBOMs](https://github.com/anchore/syft) (so"
},
{
"path": "examples/sbom-meta/go.mod",
"chars": 10286,
"preview": "module sbom-meta\n\ngo 1.23.2\n\nrequire (\n\tgithub.com/anchore/syft v1.14.0\n\tgithub.com/caarlos0/env v3.5.0+incompatible\n\tgi"
},
{
"path": "examples/sbom-meta/go.sum",
"chars": 128818,
"preview": "cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=\ncloud.google.com/go v0.34.0/go.mod h1"
},
{
"path": "examples/sbom-meta/main.go",
"chars": 5197,
"preview": "package main\n\nimport (\n\t\"context\"\n\t\"encoding/json\"\n\t\"flag\"\n\t\"fmt\"\n\t\"os\"\n\t\"sort\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/dustin/"
},
{
"path": "examples/visualizer/README.md",
"chars": 1352,
"preview": "# Visualizer\n\nAn example Chai application that displays a graphical representation of a specific\npackage.\n\n## Requiremen"
},
{
"path": "examples/visualizer/main.py",
"chars": 9675,
"preview": "import argparse\nimport cProfile\nimport pstats\nfrom os import getenv\nfrom pstats import SortKey\n\nimport psycopg2\nimport r"
},
{
"path": "examples/visualizer/monitor.py",
"chars": 4212,
"preview": "import argparse\nimport time\nfrom collections import defaultdict\nfrom collections.abc import Callable\nfrom functools impo"
},
{
"path": "package_managers/crates/Dockerfile",
"chars": 309,
"preview": "FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim\n\n# Copy everything from the root directory (build context)\nCOPY . .\n\n"
},
{
"path": "package_managers/crates/README.md",
"chars": 3653,
"preview": "# crates\n\nThe crates service uses the database dump provided by crates.io and coerces their data\nmodel into CHAI's. It's"
},
{
"path": "package_managers/crates/db.py",
"chars": 6728,
"preview": "from uuid import UUID\n\nfrom sqlalchemy import select\n\nfrom core.config import Config\nfrom core.db import DB\nfrom core.mo"
},
{
"path": "package_managers/crates/diff.py",
"chars": 12723,
"preview": "from datetime import datetime\nfrom uuid import UUID, uuid4\n\nfrom core.config import Config\nfrom core.logger import Logge"
},
{
"path": "package_managers/crates/main.py",
"chars": 5193,
"preview": "from uuid import UUID\n\nfrom core.config import Config, PackageManager\nfrom core.fetcher import TarballFetcher\nfrom core."
},
{
"path": "package_managers/crates/structs.py",
"chars": 1919,
"preview": "from dataclasses import dataclass, field\nfrom datetime import datetime\nfrom enum import IntEnum\nfrom typing import Typed"
},
{
"path": "package_managers/crates/transformer.py",
"chars": 6355,
"preview": "import csv\nfrom collections.abc import Generator\n\nfrom core.config import Config\nfrom core.transformer import Transforme"
},
{
"path": "package_managers/debian/Dockerfile",
"chars": 268,
"preview": "FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim\n\n# Copy everything\nCOPY . .\n\n# Install core requirements using uv\nWOR"
},
{
"path": "package_managers/debian/README.md",
"chars": 1133,
"preview": "# Debian\n\n## Data Structure\n\n- Source represents the original upstream as Debian receives\n- Package is a binary that use"
},
{
"path": "package_managers/debian/db.py",
"chars": 1152,
"preview": "#!/usr/bin/env pkgx uv run\n\nfrom core.config import Config\nfrom core.db import DB, CurrentURLs\nfrom core.structs import "
},
{
"path": "package_managers/debian/debian_sources.py",
"chars": 3175,
"preview": "from core.logger import Logger\nfrom package_managers.debian.parser import DebianParser\nfrom package_managers.debian.stru"
},
{
"path": "package_managers/debian/diff.py",
"chars": 11767,
"preview": "#!/usr/bin/env pkgx uv run\n\nfrom datetime import datetime\nfrom uuid import UUID, uuid4\n\nfrom core.config import Config\nf"
},
{
"path": "package_managers/debian/main.py",
"chars": 8037,
"preview": "#!/usr/bin/env pkgx uv run\n\nimport os\nimport time\nfrom datetime import datetime\nfrom uuid import UUID\n\nfrom core.config "
},
{
"path": "package_managers/debian/parser.py",
"chars": 9049,
"preview": "import re\nfrom collections.abc import Iterator\n\nfrom permalint import normalize_url\n\nfrom package_managers.debian.struct"
},
{
"path": "package_managers/debian/scripts/investigate_sources.py",
"chars": 9083,
"preview": "#!/usr/bin/env pkgx uv run\n\n\"\"\"\nScript to investigate the relationship between Debian sources and packages files.\nThis h"
},
{
"path": "package_managers/debian/scripts/test_investigate_sources.py",
"chars": 56369,
"preview": "from unittest.mock import mock_open, patch\n\nimport pytest\n\nfrom package_managers.debian.scripts.investigate_sources impo"
},
{
"path": "package_managers/debian/structs.py",
"chars": 2758,
"preview": "from dataclasses import dataclass, field\n\n\n# structures\n@dataclass\nclass Maintainer:\n name: str = field(default_facto"
},
{
"path": "package_managers/homebrew/Dockerfile",
"chars": 311,
"preview": "FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim\n\n# Copy everything from the root directory (build context)\nCOPY . .\n\n"
},
{
"path": "package_managers/homebrew/README.md",
"chars": 1472,
"preview": "# Homebrew\n\nThe Homebrew service uses Homebrew's JSON API Documentation to build the Homebrew\ndata model, using a diff a"
},
{
"path": "package_managers/homebrew/db.py",
"chars": 786,
"preview": "from core.config import Config\nfrom core.db import DB, CurrentURLs\nfrom core.structs import CurrentGraph\n\n\nclass Homebre"
},
{
"path": "package_managers/homebrew/diff.py",
"chars": 10349,
"preview": "from datetime import datetime\nfrom uuid import UUID, uuid4\n\nfrom core.config import Config\nfrom core.logger import Logge"
},
{
"path": "package_managers/homebrew/formulae.py",
"chars": 2750,
"preview": "import re\nfrom typing import Any\n\nfrom permalint import normalize_url\nfrom requests import get\n\nfrom core.config import "
},
{
"path": "package_managers/homebrew/main.py",
"chars": 3317,
"preview": "#! /usr/bin/env pkgx +python@3.11 uv run\n\nfrom datetime import datetime\nfrom uuid import UUID\n\nfrom core.config import C"
},
{
"path": "package_managers/homebrew/structs.py",
"chars": 384,
"preview": "from dataclasses import dataclass\n\n\n@dataclass\nclass Actual:\n formula: str\n description: str\n license: str\n "
},
{
"path": "package_managers/pkgx/Dockerfile",
"chars": 267,
"preview": "FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim\n\n# Copy everything\nCOPY . .\n\n# Install core requirements using uv\nWOR"
},
{
"path": "package_managers/pkgx/db.py",
"chars": 768,
"preview": "#!/usr/bin/env pkgx uv run\n\nfrom core.config import Config\nfrom core.db import DB, CurrentURLs\nfrom core.structs import "
},
{
"path": "package_managers/pkgx/diff.py",
"chars": 10810,
"preview": "#!/usr/bin/env pkgx uv run\n\nfrom datetime import datetime\nfrom uuid import UUID, uuid4\n\nfrom core.config import Config\nf"
},
{
"path": "package_managers/pkgx/loader.py",
"chars": 8317,
"preview": "from sqlalchemy import select\nfrom sqlalchemy.dialects.postgresql import insert as pg_insert\n\nfrom core.config import Co"
},
{
"path": "package_managers/pkgx/main.py",
"chars": 4982,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run\n\nimport os\nimport time\nfrom datetime import datetime\nfrom uuid import UUID\n\nfrom"
},
{
"path": "package_managers/pkgx/parser.py",
"chars": 15751,
"preview": "from collections.abc import Iterator\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing impor"
},
{
"path": "package_managers/pkgx/url.py",
"chars": 3718,
"preview": "import re\nfrom uuid import UUID\n\nfrom permalint import normalize_url, possible_names\nfrom requests import Response, get\n"
},
{
"path": "pkgx.yaml",
"chars": 263,
"preview": "# this is the pkgx config across all the services covered by docker-compose\ndependencies:\n python.org: ~3.11\n xcfile.d"
},
{
"path": "pyproject.toml",
"chars": 1858,
"preview": "[project]\nname = \"chai\"\nversion = \"1.0.0\"\ndescription = \"An open-source data pipeline for all package managers\"\nauthors "
},
{
"path": "ranker/.dockerignore",
"chars": 8,
"preview": "prompts/"
},
{
"path": "ranker/.gitignore",
"chars": 8,
"preview": "prompts/"
},
{
"path": "ranker/Dockerfile",
"chars": 312,
"preview": "FROM python:3.11\n\n# Copy everything\nCOPY . . \n\n# Install core requirements \nWORKDIR /core \nRUN pip install --no-cache-di"
},
{
"path": "ranker/README.md",
"chars": 2096,
"preview": "# ranker\n\ngenerates a deduplicated graph across all CHAI package managers by URL, and publishes a\ntea_rank\n\n## Requireme"
},
{
"path": "ranker/config.py",
"chars": 6313,
"preview": "from dataclasses import dataclass\nfrom decimal import Decimal, getcontext\nfrom uuid import UUID\n\nfrom sqlalchemy import "
},
{
"path": "ranker/db.py",
"chars": 7282,
"preview": "from uuid import UUID\n\nfrom sqlalchemy.dialects.postgresql import insert as pg_insert\n\nfrom core.db import DB\nfrom core."
},
{
"path": "ranker/dedupe.py",
"chars": 11685,
"preview": "#!/usr/bin/env uv run --with sqlalchemy==2.0.34 --with permalint==0.1.12\nfrom datetime import datetime\nfrom uuid import "
},
{
"path": "ranker/main.py",
"chars": 5562,
"preview": "#! /usr/bin/env pkgx +python@3.11 uv run\n\n# /// script\n# dependencies = [\n# \"permalint==0.1.12\",\n# \"sqlalchemy==2.0."
},
{
"path": "ranker/naming.py",
"chars": 3087,
"preview": "#!/usr/bin/env uv run --with permalint==0.1.12\nfrom uuid import UUID\n\nfrom permalint import possible_names\n\nfrom core.mo"
},
{
"path": "ranker/requirements.txt",
"chars": 260,
"preview": "# This file was autogenerated by uv via the following command:\n# uv pip compile --group ranker -o ranker/requirements"
},
{
"path": "ranker/rx_graph.py",
"chars": 4993,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run\n\nfrom collections import defaultdict, deque\nfrom dataclasses import dataclass, f"
},
{
"path": "ranker/utils/analyze_ranks.py",
"chars": 4708,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run --with pandas --with sqlalchemy\n\n\"\"\"Script to analyze rank data and generate for"
},
{
"path": "ranker/utils/parse_log.py",
"chars": 3049,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run\n\n\"\"\"\nParse graph run log to calculate processing metrics.\n\nThis script analyzes "
},
{
"path": "scripts/chai-legacy-loader/README.md",
"chars": 2392,
"preview": "# CHAI Legacy Data Loader\n\nTools for loading legacy CHAI data into the current CHAI database framework.\n\n> [!NOTE]\n> Thi"
},
{
"path": "scripts/chai-legacy-loader/add_package_fields.py",
"chars": 3812,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run\n\n\"\"\"\nFor a csv generated from legacy chai, this script adds the id, created_at, "
},
{
"path": "scripts/chai-legacy-loader/batch_insert_package_urls.py",
"chars": 13982,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run --with psycopg2==2.9.9\n\nimport argparse\nimport csv\nimport os\nimport uuid\nfrom da"
},
{
"path": "scripts/chai-legacy-loader/batch_insert_urls.py",
"chars": 10693,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run --with psycopg2==2.9.9\n\nimport argparse\nimport csv\nimport os\nimport uuid\nfrom da"
},
{
"path": "scripts/chai-legacy-loader/copy_dependencies_no_thread.py",
"chars": 12171,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run\nimport argparse\nimport io\nimport os\nimport uuid\n\nimport psycopg2\nimport psycopg2"
},
{
"path": "scripts/chai-legacy-loader/pkgx.yaml",
"chars": 69,
"preview": "dependencies:\n - python@3.11\n - postgresql.org@16\n - astral.sh/uv\n"
},
{
"path": "scripts/chai-legacy-loader/sql/dependencies.sql",
"chars": 335,
"preview": "-- from old CHAI's structure, the sources table stores dependencies from package to \n-- package\n-- the projects tables s"
},
{
"path": "scripts/chai-legacy-loader/sql/packages.sql",
"chars": 314,
"preview": "-- TODO: swap npm for $1, and update the scripts\nselect \n\tconcat('npm', '/', project_name) as \"derived_id\",\n\tproject_nam"
},
{
"path": "scripts/chai-legacy-loader/sql/urls.sql",
"chars": 204,
"preview": "select \n\tid as import_id,\n\t\"source\", \n\thomepage \nfrom projects\nwhere \n\t'npm' = any(package_managers)\n\tand created_at < '"
},
{
"path": "scripts/npm-singleton/README.md",
"chars": 3877,
"preview": "# NPM Singleton Package Loader\n\nA utility script for loading a single NPM package and its metadata into the CHAI databas"
},
{
"path": "scripts/npm-singleton/pkgx.yaml",
"chars": 52,
"preview": "dependencies:\n xcfile.dev: \"*\"\n python.org: ^3.11\n"
},
{
"path": "scripts/npm-singleton/single.py",
"chars": 13669,
"preview": "#!/usr/bin/env pkgx +python@3.11 uv run --with requests==2.31.0 --with permalint==0.1.15\nimport argparse\nimport sys\nfrom"
},
{
"path": "scripts/package_to_package/package_dependencies.py",
"chars": 14729,
"preview": "#! /usr/bin/env pkgx +python@3.11 uv run\nimport argparse\nimport re\nimport sys\nfrom typing import Any\n\nfrom packaging imp"
},
{
"path": "scripts/upgrade_canons/.gitignore",
"chars": 5,
"preview": "*.csv"
},
{
"path": "scripts/upgrade_canons/README.md",
"chars": 1633,
"preview": "# Upgrade Canons Scripts\n\nCollection of scripts for managing canonical URLs and Canon IDs in CHAI database.\n\n## Scripts "
},
{
"path": "scripts/upgrade_canons/create_deleted_canons.py",
"chars": 5720,
"preview": "#!/usr/bin/env pkgx uv run\n\nimport argparse\nimport csv\nimport sys\nfrom uuid import UUID\n\nfrom scripts.upgrade_canons.db "
},
{
"path": "scripts/upgrade_canons/db.py",
"chars": 4249,
"preview": "from collections import defaultdict\nfrom os import getenv\nfrom uuid import UUID\n\nimport psycopg2\nfrom psycopg2.extras im"
},
{
"path": "scripts/upgrade_canons/delete_non_canonical_urls.py",
"chars": 4513,
"preview": "#!/usr/bin/env pkgx uv run\n\nimport argparse\nimport csv\nimport sys\nimport warnings\nfrom uuid import UUID\n\nfrom permalint "
},
{
"path": "scripts/upgrade_canons/main.py",
"chars": 4847,
"preview": "#!/usr/bin/env uv run --with psycopg2==2.9.10 --with permalint==0.1.14\n\nimport argparse\nimport warnings\nfrom datetime im"
},
{
"path": "scripts/upgrade_canons/registered_projects.py",
"chars": 4603,
"preview": "#!/usr/bin/env pkgx uv run\n\nimport argparse\nimport csv\nimport sys\nfrom uuid import UUID\n\nfrom scripts.upgrade_canons.db "
},
{
"path": "scripts/upgrade_canons/structs.py",
"chars": 428,
"preview": "from dataclasses import dataclass\nfrom datetime import datetime\nfrom uuid import UUID\n\n\n# let's make classes defining th"
},
{
"path": "tests/README.md",
"chars": 8450,
"preview": "# CHAI Test Suite\n\nThis directory contains the test suite for the CHAI package indexer. All tests are written using [pyt"
},
{
"path": "tests/conftest.py",
"chars": 7899,
"preview": "\"\"\"\nCommon test fixtures and configurations for pytest.\n\nThis module provides reusable fixtures for testing the CHAI pac"
},
{
"path": "tests/package_managers/crates/conftest.py",
"chars": 2534,
"preview": "from datetime import datetime\nfrom uuid import uuid4\n\nimport pytest\n\nfrom core.models import Package\nfrom core.structs i"
},
{
"path": "tests/package_managers/crates/test_crates_diff_deps.py",
"chars": 10545,
"preview": "\"\"\"\nTest the diff_deps functionality for the crates package manager.\n\nThis module tests the Diff.diff_deps method which "
},
{
"path": "tests/package_managers/debian/conftest.py",
"chars": 1448,
"preview": "from package_managers.debian.parser import DebianData, Depends\n\n\ndef create_debian_package(\n package: str = \"test-pac"
},
{
"path": "tests/package_managers/debian/test_debian_diff.py",
"chars": 20392,
"preview": "from uuid import uuid4\n\nfrom core.models import URL, LegacyDependency, Package, PackageURL\nfrom core.structs import Cach"
},
{
"path": "tests/package_managers/debian/test_debian_parser.py",
"chars": 7369,
"preview": "\"\"\"\nTest Debian package parser functionality.\n\nThis module tests the DebianParser class which parses Debian package\nand "
},
{
"path": "tests/package_managers/debian/test_debian_sources.py",
"chars": 7219,
"preview": "from package_managers.debian.main import (\n build_package_to_source_mapping,\n enrich_package_with_source,\n)\nfrom t"
},
{
"path": "tests/package_managers/homebrew/conftest.py",
"chars": 2992,
"preview": "from datetime import datetime\nfrom uuid import UUID, uuid4\n\nimport pytest\n\nfrom core.models import Package\nfrom core.str"
},
{
"path": "tests/package_managers/homebrew/test_homebrew_diff_deps.py",
"chars": 11707,
"preview": "\"\"\"\nTest the diff_deps functionality for the homebrew package manager.\n\nThis module tests the Diff.diff_deps method whic"
},
{
"path": "tests/package_managers/pkgx/test_pkgx_diff.py",
"chars": 20209,
"preview": "#!/usr/bin/env pkgx uv run\n\nfrom unittest.mock import patch\nfrom uuid import uuid4\n\nfrom core.models import URL, LegacyD"
},
{
"path": "tests/package_managers/pkgx/test_special_case.py",
"chars": 1577,
"preview": "\"\"\"\nTest special case URL handling in PkgxTransformer.\n\nThis module tests the special_case method which handles URL tran"
},
{
"path": "tests/ranker/test_compute_canon_name.py",
"chars": 2460,
"preview": "#!/usr/bin/env uv run --with pytest\nimport pytest\n\nfrom ranker.naming import (\n check_if_better,\n compute_canon_na"
},
{
"path": "tests/ranker/test_dedupe.py",
"chars": 26126,
"preview": "\"\"\"\nTest the package deduplication functionality in the ranker.\n\nThis module tests the dedupe.main function which handle"
},
{
"path": "tests/ranker/test_rx_graph.py",
"chars": 1822,
"preview": "\"\"\"\nTest the CHAI graph ranking algorithm.\n\nThis module tests the rx_graph module which implements a custom graph-based\n"
},
{
"path": "tests/scripts/upgrade_canons/test_analyze_packages_needing_canonicalization.py",
"chars": 9153,
"preview": "#!/usr/bin/env pkgx uv run\n\nfrom unittest.mock import call, patch\nfrom uuid import UUID\n\nimport pytest\n\nfrom scripts.upg"
}
]
About this extraction
This page contains the full source code of the teaxyz/chai GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 142 files (809.9 KB), approximately 249.4k tokens, and a symbol index with 617 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.