} props.osdViewerRef
* A ref to the active OpenSeadragon viewer. Must expose `setFilterOptions`.
*
* @example
*
*/
export default function ImageFilters({filterPluginRef}) {
const [gamma, setGamma] = useState(1.0);
const [invert, setInvert] = useState(false);
const [threshold, setThreshold] = useState(0);
// Debounced bridge to OSD filter pipeline
const updateFilters = debounce(() => {
// Get the plugin instance from the ref
const plugin = filterPluginRef.current;
if (!plugin) return;
const processors = [];
if (gamma !== 1 && gamma >= 0 && gamma <= 5) {
processors.push(GAMMA(gamma));
}
if (invert) {
processors.push(INVERT());
}
if (threshold > 0 && threshold <= 255) {
processors.push(THRESHOLDING(threshold));
}
//Call setFilterOptions on the PLUGIN
plugin.setFilterOptions({
filters: {processors},
});
}, 100);
// Apply filters when any control changes
useEffect(() => {
updateFilters();
return updateFilters.cancel; // cleanup debounce
}, [gamma, invert, threshold]); // eslint-disable-line react-hooks/exhaustive-deps
const handleReset = () => {
setGamma(1.0);
setInvert(false);
setThreshold(0);
};
return (
);
}
================================================
FILE: frontend/src/viewer/InvertFilterForm.jsx
================================================
/**
* InvertFilterForm
*
* Simple on/off control for an invert color filter. Parent manages state and
* passes the current value plus a setter.
*
* @component
* @param {Object} props
* @param {boolean} props.invert - Current invert state
* @param {(value: boolean) => void} props.setInvert - Setter for invert state
*/
export default function InvertFilterForm({invert, setInvert}) {
const handleChange = (e) => {
setInvert(e.target.checked);
};
const handleReset = () => {
setInvert(false);
};
return (
);
}
================================================
FILE: frontend/src/viewer/KeyboardHelpModal.jsx
================================================
import KeyboardShortcutRow from './KeyboardShortcutRow';
/*
KeyboardHelpModal
Bootstrap modal that lists viewer keyboard shortcuts. Rows are rendered
with KeyboardShortcutRow.
Usage:
- Trigger with data-bs-target="#keyboard-help-modal"
- Presentational only
Accessibility:
- Uses role="dialog" and Bootstrap aria attributes
- Close button has aria-label
*/
export default function KeyboardHelpModal() {
return (
);
}
================================================
FILE: frontend/src/viewer/KeyboardShortcutRow.jsx
================================================
import React from 'react';
/**
* KeyboardShortcutRow
*
* Renders one table row for a keyboard shortcut. Shows the key sequence in a
* row header cell and the action description in an adjacent cell.
*
* Rendering:
* - Keys are comma separated with a space
* - Keys are placed in a | , description in a |
*
* Accessibility:
* - provides semantic markup for key names
* - Consumers should ensure the surrounding table has proper headers or caption
*
* @param {Array<{text: string, wrap: boolean}>} keys - Ordered keys to display.
* When wrap is true the key is wrapped in , otherwise rendered as plain
* text.
* @param {string} description - Human readable description of the shortcut
* action.
* @returns {JSX.Element}
*/
export default function KeyboardShortcutRow({keys, description}) {
return (
|
{keys.map((key, i) => (
{key.wrap ? {key.text} : key.text}
{i < keys.length - 1 && ', '}
))}
|
{description} |
);
}
================================================
FILE: frontend/src/viewer/ThresholdFilterForm.jsx
================================================
/**
* Controls the binarization threshold used by the image viewer filter.
*
* Behavior:
* - Number input and range slider stay in sync.
* - Up and down arrow buttons change the value by 1 within 0-255.
* - Reset sets the threshold to 0.
*
* Accessibility:
* - Inputs have associated labels with visually hidden text.
* - Increment and decrement buttons include hidden text for screen readers.
*
* @param {number} threshold - Current threshold value in the range 0-255.
* @param {Function} setThreshold - Setter to update the threshold.
* @returns {JSX.Element}
*/
export default function ThresholdFilterForm({threshold, setThreshold}) {
const handleNumberChange = (e) => {
setThreshold(parseInt(e.target.value, 10));
};
const handleRangeChange = (e) => {
setThreshold(parseInt(e.target.value, 10));
};
const handleReset = () => {
setThreshold(0);
};
const stepUp = () => {
setThreshold((prev) => Math.min(prev + 1, 255));
};
const stepDown = () => {
setThreshold((prev) => Math.max(prev - 1, 0));
};
return (
);
}
================================================
FILE: frontend/src/viewer/Viewer.jsx
================================================
import React, {useEffect, useRef, useState} from 'react';
import OpenSeadragon from 'openseadragon';
import {initializeFiltering} from 'openseadragon-filters';
import screenfull from 'screenfull';
import {prefixUrl, contactUrl} from '../config.js';
import ViewerControls from './Controls';
import ImageFilters from './ImageFilters';
import KeyboardHelpModal from './KeyboardHelpModal';
/**
* Viewer
*
* Mounts an OpenSeadragon instance, wires up UI controls and filter panels,
* and exposes a fullscreen toggle. Cleans up the viewer on unmount.
*
* Behavior:
* - Initializes OpenSeadragon with filtering support and common UI buttons
* - On "open" event, recenters via viewport.goHome(true)
* - On "open-failed", logs an error and shows an alert with a contact URL
* - Stores the live OSD instance on window.seadragonViewer for external use
* - Destroys the OSD instance during cleanup to avoid leaks
*
* Dependencies:
* - Requires the "openseadragon-filters" plugin to be imported once
* - Uses the "screenfull" library for fullscreen where available
*
* @param {string} imageUrl - Source image URL used by OpenSeadragon.
* @param {Function} onLayoutHorizontal - Callback to switch to horizontal layout.
* @param {Function} onLayoutVertical - Callback to switch to vertical layout.
* @returns {JSX.Element}
*/
export default function Viewer({
imageUrl,
onLayoutHorizontal,
onLayoutVertical,
}) {
const viewerRef = useRef(null); // For OSD
const containerRef = useRef(null); // For Fullscreen wrapper
const osdViewerRef = useRef(null);
const filterPluginRef = useRef(null);
// State to track fullscreen changes
const [isFullscreen, setIsFullscreen] = useState(false);
// Add listener for fullscreen changes
useEffect(() => {
const handler = () => {
setIsFullscreen(screenfull.isFullscreen);
};
if (screenfull.isEnabled) {
screenfull.on('change', handler);
}
return () => {
if (screenfull.isEnabled) {
screenfull.off('change', handler);
}
};
}, []);
useEffect(() => {
if (!viewerRef.current || !imageUrl) return;
osdViewerRef.current = OpenSeadragon({
element: viewerRef.current,
prefixUrl: prefixUrl,
tileSources: {
type: 'image',
url: `${imageUrl}?canvas`,
},
gestureSettingsTouch: {
pinchRotate: true,
},
showNavigator: true,
showRotationControl: true,
showFlipControl: true,
zoomInButton: 'viewer-zoom-in',
zoomOutButton: 'viewer-zoom-out',
homeButton: 'viewer-home',
rotateLeftButton: 'viewer-rotate-left',
rotateRightButton: 'viewer-rotate-right',
flipButton: 'viewer-flip',
crossOriginPolicy: 'Anonymous',
drawer: 'canvas',
defaultZoomLevel: 0,
homeFillsView: false,
});
window.seadragonViewer = osdViewerRef.current;
osdViewerRef.current.addHandler('open', () => {
setTimeout(() => {
osdViewerRef.current.viewport.goHome(true);
}, 0);
});
osdViewerRef.current.addHandler('open-failed', () => {
console.error('Unable to display image');
alert(`Unable to display image. Contact us at ${contactUrl}`);
});
// Initialize the plugin instance - filtering using the ESM method
filterPluginRef.current = initializeFiltering(osdViewerRef.current);
return () => {
if (osdViewerRef.current) {
osdViewerRef.current.destroy();
osdViewerRef.current = null;
}
// Clear the plugin ref on unmount
filterPluginRef.current = null;
};
}, [imageUrl]);
const toggleFullscreen = (e) => {
e.preventDefault();
if (!screenfull.isEnabled) return;
if (screenfull.isFullscreen) {
screenfull.exit();
} else {
// Request fullscreen on the wrapper, not just the image
screenfull.request(containerRef.current);
}
};
return (
);
}
================================================
FILE: frontend/vite.config.js
================================================
import {defineConfig} from 'vite';
import react from '@vitejs/plugin-react';
import {viteStaticCopy} from 'vite-plugin-static-copy';
export default defineConfig({
base: '/static/frontend/',
plugins: [
react(),
viteStaticCopy({
targets: [
{
src: 'node_modules/openseadragon/build/openseadragon/images/*',
dest: 'openseadragon-images',
},
],
}),
],
build: {
outDir: '../static/frontend',
minify: false,
emptyOutDir: true,
rollupOptions: {
output: {
entryFileNames: 'js/[name].js',
chunkFileNames: 'js/[name].js',
assetFileNames: ({name}) =>
name && name.endsWith('.css')
? 'css/[name][extname]'
: 'assets/[name][extname]',
},
},
},
});
================================================
FILE: importer/Dockerfile
================================================
FROM python:3.12-slim-bookworm
## Add the wait script to the image
ADD https://github.com/ufoscout/docker-compose-wait/releases/download/2.2.1/wait /wait
RUN chmod +x /wait
ENV DEBIAN_FRONTEND="noninteractive"
RUN apt-get update -qy && apt-get install -qy curl
# Ensure that the Library's certificate authority is trusted so the tampering
# proxy will not break TLS validation. See
# https://staff.loc.gov/wikis/display/SE/Configuring+HTTPS+clients+for+the+HTTPS+tampering+proxy.
RUN curl -fso /etc/ssl/certs/LOC-ROOT-CA-1.crt http://crl.loc.gov/LOC-ROOT-CA-1.crt && openssl x509 -inform der -in /etc/ssl/certs/LOC-ROOT-CA-1.crt -outform pem -out /etc/ssl/certs/LOC-ROOT-CA-1.pem && c_rehash
RUN apt-get update -qy && apt-get dist-upgrade -qy && apt-get install -o Dpkg::Options::='--force-confnew' -qy \
git \
libmemcached-dev \
# Pillow/Imaging: https://pillow.readthedocs.io/en/latest/installation.html#external-libraries
libz-dev libfreetype6-dev \
libtiff-dev libjpeg-dev libopenjp2-7-dev libwebp-dev zlib1g-dev \
# Postgres client library to build psycopg
libpq-dev \
locales \
# Weasyprint requirements
libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 \
gcc && apt-get -qy autoremove && apt-get -qy autoclean
RUN locale-gen en_US.UTF-8
ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
ENV PYTHONUNBUFFERED=1 \
PYTHONPATH=/app
ENV DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE:-concordia.settings_docker}
RUN pip install --upgrade pip
RUN pip install --no-cache-dir pipenv
WORKDIR /app
COPY . /app
RUN pipenv install --system --dev --deploy && rm -rf ~/.cache/
CMD /wait && ./importer/entrypoint.sh
================================================
FILE: importer/README.md
================================================
# Importer
This is a Django app which uses celery to download images from a
collection on loc.gov. It also uploads those images to an S3 bucket.
## Prerequisites
1. If uploading to S3 bucket, AWS S3 bucket created and your environment is configured for the awscli tool
1. If running in dev mode, HTTP access to tile-dev.loc.gov and dev.loc.gov
## Usage
1. Start the Python shell:
```bash
$ docker-compose up
$ docker exec -it concordia_importer_1 bash
root@62e3ebef4de2:/app# python3 ./manage.py shell
```
1. Run some test imports:
```Python console
Python 3.6.5rc1 (default, Mar 14 2018, 06:54:23) [GCC 7.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from importer.importer.tasks import download_async_campaign, check_completeness
>>> result = download_async_campaign.delay("https://www.loc.gov/collections/clara-barton-papers/?fa=partof:clara+barton+papers:++diaries+and+journals,+1849-1911")
>>> result.ready()
>>> result.get()
>>> result2 = check_completeness.delay()
>>> result2.ready()
>>> result2.get()
```
To count the files and check disk usage in `/concordia_images` after download is
complete:
```console
$ docker exec -it concordia_app_1 bash
$ find /concordia_images -type f | wc -l
$ df -kh
```
## Integration
After the images have been downloaded in the docker environment:
1. Copy the images from the docker volume to the running docker app container.
```bash
$ ubuntu@ip-172-31-94-65:~/concordia$ sudo docker exec -it concordia_app_1 bash
$ root@6eca4f3cd16d:/app# cp -R /concordia_images/mss* concordia/static/img/
```
1. Run the migrations in the docker app to load Clara Barton Diaries and Branch
Rickey collections to concordia.
```bash
$ root@6eca4f3cd16d:/app# python3 ./manage.py migrate
```
================================================
FILE: importer/__init__.py
================================================
"""
Design
======
The importer currently only supports loading items from www.loc.gov
General goals:
* All state is stored in the database and visible for reporting
* Celery tasks are ephemeral and while they may be configured to retry they will
always check the database to avoid conflicts and use transactions to prevent
race conditions
The import process works like this:
1. A user submits a request to import a URL. This can be an item page, a
collection page, or an arbitrary search result set.
2. An ImportJob is created which records that request and a background Celery
task is launched to determine what items it contains (this can potentially be
well into the thousands)
3. For collection and search URLs (which share a common data format) the task
loads the JSON representation and queues item import tasks for each item. For
item URLs, the item import task is directly queued.
4. When the item import task runs it creates an ImportItem record, loads the
item metadata, and creates ImportItem and ImportItemAsset records to track
subsequent import work. It creates the Item and Asset records which will hold
the actual item data as well because this allows review while a large import
is in progress and our community managers quality review items before making
them visible to the community. The asset import tasks are queued at the end
of this step.
5. When the asset import task runs, it downloads the remote file and saves it in
Concordia's working storage. Each asset is processed independently so
completed downloads will not consume local storage until the [potentially
very large] item has completely downloaded, which could potentially take
hours or days if there are service availability issues requiring retries.
6. When all of the asset tasks are completed the item will be marked as
completed.
7. When all of the item tasks are completed the job will be marked as completed.
"""
================================================
FILE: importer/admin.py
================================================
from django.contrib import admin, messages
from django.contrib.humanize.templatetags.humanize import naturaltime
from django.db.models import Count, F, Max, Q, QuerySet
from django.http import HttpRequest
from django.utils.translation import gettext_lazy as _
from concordia.admin.filters import (
CampaignListFilter,
CampaignProjectListFilter,
NullableTimestampFilter,
)
from concordia.models import Campaign
from importer.tasks.assets import download_asset_task
from .models import (
DownloadAssetImageJob,
ImportItem,
ImportItemAsset,
ImportJob,
VerifyAssetImageJob,
)
@admin.action(description="Retry import")
def retry_download_task(
modeladmin: admin.ModelAdmin,
request: HttpRequest,
queryset: QuerySet[ImportItemAsset],
) -> None:
"""
Queue the asset download Celery task again for selected rows.
Args:
modeladmin (admin.ModelAdmin): Admin class invoking the action.
request (HttpRequest): Current admin request.
queryset (QuerySet[ImportItemAsset]): Selected ImportItemAsset rows.
Returns:
None
"""
pks = queryset.values_list("pk", flat=True)
for pk in pks:
download_asset_task.delay(pk)
messages.add_message(request, messages.INFO, "Queued %d tasks" % len(pks))
class LastStartedFilter(NullableTimestampFilter):
"""Filter by whether a task has a 'last_started' timestamp."""
title = "Last Started"
parameter_name = "last_started"
lookup_labels = ("Unstarted", "Started")
class CompletedFilter(NullableTimestampFilter):
"""Filter by whether a task has a 'completed' timestamp."""
title = "Completed"
parameter_name = "completed"
lookup_labels = ("Incomplete", "Completed")
class FailedFilter(NullableTimestampFilter):
"""Filter by whether a task has a 'failed' timestamp."""
title = "Failed"
parameter_name = "failed"
lookup_labels = ("Has not failed", "Has failed")
class ImportJobProjectListFilter(CampaignProjectListFilter):
"""Project filter for ImportJob rows."""
parameter_name = "project__in"
related_filter_parameter = "project__campaign__id__exact"
project_ref = "project_id"
class ImportJobItemProjectListFilter(CampaignProjectListFilter):
"""Project filter for ImportItem rows (via job)."""
parameter_name = "job__project__in"
related_filter_parameter = "job__project__campaign__id__exact"
project_ref = "job__project_id"
class ImportJobAssetProjectListFilter(CampaignProjectListFilter):
"""Project filter for ImportItemAsset rows (via job)."""
parameter_name = "import_item__job__project__in"
related_filter_parameter = "import_item__job__project__campaign__id__exact"
project_ref = "import_item__job__project_id"
class ImportCampaignListFilter(CampaignListFilter):
"""Campaign filter that excludes retired campaigns."""
def lookups(
self,
request: HttpRequest,
model_admin: admin.ModelAdmin,
) -> list[tuple[int | str, str]]:
"""
Provide (id, title) choices for non-retired campaigns.
Args:
request (HttpRequest): Current admin request.
model_admin (admin.ModelAdmin): Admin class in use.
Returns:
list[tuple[int | str, str]]: Campaign id/title pairs.
"""
queryset = Campaign.objects.exclude(status=Campaign.Status.RETIRED)
return list(queryset.values_list("id", "title").order_by("title"))
class ImportJobCampaignListFilter(ImportCampaignListFilter):
"""Campaign filter for ImportJob rows."""
parameter_name = "project__campaign"
status_filter_parameter = "project__campaign__status"
class ImportItemCampaignListFilter(ImportCampaignListFilter):
"""Campaign filter for ImportItem rows (via job)."""
parameter_name = "job__project__campaign"
status_filter_parameter = "job__project__campaign__status"
class ImportItemAssetCampaignListFilter(ImportCampaignListFilter):
"""Campaign filter for ImportItemAsset rows (via job)."""
parameter_name = "import_item__job__project__campaign"
status_filter_parameter = "import_item__job__project__campaign__status"
class BatchFilter(admin.SimpleListFilter):
"""Compact batch filter showing recent/incomplete and last complete batches."""
title = _("Batch")
parameter_name = "batch"
def lookups(
self,
request: HttpRequest,
model_admin: admin.ModelAdmin,
) -> list[tuple[str, str]]:
"""
Show up to five batches with incomplete jobs, plus the currently filtered
batch, and the most recent fully complete batch. Fill with more completed
batches if there are fewer than five batches shown.
Args:
request (HttpRequest): Current admin request.
model_admin (admin.ModelAdmin): Admin class in use.
Returns:
list[tuple[str, str]]: (value, label) pairs for batch selection.
"""
queryset = model_admin.get_queryset(request)
# Get up to 5 batches with incomplete jobs
incomplete_batches = (
queryset.filter(completed__isnull=True)
.exclude(batch__isnull=True)
.values("batch")
.annotate(latest_created=Max("created"))
.order_by("-latest_created")[:5]
)
batch_choices = {str(batch["batch"]) for batch in incomplete_batches}
# Ensure the currently filtered batch is included
current_batch = self.value()
if current_batch:
batch_choices.add(current_batch)
# Fetch the most recent fully completed batch
most_recent_complete_batch = (
queryset.filter(batch__isnull=False)
.values("batch")
.annotate(
latest_created=Max("created"),
total_jobs=Count("id"),
completed_jobs=Count("id", filter=Q(completed__isnull=False)),
)
.filter(total_jobs=F("completed_jobs")) # Only fully completed batches
.order_by("-latest_created")
.first()
)
if most_recent_complete_batch:
batch_choices.add(str(most_recent_complete_batch["batch"]))
# If we still have fewer than 5, add more completed batches
if len(batch_choices) < 5:
additional_complete_batches = (
queryset.filter(~Q(batch__in=batch_choices), batch__isnull=False)
.values("batch")
.annotate(
latest_created=Max("created"),
total_jobs=Count("id"),
completed_jobs=Count("id", filter=Q(completed__isnull=False)),
)
.filter(total_jobs=F("completed_jobs")) # Only fully completed batches
.order_by("-latest_created")
)
for batch in additional_complete_batches:
if len(batch_choices) >= 5:
break
batch_choices.add(str(batch["batch"]))
return [(batch, batch[:12] + "...") for batch in batch_choices]
def queryset(
self,
request: HttpRequest,
queryset: QuerySet,
) -> QuerySet:
"""
Filter the queryset to a specific batch when a value is selected.
Args:
request (HttpRequest): Current admin request.
queryset (QuerySet): Base queryset for the changelist.
Returns:
QuerySet: Filtered queryset limited to the chosen batch.
"""
batch_value = self.value()
if batch_value:
return queryset.filter(batch=batch_value)
return queryset
class TaskStatusModelAdmin(admin.ModelAdmin):
"""
Base ModelAdmin for task-like models with standard readonly fields.
Also adds human-friendly timestamp display properties (e.g., "3 minutes
ago") for common lifecycle fields.
"""
readonly_fields = (
"created",
"modified",
"last_started",
"completed",
"failed",
"status",
"task_id",
"failure_reason",
"retry_count",
"failure_history",
"status_history",
)
@staticmethod
def generate_natural_timestamp_display_property(field_name: str):
"""
Build a `naturaltime` display function for a timestamp field.
The returned function is suitable for inclusion in `list_display`.
It sets `short_description` and `admin_order_field` to match the
provided field.
Args:
field_name (str): Name of the timestamp field on the model.
Returns:
callable: A function that takes an object and returns a
human-readable string (or `None` when unset).
"""
def inner(obj):
try:
value = getattr(obj, field_name)
except AttributeError:
return None
if value:
return naturaltime(value)
else:
return value
inner.short_description = field_name.replace("_", " ").title()
inner.admin_order_field = field_name
return inner
def __init__(self, *args, **kwargs):
"""
Initialize and attach dynamic display_* timestamp helpers.
For each known timestamp field, a `display_` method is created
that renders a human-friendly relative time and can be used in
`list_display`.
"""
for field_name in (
"created",
"modified",
"last_started",
"completed",
"failed",
):
setattr(
self,
f"display_{field_name}",
self.generate_natural_timestamp_display_property(field_name),
)
super().__init__(*args, **kwargs)
@admin.register(ImportJob)
class ImportJobAdmin(TaskStatusModelAdmin):
"""Admin configuration for `ImportJob`."""
readonly_fields = TaskStatusModelAdmin.readonly_fields + (
"project",
"created_by",
"url",
)
list_display = (
"display_created",
"display_modified",
"display_last_started",
"display_completed",
"url",
"status",
)
list_filter = (
LastStartedFilter,
CompletedFilter,
FailedFilter,
("created_by", admin.RelatedOnlyFieldListFilter),
ImportJobCampaignListFilter,
ImportJobProjectListFilter,
)
search_fields = ("url", "status")
@admin.register(ImportItem)
class ImportItemAdmin(TaskStatusModelAdmin):
"""Admin configuration for `ImportItem`."""
readonly_fields = TaskStatusModelAdmin.readonly_fields + ("job", "item")
list_display = (
"display_created",
"display_modified",
"display_last_started",
"display_completed",
"url",
"status",
)
list_filter = (
LastStartedFilter,
CompletedFilter,
FailedFilter,
("job__created_by", admin.RelatedOnlyFieldListFilter),
ImportItemCampaignListFilter,
ImportJobItemProjectListFilter,
)
search_fields = ("url", "status")
@admin.register(ImportItemAsset)
class ImportItemAssetAdmin(TaskStatusModelAdmin):
"""Admin configuration for `ImportItemAsset`."""
readonly_fields = TaskStatusModelAdmin.readonly_fields + (
"import_item",
"asset",
"sequence_number",
)
list_display = (
"display_created",
"display_last_started",
"display_completed",
"url",
"failure_reason",
"status",
)
list_filter = (
LastStartedFilter,
CompletedFilter,
FailedFilter,
"failure_reason",
("import_item__job__created_by", admin.RelatedOnlyFieldListFilter),
ImportItemAssetCampaignListFilter,
ImportJobAssetProjectListFilter,
)
search_fields = ("url", "status")
actions = (retry_download_task,)
@admin.register(VerifyAssetImageJob)
class VerifyAssetImageJobAdmin(TaskStatusModelAdmin):
"""Admin configuration for `VerifyAssetImageJob`."""
readonly_fields = TaskStatusModelAdmin.readonly_fields + ("asset", "batch")
list_display = (
"display_created",
"display_last_started",
"asset",
"batch",
"failure_reason",
"status",
)
list_filter = (
LastStartedFilter,
CompletedFilter,
FailedFilter,
"failure_reason",
BatchFilter,
)
search_fields = ("status",)
@admin.register(DownloadAssetImageJob)
class DownloadAssetImageJobAdmin(TaskStatusModelAdmin):
"""Admin configuration for `DownloadAssetImageJob`."""
readonly_fields = TaskStatusModelAdmin.readonly_fields + ("asset", "batch")
list_display = (
"display_created",
"display_last_started",
"asset",
"batch",
"failure_reason",
"status",
)
list_filter = (
LastStartedFilter,
CompletedFilter,
FailedFilter,
"failure_reason",
BatchFilter,
)
search_fields = ("status",)
================================================
FILE: importer/apps.py
================================================
from django.apps import AppConfig
class ImporterAppConfig(AppConfig):
name = "importer"
================================================
FILE: importer/celery.py
================================================
import importlib
import pkgutil
from celery import Celery
app = Celery("importer")
# Using a string here means the worker doesn't have to serialize
# the configuration object to child processes.
# - namespace='CELERY' means all celery-related configuration keys
# should have a `CELERY_` prefix.
app.config_from_object("django.conf:settings", namespace="CELERY")
# Load task modules from all registered Django app configs.
app.autodiscover_tasks()
def import_all_submodules(package_name: str):
"""
Import a package and recursively import all submodules.
Used sparingly at Celery startup to ensure all task modules are loaded.
"""
pkg = importlib.import_module(package_name)
if not hasattr(pkg, "__path__"):
return
for mod in pkgutil.walk_packages(pkg.__path__, pkg.__name__ + "."):
importlib.import_module(mod.name)
# Import all task modules under these packages
# We do this because celery autodiscovery won't
# find anything not in tasks.py or tasks/__init__.py
# We need to defer this until after Django is fully loaded
@app.on_after_finalize.connect
def _load_all_task_modules(sender, **kwargs):
import_all_submodules("concordia.tasks")
import_all_submodules("importer.tasks")
================================================
FILE: importer/config.py
================================================
================================================
FILE: importer/entrypoint.sh
================================================
#!/bin/bash
set -e -u # Exit immediately for unhandled errors or undefined variables
mkdir -p /app/logs
touch /app/logs/concordia.log
# To avoid trace and reporting of errors in the X-Ray SDK
export AWS_XRAY_CONTEXT_MISSING=LOG_ERROR
echo "Running celery worker"
celery -A concordia worker -l info -c 10
================================================
FILE: importer/exceptions.py
================================================
class ImageImportFailure(Exception):
"""
Raised when an image import operation fails.
This exception signals a failure while importing or downloading an asset
image. Callers should include a concise human-readable reason in the
exception message to aid in debugging and logging.
"""
pass
================================================
FILE: importer/migrations/0001_initial.py
================================================
# Generated by Django 2.0.7 on 2018-07-09 08:02
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = []
operations = [
migrations.CreateModel(
name="CampaignItemAssetCount",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("campaign_slug", models.SlugField()),
("campaign_item_identifier", models.CharField(max_length=50)),
("campaign_item_asset_count", models.IntegerField()),
],
),
migrations.CreateModel(
name="CampaignTaskDetails",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("campaign_name", models.CharField(max_length=50)),
("campaign_slug", models.SlugField(unique=True)),
("campaign_page_count", models.IntegerField()),
("campaign_item_count", models.IntegerField()),
("campaign_asset_count", models.IntegerField()),
("campaign_task_id", models.CharField(max_length=100)),
],
),
]
================================================
FILE: importer/migrations/0001_squashed_0015_auto_20180925_1851.py
================================================
# Generated by Django 2.0.9 on 2018-10-04 15:00
import django.core.validators
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
replaces = [
("importer", "0001_initial"),
("importer", "0002_auto_20180709_0833"),
("importer", "0003_auto_20180709_0933"),
("importer", "0004_auto_20180812_1007"),
("importer", "0005_auto_20180816_1702"),
("importer", "0006_auto_20180912_0229"),
("importer", "0007_auto_20180917_1654"),
("importer", "0008_campaigntaskdetails_project"),
("importer", "0009_convert_project_text_to_keys"),
("importer", "0010_auto_20180920_2013"),
("importer", "0011_auto_20180922_0208"),
("importer", "0012_auto_20180923_0231"),
("importer", "0013_auto_20180924_1318"),
("importer", "0014_auto_20180924_1943"),
("importer", "0015_auto_20180925_1851"),
]
initial = True
dependencies = [
("concordia", "0019_auto_20180920_1503"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
("concordia", "0021_auto_20180922_0202"),
("concordia", "0024_auto_20180924_1529"),
]
operations = [
migrations.CreateModel(
name="ImportItem",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
(
"last_started",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Last time when a worker started processing this job", # NOQA
),
),
(
"completed",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Time when the job completed processing",
),
),
(
"failed",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Time when the job failed and will not be restarted", # NOQA
),
),
(
"status",
models.TextField(
blank=True,
null=True,
verbose_name="Status message, if any, from the last worker",
),
),
(
"task_id",
models.UUIDField(
blank=True,
null=True,
verbose_name="UUID of the last Celery task to process this record", # NOQA
),
),
("url", models.URLField()),
(
"item",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE, to="concordia.Item"
),
),
],
options={"abstract": False},
),
migrations.CreateModel(
name="ImportItemAsset",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
(
"last_started",
models.DateTimeField(
blank=True,
help_text="Last time when a worker started processing this job",
null=True,
),
),
(
"completed",
models.DateTimeField(
blank=True,
help_text="Time when the job completed without error",
null=True,
),
),
(
"failed",
models.DateTimeField(
blank=True,
help_text="Time when the job failed due to an error",
null=True,
),
),
(
"status",
models.TextField(
blank=True,
default="",
help_text="Status message, if any, from the last worker",
),
),
(
"task_id",
models.UUIDField(
blank=True,
help_text="UUID of the last Celery task to process this record",
null=True,
),
),
("url", models.URLField()),
(
"sequence_number",
models.PositiveIntegerField(
validators=[django.core.validators.MinValueValidator(1)]
),
),
(
"asset",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="concordia.Asset",
),
),
(
"import_item",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="assets",
to="importer.ImportItem",
),
),
],
options={"abstract": False},
),
migrations.CreateModel(
name="ImportJob",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
(
"last_started",
models.DateTimeField(
blank=True,
help_text="Last time when a worker started processing this job",
null=True,
),
),
(
"completed",
models.DateTimeField(
blank=True,
help_text="Time when the job completed without error",
null=True,
),
),
(
"failed",
models.DateTimeField(
blank=True,
help_text="Time when the job failed due to an error",
null=True,
),
),
(
"status",
models.TextField(
blank=True,
default="",
help_text="Status message, if any, from the last worker",
),
),
(
"task_id",
models.UUIDField(
blank=True,
help_text="UUID of the last Celery task to process this record",
null=True,
),
),
("url", models.URLField(verbose_name="Source URL for the entire job")),
(
"created_by",
models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
),
),
(
"project",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="concordia.Project",
),
),
],
options={"abstract": False},
),
migrations.AddField(
model_name="importitem",
name="job",
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="items",
to="importer.ImportJob",
),
),
migrations.AlterField(
model_name="importitem",
name="status",
field=models.TextField(
blank=True,
default="",
verbose_name="Status message, if any, from the last worker",
),
),
migrations.AlterField(
model_name="importitem",
name="completed",
field=models.DateTimeField(
blank=True,
help_text="Time when the job completed without error",
null=True,
),
),
migrations.AlterField(
model_name="importitem",
name="failed",
field=models.DateTimeField(
blank=True,
help_text="Time when the job failed due to an error",
null=True,
),
),
migrations.AlterField(
model_name="importitem",
name="last_started",
field=models.DateTimeField(
blank=True,
help_text="Last time when a worker started processing this job",
null=True,
),
),
migrations.AlterField(
model_name="importitem",
name="status",
field=models.TextField(
blank=True,
default="",
help_text="Status message, if any, from the last worker",
),
),
migrations.AlterField(
model_name="importitem",
name="task_id",
field=models.UUIDField(
blank=True,
help_text="UUID of the last Celery task to process this record",
null=True,
),
),
migrations.AlterUniqueTogether(
name="importitem", unique_together={("job", "item")}
),
migrations.AlterUniqueTogether(
name="importitemasset",
unique_together={
("import_item", "asset"),
("import_item", "sequence_number"),
},
),
]
================================================
FILE: importer/migrations/0002_auto_20180709_0833.py
================================================
# Generated by Django 2.0.7 on 2018-07-09 08:33
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0001_initial")]
operations = [
migrations.AlterField(
model_name="campaignitemassetcount",
name="campaign_item_asset_count",
field=models.IntegerField(blank=True, null=True),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_asset_count",
field=models.IntegerField(blank=True, null=True),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_item_count",
field=models.IntegerField(blank=True, null=True),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_page_count",
field=models.IntegerField(blank=True, null=True),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_task_id",
field=models.CharField(blank=True, max_length=100, null=True),
),
]
================================================
FILE: importer/migrations/0003_auto_20180709_0933.py
================================================
# Generated by Django 2.0.7 on 2018-07-09 09:33
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0002_auto_20180709_0833")]
operations = [
migrations.AlterField(
model_name="campaignitemassetcount",
name="campaign_item_asset_count",
field=models.IntegerField(blank=True, default=0, null=True),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_asset_count",
field=models.IntegerField(blank=True, default=0, null=True),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_item_count",
field=models.IntegerField(blank=True, default=0, null=True),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_page_count",
field=models.IntegerField(blank=True, default=0, null=True),
),
]
================================================
FILE: importer/migrations/0004_auto_20180812_1007.py
================================================
# Generated by Django 2.0.8 on 2018-08-12 10:07
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0003_auto_20180709_0933")]
operations = [
migrations.RemoveField(
model_name="campaignitemassetcount", name="campaign_slug"
),
migrations.AddField(
model_name="campaignitemassetcount",
name="campaign_task",
field=models.ForeignKey(
default=1,
on_delete=django.db.models.deletion.CASCADE,
to="importer.CampaignTaskDetails",
),
preserve_default=False,
),
migrations.AddField(
model_name="campaignitemassetcount",
name="item_task_id",
field=models.CharField(blank=True, max_length=100, null=True),
),
migrations.AddField(
model_name="campaigntaskdetails",
name="project_name",
field=models.CharField(default=1, max_length=250),
preserve_default=False,
),
migrations.AddField(
model_name="campaigntaskdetails",
name="project_slug",
field=models.SlugField(default=1, max_length=250, unique=True),
preserve_default=False,
),
migrations.RemoveField(
model_name="campaigntaskdetails", name="campaign_page_count"
),
migrations.AlterUniqueTogether(
name="campaigntaskdetails",
unique_together={("campaign_slug", "project_slug")},
),
]
================================================
FILE: importer/migrations/0005_auto_20180816_1702.py
================================================
# Generated by Django 2.0.8 on 2018-08-16 17:02
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0004_auto_20180812_1007")]
operations = [
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_slug",
field=models.SlugField(),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="project_slug",
field=models.SlugField(max_length=250),
),
]
================================================
FILE: importer/migrations/0006_auto_20180912_0229.py
================================================
# Generated by Django 2.0.8 on 2018-09-12 02:29
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0005_auto_20180816_1702")]
operations = [
migrations.AlterField(
model_name="campaignitemassetcount",
name="campaign_item_identifier",
field=models.CharField(max_length=500),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_name",
field=models.CharField(max_length=500),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_slug",
field=models.SlugField(max_length=500),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="project_name",
field=models.CharField(max_length=500),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="project_slug",
field=models.SlugField(max_length=500),
),
]
================================================
FILE: importer/migrations/0007_auto_20180917_1654.py
================================================
# Generated by Django 2.0.8 on 2018-09-17 16:54
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0006_auto_20180912_0229")]
operations = [
migrations.AlterField(
model_name="campaignitemassetcount",
name="campaign_item_identifier",
field=models.CharField(max_length=80),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_name",
field=models.CharField(max_length=80),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="campaign_slug",
field=models.SlugField(max_length=80),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="project_name",
field=models.CharField(max_length=250),
),
migrations.AlterField(
model_name="campaigntaskdetails",
name="project_slug",
field=models.SlugField(max_length=250),
),
]
================================================
FILE: importer/migrations/0008_campaigntaskdetails_project.py
================================================
# Generated by Django 2.0.8 on 2018-09-20 20:05
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0007_auto_20180917_1654")]
operations = [
migrations.AddField(
model_name="campaigntaskdetails",
name="project",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.CASCADE,
to="concordia.Project",
),
)
]
================================================
FILE: importer/migrations/0009_convert_project_text_to_keys.py
================================================
# Generated by Django 2.0.8 on 2018-09-20 20:06
import logging
from django.db import migrations
def convert_slugs_to_references(apps, schema_editor):
Project = apps.get_model("concordia", "Project")
CampaignTaskDetails = apps.get_model("importer", "CampaignTaskDetails")
for ctd in CampaignTaskDetails.objects.all():
try:
ctd.project = Project.objects.get(
slug=ctd.project_slug, campaign__slug=ctd.campaign_slug
)
ctd.save()
except Project.DoesNotExist:
logging.error("%s references a non-existent project! Deleting it!", ctd)
ctd.delete()
class Migration(migrations.Migration):
dependencies = [("importer", "0008_campaigntaskdetails_project")]
operations = [migrations.RunPython(convert_slugs_to_references)]
================================================
FILE: importer/migrations/0010_auto_20180920_2013.py
================================================
# Generated by Django 2.0.8 on 2018-09-20 20:13
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0009_convert_project_text_to_keys")]
operations = [
migrations.AlterField(
model_name="campaigntaskdetails",
name="project",
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE, to="concordia.Project"
),
),
migrations.AlterUniqueTogether(
name="campaigntaskdetails", unique_together=set()
),
migrations.RemoveField(model_name="campaigntaskdetails", name="campaign_name"),
migrations.RemoveField(model_name="campaigntaskdetails", name="campaign_slug"),
migrations.RemoveField(model_name="campaigntaskdetails", name="project_name"),
migrations.RemoveField(model_name="campaigntaskdetails", name="project_slug"),
]
================================================
FILE: importer/migrations/0011_auto_20180922_0208.py
================================================
# Generated by Django 2.0.8 on 2018-09-22 02:08
import django.core.validators
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("concordia", "0021_auto_20180922_0202"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
("importer", "0010_auto_20180920_2013"),
]
operations = [
migrations.CreateModel(
name="ImportItem",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
(
"last_started",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Last time when a worker started processing this job", # NOQA
),
),
(
"completed",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Time when the job completed processing",
),
),
(
"failed",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Time when the job failed and will not be restarted", # NOQA
),
),
(
"status",
models.TextField(
blank=True,
null=True,
verbose_name="Status message, if any, from the last worker",
),
),
(
"task_id",
models.UUIDField(
blank=True,
null=True,
verbose_name="UUID of the last Celery task to process this record", # NOQA
),
),
("url", models.URLField()),
(
"item",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE, to="concordia.Item"
),
),
],
options={"abstract": False},
),
migrations.CreateModel(
name="ImportItemAsset",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
(
"last_started",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Last time when a worker started processing this job", # NOQA
),
),
(
"completed",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Time when the job completed processing",
),
),
(
"failed",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Time when the job failed and will not be restarted", # NOQA
),
),
(
"status",
models.TextField(
blank=True,
null=True,
verbose_name="Status message, if any, from the last worker",
),
),
(
"task_id",
models.UUIDField(
blank=True,
null=True,
verbose_name="UUID of the last Celery task to process this record", # NOQA
),
),
("url", models.URLField()),
(
"sequence_number",
models.PositiveIntegerField(
validators=[django.core.validators.MinValueValidator(1)]
),
),
(
"asset",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="concordia.Asset",
),
),
(
"import_item",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="assets",
to="importer.ImportItem",
),
),
],
options={"abstract": False},
),
migrations.CreateModel(
name="ImportJob",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
(
"last_started",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Last time when a worker started processing this job", # NOQA
),
),
(
"completed",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Time when the job completed processing",
),
),
(
"failed",
models.DateTimeField(
blank=True,
null=True,
verbose_name="Time when the job failed and will not be restarted", # NOQA
),
),
(
"status",
models.TextField(
blank=True,
null=True,
verbose_name="Status message, if any, from the last worker",
),
),
(
"task_id",
models.UUIDField(
blank=True,
null=True,
verbose_name="UUID of the last Celery task to process this record", # NOQA
),
),
(
"source_url",
models.URLField(verbose_name="Source URL for the entire job"),
),
(
"created_by",
models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
),
),
(
"project",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="concordia.Project",
),
),
],
options={"abstract": False},
),
migrations.RemoveField(
model_name="campaignitemassetcount", name="campaign_task"
),
migrations.RemoveField(model_name="campaigntaskdetails", name="project"),
migrations.DeleteModel(name="CampaignItemAssetCount"),
migrations.DeleteModel(name="CampaignTaskDetails"),
migrations.AddField(
model_name="importitem",
name="job",
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="items",
to="importer.ImportJob",
),
),
]
================================================
FILE: importer/migrations/0012_auto_20180923_0231.py
================================================
# Generated by Django 2.0.8 on 2018-09-23 02:31
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0011_auto_20180922_0208")]
operations = [
migrations.AlterField(
model_name="importitem",
name="status",
field=models.TextField(
blank=True,
default="",
verbose_name="Status message, if any, from the last worker",
),
),
migrations.AlterField(
model_name="importitemasset",
name="status",
field=models.TextField(
blank=True,
default="",
verbose_name="Status message, if any, from the last worker",
),
),
migrations.AlterField(
model_name="importjob",
name="status",
field=models.TextField(
blank=True,
default="",
verbose_name="Status message, if any, from the last worker",
),
),
]
================================================
FILE: importer/migrations/0013_auto_20180924_1318.py
================================================
# Generated by Django 2.0.8 on 2018-09-24 13:18
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [("importer", "0012_auto_20180923_0231")]
operations = [
migrations.RenameField(
model_name="importjob", old_name="source_url", new_name="url"
)
]
================================================
FILE: importer/migrations/0014_auto_20180924_1943.py
================================================
# Generated by Django 2.0.8 on 2018-09-24 19:43
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("concordia", "0024_auto_20180924_1529"),
("importer", "0013_auto_20180924_1318"),
]
operations = [
migrations.AlterUniqueTogether(
name="importitem", unique_together={("job", "item")}
),
migrations.AlterUniqueTogether(
name="importitemasset",
unique_together={
("import_item", "sequence_number"),
("import_item", "asset"),
},
),
]
================================================
FILE: importer/migrations/0015_auto_20180925_1851.py
================================================
# Generated by Django 2.0.8 on 2018-09-25 18:51
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("importer", "0014_auto_20180924_1943")]
operations = [
migrations.AlterField(
model_name="importitem",
name="completed",
field=models.DateTimeField(
blank=True,
help_text="Time when the job completed without error",
null=True,
),
),
migrations.AlterField(
model_name="importitem",
name="failed",
field=models.DateTimeField(
blank=True,
help_text="Time when the job failed due to an error",
null=True,
),
),
migrations.AlterField(
model_name="importitem",
name="last_started",
field=models.DateTimeField(
blank=True,
help_text="Last time when a worker started processing this job",
null=True,
),
),
migrations.AlterField(
model_name="importitem",
name="status",
field=models.TextField(
blank=True,
default="",
help_text="Status message, if any, from the last worker",
),
),
migrations.AlterField(
model_name="importitem",
name="task_id",
field=models.UUIDField(
blank=True,
help_text="UUID of the last Celery task to process this record",
null=True,
),
),
migrations.AlterField(
model_name="importitemasset",
name="completed",
field=models.DateTimeField(
blank=True,
help_text="Time when the job completed without error",
null=True,
),
),
migrations.AlterField(
model_name="importitemasset",
name="failed",
field=models.DateTimeField(
blank=True,
help_text="Time when the job failed due to an error",
null=True,
),
),
migrations.AlterField(
model_name="importitemasset",
name="last_started",
field=models.DateTimeField(
blank=True,
help_text="Last time when a worker started processing this job",
null=True,
),
),
migrations.AlterField(
model_name="importitemasset",
name="status",
field=models.TextField(
blank=True,
default="",
help_text="Status message, if any, from the last worker",
),
),
migrations.AlterField(
model_name="importitemasset",
name="task_id",
field=models.UUIDField(
blank=True,
help_text="UUID of the last Celery task to process this record",
null=True,
),
),
migrations.AlterField(
model_name="importjob",
name="completed",
field=models.DateTimeField(
blank=True,
help_text="Time when the job completed without error",
null=True,
),
),
migrations.AlterField(
model_name="importjob",
name="failed",
field=models.DateTimeField(
blank=True,
help_text="Time when the job failed due to an error",
null=True,
),
),
migrations.AlterField(
model_name="importjob",
name="last_started",
field=models.DateTimeField(
blank=True,
help_text="Last time when a worker started processing this job",
null=True,
),
),
migrations.AlterField(
model_name="importjob",
name="status",
field=models.TextField(
blank=True,
default="",
help_text="Status message, if any, from the last worker",
),
),
migrations.AlterField(
model_name="importjob",
name="task_id",
field=models.UUIDField(
blank=True,
help_text="UUID of the last Celery task to process this record",
null=True,
),
),
]
================================================
FILE: importer/migrations/0016_importitem_failure_reason_and_more.py
================================================
# Generated by Django 4.2.16 on 2025-02-20 16:30
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("importer", "0001_squashed_0015_auto_20180925_1851"),
]
operations = [
migrations.AddField(
model_name="importitem",
name="failure_reason",
field=models.CharField(
blank=True, choices=[("Image", "Image")], default="", max_length=50
),
),
migrations.AddField(
model_name="importitemasset",
name="failure_reason",
field=models.CharField(
blank=True, choices=[("Image", "Image")], default="", max_length=50
),
),
migrations.AddField(
model_name="importjob",
name="failure_reason",
field=models.CharField(
blank=True, choices=[("Image", "Image")], default="", max_length=50
),
),
]
================================================
FILE: importer/migrations/0017_importitem_failure_history_importitem_retry_count_and_more.py
================================================
# Generated by Django 4.2.16 on 2025-03-03 20:49
import django.core.serializers.json
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("importer", "0016_importitem_failure_reason_and_more"),
]
operations = [
migrations.AddField(
model_name="importitem",
name="failure_history",
field=models.JSONField(
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Information about previous failures of the task, if any",
),
),
migrations.AddField(
model_name="importitem",
name="retry_count",
field=models.IntegerField(
default=0, help_text="Number of times the task was retried"
),
),
migrations.AddField(
model_name="importitemasset",
name="failure_history",
field=models.JSONField(
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Information about previous failures of the task, if any",
),
),
migrations.AddField(
model_name="importitemasset",
name="retry_count",
field=models.IntegerField(
default=0, help_text="Number of times the task was retried"
),
),
migrations.AddField(
model_name="importjob",
name="failure_history",
field=models.JSONField(
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Information about previous failures of the task, if any",
),
),
migrations.AddField(
model_name="importjob",
name="retry_count",
field=models.IntegerField(
default=0, help_text="Number of times the task was retried"
),
),
migrations.AlterField(
model_name="importitem",
name="failure_reason",
field=models.CharField(
blank=True,
choices=[("Image", "Image"), ("Retries", "Retries")],
default="",
help_text="Reason the task failed, if one was provided",
max_length=50,
),
),
migrations.AlterField(
model_name="importitemasset",
name="failure_reason",
field=models.CharField(
blank=True,
choices=[("Image", "Image"), ("Retries", "Retries")],
default="",
help_text="Reason the task failed, if one was provided",
max_length=50,
),
),
migrations.AlterField(
model_name="importjob",
name="failure_reason",
field=models.CharField(
blank=True,
choices=[("Image", "Image"), ("Retries", "Retries")],
default="",
help_text="Reason the task failed, if one was provided",
max_length=50,
),
),
]
================================================
FILE: importer/migrations/0018_importitem_status_history_and_more.py
================================================
# Generated by Django 4.2.16 on 2025-03-06 16:04
import django.core.serializers.json
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("concordia", "0103_alter_item_title"),
("importer", "0017_importitem_failure_history_importitem_retry_count_and_more"),
]
operations = [
migrations.AddField(
model_name="importitem",
name="status_history",
field=models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Previous statuses on the task, if any",
),
),
migrations.AddField(
model_name="importitemasset",
name="status_history",
field=models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Previous statuses on the task, if any",
),
),
migrations.AddField(
model_name="importjob",
name="status_history",
field=models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Previous statuses on the task, if any",
),
),
migrations.AlterField(
model_name="importitem",
name="failure_history",
field=models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Information about previous failures of the task, if any",
),
),
migrations.AlterField(
model_name="importitem",
name="retry_count",
field=models.IntegerField(
blank=True, default=0, help_text="Number of times the task was retried"
),
),
migrations.AlterField(
model_name="importitemasset",
name="failure_history",
field=models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Information about previous failures of the task, if any",
),
),
migrations.AlterField(
model_name="importitemasset",
name="retry_count",
field=models.IntegerField(
blank=True, default=0, help_text="Number of times the task was retried"
),
),
migrations.AlterField(
model_name="importjob",
name="failure_history",
field=models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Information about previous failures of the task, if any",
),
),
migrations.AlterField(
model_name="importjob",
name="retry_count",
field=models.IntegerField(
blank=True, default=0, help_text="Number of times the task was retried"
),
),
migrations.CreateModel(
name="VerifyAssetImageJob",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
(
"last_started",
models.DateTimeField(
blank=True,
help_text="Last time when a worker started processing this job",
null=True,
),
),
(
"completed",
models.DateTimeField(
blank=True,
help_text="Time when the job completed without error",
null=True,
),
),
(
"failed",
models.DateTimeField(
blank=True,
help_text="Time when the job failed due to an error",
null=True,
),
),
(
"status",
models.TextField(
blank=True,
default="",
help_text="Status message, if any, from the last worker",
),
),
(
"task_id",
models.UUIDField(
blank=True,
help_text="UUID of the last Celery task to process this record",
null=True,
),
),
(
"failure_reason",
models.CharField(
blank=True,
choices=[("Image", "Image"), ("Retries", "Retries")],
default="",
help_text="Reason the task failed, if one was provided",
max_length=50,
),
),
(
"retry_count",
models.IntegerField(
blank=True,
default=0,
help_text="Number of times the task was retried",
),
),
(
"failure_history",
models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Information about previous failures of the task, if any",
),
),
(
"status_history",
models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Previous statuses on the task, if any",
),
),
("batch", models.UUIDField(blank=True, editable=False)),
(
"asset",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="concordia.asset",
),
),
],
options={
"abstract": False,
},
),
migrations.CreateModel(
name="DownloadAssetImageJob",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
(
"last_started",
models.DateTimeField(
blank=True,
help_text="Last time when a worker started processing this job",
null=True,
),
),
(
"completed",
models.DateTimeField(
blank=True,
help_text="Time when the job completed without error",
null=True,
),
),
(
"failed",
models.DateTimeField(
blank=True,
help_text="Time when the job failed due to an error",
null=True,
),
),
(
"status",
models.TextField(
blank=True,
default="",
help_text="Status message, if any, from the last worker",
),
),
(
"task_id",
models.UUIDField(
blank=True,
help_text="UUID of the last Celery task to process this record",
null=True,
),
),
(
"failure_reason",
models.CharField(
blank=True,
choices=[("Image", "Image"), ("Retries", "Retries")],
default="",
help_text="Reason the task failed, if one was provided",
max_length=50,
),
),
(
"retry_count",
models.IntegerField(
blank=True,
default=0,
help_text="Number of times the task was retried",
),
),
(
"failure_history",
models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Information about previous failures of the task, if any",
),
),
(
"status_history",
models.JSONField(
blank=True,
default=list,
encoder=django.core.serializers.json.DjangoJSONEncoder,
help_text="Previous statuses on the task, if any",
),
),
("batch", models.UUIDField(blank=True, editable=False)),
(
"asset",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="concordia.asset",
),
),
],
options={
"abstract": False,
},
),
]
================================================
FILE: importer/migrations/0019_alter_downloadassetimagejob_batch_and_more.py
================================================
# Generated by Django 4.2.16 on 2025-03-07 19:19
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("importer", "0018_importitem_status_history_and_more"),
]
operations = [
migrations.AlterField(
model_name="downloadassetimagejob",
name="batch",
field=models.UUIDField(blank=True, editable=False, null=True),
),
migrations.AlterField(
model_name="verifyassetimagejob",
name="batch",
field=models.UUIDField(blank=True, editable=False, null=True),
),
]
================================================
FILE: importer/migrations/0020_alter_downloadassetimagejob_unique_together_and_more.py
================================================
# Generated by Django 4.2.16 on 2025-03-18 20:01
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("concordia", "0103_alter_item_title"),
("importer", "0019_alter_downloadassetimagejob_batch_and_more"),
]
operations = [
migrations.AlterUniqueTogether(
name="downloadassetimagejob",
unique_together={("asset", "batch")},
),
migrations.AlterUniqueTogether(
name="verifyassetimagejob",
unique_together={("asset", "batch")},
),
]
================================================
FILE: importer/migrations/__init__.py
================================================
================================================
FILE: importer/models.py
================================================
from logging import getLogger
from uuid import UUID
from django.core.serializers.json import DjangoJSONEncoder
from django.core.validators import MinValueValidator
from django.db import models
from django.urls import reverse
from django.utils import timezone
from configuration.utils import configuration_value
from importer import tasks
logger = getLogger(__name__)
class TaskStatusModel(models.Model):
"""
Abstract base model that tracks task lifecycle and outcomes.
Subclasses get standard timestamp fields, a free-form status, failure
bookkeeping (reason, history, retry count), and the last Celery task ID
that processed the record.
"""
class FailureReason(models.TextChoices):
IMAGE = "Image"
RETRIES = "Retries"
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)
last_started = models.DateTimeField(
help_text="Last time when a worker started processing this job",
null=True,
blank=True,
)
completed = models.DateTimeField(
help_text="Time when the job completed without error", null=True, blank=True
)
failed = models.DateTimeField(
help_text="Time when the job failed due to an error", null=True, blank=True
)
status = models.TextField(
help_text="Status message, if any, from the last worker", blank=True, default=""
)
task_id = models.UUIDField(
help_text="UUID of the last Celery task to process this record",
null=True,
blank=True,
)
failure_reason = models.CharField(
help_text="Reason the task failed, if one was provided",
max_length=50,
blank=True,
default="",
choices=FailureReason.choices,
)
retry_count = models.IntegerField(
help_text="Number of times the task was retried", default=0, blank=True
)
failure_history = models.JSONField(
help_text="Information about previous failures of the task, if any",
encoder=DjangoJSONEncoder,
default=list,
blank=True,
)
status_history = models.JSONField(
help_text="Previous statuses on the task, if any",
encoder=DjangoJSONEncoder,
default=list,
blank=True,
)
class Meta:
abstract = True
def retry_if_possible(self) -> bool:
"""
Attempt to schedule a retry for this task if policy allows.
Subclasses should override this to implement their own logic.
Returns:
bool: True if a retry was scheduled, otherwise False.
"""
return False
def update_failure_history(self, do_save: bool = True) -> None:
"""
Append the current failure details to the failure history.
Args:
do_save (bool): If True, save the model after updating.
"""
self.failure_history.append(
{
"failed": self.failed,
"failure_reason": self.failure_reason,
"status": self.status,
}
)
if do_save:
self.save()
def update_status(self, status: str, do_save: bool = True) -> None:
"""
Append the previous status to the history and set a new status.
Args:
status (str): The new status value to set.
do_save (bool): If True, save the model after updating.
"""
self.status_history.append(
{
"status": self.status,
"timestamp": self.modified,
}
)
self.status = status
if do_save:
self.save()
def reset_for_retry(self) -> bool:
"""
Reset failure fields and prepare the record for retry.
When the instance is currently marked as failed, move the failure
details into history, clear failure markers, increment retry count,
and set a transitional status.
Returns:
bool: True if the record was reset, otherwise False.
"""
if self.failed:
logger.info(
"Resetting task %s for retrying",
self,
)
self.update_failure_history(do_save=False)
self.failed = None
self.failure_reason = ""
self.update_status("Retrying", do_save=False)
self.retry_count += 1
self.save()
return True
else:
new_status = (
"Task was not marked as failed, so it will "
"not be reset for retrying."
)
self.update_status(new_status)
logger.warning(
"Task %s was not marked as failed, so it will not be "
"reset for retrying",
self,
)
return False
class BatchedJob(TaskStatusModel):
"""
Abstract base model for jobs grouped into batches.
The optional `batch` UUID groups related jobs for scheduling and
admin filtering. Use `batch_admin_url` or `get_batch_admin_url`
to link to the admin list filtered by the batch.
"""
# Allows grouping jobs by batch.
# `batch` is used by the task system to group jobs
# and run them in smaller groups rather than spawning
# an arbitrarily large number at once
# It's also used to group jobs in the admin, allowing
# filtering to see all the jobs spawned by a particular
# action
batch = models.UUIDField(blank=True, null=True, editable=False)
class Meta:
abstract = True
@classmethod
def get_batch_admin_url(cls, batch: UUID | str | None) -> str:
"""
Build the admin changelist URL filtered to the provided batch.
Args:
batch (UUID | str | None): Batch identifier to filter by. Must be
provided.
Returns:
str: Admin changelist URL with the batch query string applied.
Raises:
ValueError: If `batch` is falsy.
"""
if not batch:
raise ValueError("A batch value must be provided.")
app_label = cls._meta.app_label
model_name = cls._meta.model_name
admin_url = reverse(f"admin:{app_label}_{model_name}_changelist")
return f"{admin_url}?batch={batch}"
@property
def batch_admin_url(self) -> str | None:
"""
Convenience property to get the admin URL for this instance's batch.
Returns:
str | None: Admin URL filtered by the instance's batch, or None
when no batch is set.
"""
# Allows getting the batch url from an instance, automatically
# using self.batch rather than needing to call the class method
# get_batch_admin_url if you have an instance
return self.__class__.get_batch_admin_url(self.batch) if self.batch else None
class ImportJob(TaskStatusModel):
"""
Represents a request by a user to import item(s) from a remote URL.
"""
created_by = models.ForeignKey("auth.User", null=True, on_delete=models.SET_NULL)
project = models.ForeignKey("concordia.Project", on_delete=models.CASCADE)
url = models.URLField(verbose_name="Source URL for the entire job")
def __str__(self) -> str:
return "ImportJob(created_by=%s, project=%s, url=%s)" % (
self.created_by.username if self.created_by else None,
self.project.title,
self.url,
)
class ImportItem(TaskStatusModel):
"""
Record of the task status for each Item being imported.
"""
job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items")
url = models.URLField()
item = models.ForeignKey("concordia.Item", on_delete=models.CASCADE)
class Meta:
unique_together = (("job", "item"),)
def __str__(self) -> str:
return "ImportItem(job=%s, url=%s)" % (self.job, self.url)
class ImportItemAsset(TaskStatusModel):
"""
Record of the task status for each Asset being imported.
"""
import_item = models.ForeignKey(
ImportItem, on_delete=models.CASCADE, related_name="assets"
)
url = models.URLField()
sequence_number = models.PositiveIntegerField(validators=[MinValueValidator(1)])
asset = models.ForeignKey("concordia.Asset", on_delete=models.CASCADE)
class Meta:
unique_together = (("import_item", "sequence_number"), ("import_item", "asset"))
def __str__(self) -> str:
return "ImportItemAsset(import_item=%s, url=%s)" % (self.import_item, self.url)
def retry_if_possible(self) -> bool:
"""
Attempt to schedule a retry when the failure was an image error.
Uses two configuration values:
- `asset_image_import_max_retries`: Maximum number of retries allowed.
- `asset_image_import_max_retry_delay`: Delay (minutes) before retry.
When eligible and reset succeeds, schedules a Celery task via
`download_asset_task.apply_async(...)`.
Returns:
bool: True if a retry was scheduled, otherwise False.
"""
if self.failure_reason == TaskStatusModel.FailureReason.IMAGE:
max_retries = configuration_value("asset_image_import_max_retries")
retry_delay = configuration_value("asset_image_import_max_retry_delay")
if self.retry_count < max_retries and retry_delay > 0:
if self.reset_for_retry():
return bool(
tasks.assets.download_asset_task.apply_async(
(self.pk,), countdown=retry_delay * 60
)
)
else:
logger.warning(
"Task %s was not reset for retrying, so it will not be retried",
self,
)
return False
else:
logger.warning(
"Task %s has reached the maximum number of retries (%s) "
"and will not be repeated",
self,
max_retries,
)
self.update_failure_history(do_save=False)
self.failed = timezone.now()
new_status = (
"Maximum number of retries reached while retrying "
"image download for asset. The failure reason before retrying "
f"was {self.failure_reason} and the status was {self.status}"
)
self.update_status(new_status, do_save=False)
self.failure_reason = TaskStatusModel.FailureReason.RETRIES
self.save()
return False
return False
class VerifyAssetImageJob(BatchedJob):
"""
Job that verifies a previously downloaded asset image.
"""
asset = models.ForeignKey("concordia.Asset", on_delete=models.CASCADE)
def __str__(self) -> str:
return f"VerifyAssetImageJob for {self.asset}"
class Meta:
unique_together = (("asset", "batch"),)
class DownloadAssetImageJob(BatchedJob):
"""
Job that downloads an asset image for later verification.
"""
asset = models.ForeignKey("concordia.Asset", on_delete=models.CASCADE)
def __str__(self) -> str:
return f"DownloadAssetImageJob for {self.asset}"
class Meta:
unique_together = (("asset", "batch"),)
================================================
FILE: importer/setup.py
================================================
#!/usr/bin/env python
from setuptools import find_packages, setup
VERSION = __import__("importer").get_version()
INSTALL_REQUIREMENTS = ["boto3", "celery", "requests", "Django>=2.1.5", "Pillow"]
DESCRIPTION = "Download collections of images from loc.gov"
CLASSIFIERS = """
Environment :: Web Environment
Framework :: Django :: 2.0
Development Status :: 2 - Pre-Alpha
Programming Language :: Python
Programming Language :: Python :: 3.12
""".splitlines()
with open("README.rst", "r") as f:
LONG_DESCRIPTION = f.read()
setup(
name="importer",
version=VERSION,
description=DESCRIPTION,
long_description=LONG_DESCRIPTION,
packages=find_packages(),
include_package_data=True,
install_requires=INSTALL_REQUIREMENTS,
classifiers=CLASSIFIERS,
)
================================================
FILE: importer/tasks/__init__.py
================================================
import concurrent.futures
from logging import getLogger
from typing import Iterable
from .items import import_item_count_from_url
logger = getLogger(__name__)
def fetch_all_urls(items: Iterable[str]) -> tuple[list[str], int]:
"""
Fetch counts for many item URLs concurrently.
Uses a thread pool to call ``import_item_count_from_url`` for each input
URL. Aggregates the returned values and the total score.
Args:
items: Iterable of item URLs.
Returns:
A 2-tuple of:
- list of values returned for each URL, in the map order
- integer sum of all scores
"""
with concurrent.futures.ThreadPoolExecutor(max_workers=25) as executor:
result = executor.map(import_item_count_from_url, items)
finals: list[str] = []
totals: int = 0
for value, score in result:
totals += score
finals.append(value)
return finals, totals
================================================
FILE: importer/tasks/assets.py
================================================
import hashlib
import os
from logging import getLogger
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse
import boto3
import requests
from celery import Task
from django.conf import settings
from flags.state import flag_enabled
from requests.exceptions import HTTPError
from concordia.storage import ASSET_STORAGE
from importer import models
from importer.celery import app
from importer.exceptions import ImageImportFailure
from .decorators import update_task_status
logger = getLogger(__name__)
@app.task(
bind=True,
autoretry_for=(HTTPError,),
retry_backoff=60 * 60,
retry_backoff_max=8 * 60 * 60,
retry_jitter=True,
retry_kwargs={"max_retries": 3},
rate_limit=1,
)
def download_asset_task(self: Task, import_asset_pk: int) -> None:
"""
Download and persist an asset image for the given ImportItemAsset.
Looks up the ImportItemAsset with related objects to reduce queries, then
delegates to ``download_asset``. Retries on ``HTTPError`` per task config.
Args:
import_asset_pk: Primary key of the ImportItemAsset to process.
Raises:
models.ImportItemAsset.DoesNotExist: If the job row does not exist.
ImageImportFailure: If the download or verification fails.
"""
# Use select_related since slugs from the container objects form the path.
qs = models.ImportItemAsset.objects.select_related(
"import_item__item__project__campaign"
)
try:
import_asset = qs.get(pk=import_asset_pk)
except models.ImportItemAsset.DoesNotExist:
logger.exception(
"ImportItemAsset %s could not be found while attempting to "
"spawn download_asset task",
import_asset_pk,
)
raise
download_asset(self, import_asset)
@update_task_status
def download_asset(self: Task, job: "models.ImportItemAsset") -> None:
"""
Download the image for the given job and save it to working storage.
The URL is taken from ``job.url`` when present, otherwise from
``job.asset.download_url``. The extension is inferred from the URL path
and normalized so ``jpeg`` becomes ``jpg``. On success the asset's
``storage_image`` field is updated.
Args:
job: ImportItemAsset containing the target asset and optional URL.
Raises:
ImageImportFailure: If the download, upload or checksum check fails.
"""
asset = job.asset
download_url: str = job.url if hasattr(job, "url") else asset.download_url
file_extension = (
os.path.splitext(urlparse(download_url).path)[1].lstrip(".").lower()
)
if not file_extension or file_extension == "jpeg":
file_extension = "jpg"
asset_image_filename = asset.get_asset_image_filename(file_extension)
storage_image = download_and_store_asset_image(download_url, asset_image_filename)
logger.info(
"Download and storage of asset image %s complete. Setting "
"storage_image on asset %s (%s)",
storage_image,
asset,
asset.id,
)
asset.storage_image = storage_image
asset.save()
def download_and_store_asset_image(download_url: str, asset_image_filename: str) -> str:
"""
Stream a remote image to a temp file, upload it to storage, then verify.
The file is streamed and hashed with MD5, uploaded to ``ASSET_STORAGE``,
then the object metadata is fetched via S3 ``head_object`` and the ETag is
compared to the computed MD5. When the ``IMPORT_IMAGE_CHECKSUM`` flag is
enabled a mismatch raises ``ImageImportFailure``. When disabled a warning
is logged.
Args:
download_url: HTTP(S) URL of the image to fetch.
asset_image_filename: Destination key or path in storage.
Returns:
The storage key that was written.
Raises:
ImageImportFailure: On HTTP errors, I/O errors or checksum mismatch.
"""
try:
hasher = hashlib.md5(usedforsecurity=False)
# Download the remote file to a temp file then upload to storage.
with NamedTemporaryFile(mode="x+b") as temp_file:
resp = requests.get(download_url, stream=True, timeout=30)
resp.raise_for_status()
for chunk in resp.iter_content(chunk_size=256 * 1024):
temp_file.write(chunk)
hasher.update(chunk)
temp_file.flush()
temp_file.seek(0)
ASSET_STORAGE.save(asset_image_filename, temp_file)
except Exception as exc:
logger.exception(
"Unable to download %s to %s", download_url, asset_image_filename
)
raise ImageImportFailure(
f"Unable to download {download_url} to {asset_image_filename}"
) from exc
filehash = hasher.hexdigest()
response = boto3.client("s3").head_object(
Bucket=settings.AWS_STORAGE_BUCKET_NAME, Key=asset_image_filename
)
etag = response.get("ETag")[1:-1] # trim quotes around hash
if filehash != etag:
if flag_enabled("IMPORT_IMAGE_CHECKSUM"):
logger.error(
"ETag (%s) for %s did not match calculated md5 hash (%s) and "
"the IMPORT_IMAGE_CHECKSUM flag is enabled",
etag,
asset_image_filename,
filehash,
)
raise ImageImportFailure(
f"ETag {etag} for {asset_image_filename} did not match "
f"calculated md5 hash {filehash}"
)
else:
logger.warning(
"ETag (%s) for %s did not match calculated md5 hash (%s) but "
"the IMPORT_IMAGE_CHECKSUM flag is disabled",
etag,
asset_image_filename,
filehash,
)
else:
logger.info(
"Checksums for %s matched. Upload successful.",
asset_image_filename,
)
return asset_image_filename
================================================
FILE: importer/tasks/collections.py
================================================
from logging import getLogger
from typing import Optional
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
import requests
from celery import Task
from django.core.cache import cache
from requests import Session
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from importer import models
from importer.celery import app
from .decorators import update_task_status
from .items import create_item_import_task, get_item_info_from_result
logger = getLogger(__name__)
# Tasks
@app.task(bind=True)
def import_collection_task(
self: Task, import_job_pk: int, redownload: bool = False
) -> None:
"""
Celery entrypoint to import all items from a P1 collection or search URL.
Looks up the ``ImportJob`` and delegates to ``import_collection``.
Args:
import_job_pk: Primary key of the ImportJob.
redownload: If true, force re-download of assets when creating tasks.
"""
import_job = models.ImportJob.objects.get(pk=import_job_pk)
import_collection(self, import_job, redownload)
@update_task_status
def import_collection(
self: Task, import_job: models.ImportJob, redownload: bool = False
) -> None:
"""
Enqueue item import tasks for every item in a normalized collection URL.
Args:
import_job: The ImportJob that initiated the collection import.
redownload: If true, force re-download of assets.
"""
item_info = get_collection_items(normalize_collection_url(import_job.url))
for _, item_url in item_info:
create_item_import_task.delay(import_job.pk, item_url, redownload)
# End tasks
def requests_retry_session(
retries: int = 3,
backoff_factor: float = 60 * 60,
status_forcelist: tuple[int, ...] = (429, 500, 502, 503, 504),
session: Optional[Session] = None,
) -> Session:
"""
Build a ``requests.Session`` with retry behavior for transient failures.
Args:
retries: Total number of retry attempts.
backoff_factor: Multiplier for exponential backoff in seconds.
status_forcelist: HTTP status codes that trigger a retry.
session: Optional existing session to configure.
Returns:
A ``requests.Session`` with retry adapters mounted.
"""
sess = session or requests.Session()
retry = Retry(
total=retries,
read=retries,
connect=retries,
backoff_factor=backoff_factor,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
sess.mount("http://", adapter)
sess.mount("https://", adapter)
return sess
def normalize_collection_url(original_url: str) -> str:
"""
Normalize a P1 collection or search URL for import.
Rewrites query params needed for JSON output and pagination. Leaves other
filters intact.
Args:
original_url: The source collection or search URL.
Returns:
A normalized URL with ``fo=json`` and without conflicting params.
"""
parsed_url = urlsplit(original_url)
new_qs = [("fo", "json")]
for k, v in parse_qsl(parsed_url.query):
if k not in ("fo", "at", "sp"):
new_qs.append((k, v))
return urlunsplit(
(parsed_url.scheme, parsed_url.netloc, parsed_url.path, urlencode(new_qs), None)
)
def get_collection_items(collection_url: str) -> list[tuple[str, str]]:
"""
Walk a P1 collection or search endpoint and collect item IDs and URLs.
Caches each page response for 48 hours to reduce repeated network calls.
Args:
collection_url: URL of a loc.gov collection or search results page.
Returns:
A list of ``(item_id, item_url)`` tuples discovered across pages.
"""
items: list[tuple[str, str]] = []
current_page_url: Optional[str] = collection_url
while current_page_url:
resp = cache.get(current_page_url)
if resp is None:
resp = requests_retry_session().get(current_page_url)
# 48-hour timeout
cache.set(current_page_url, resp, timeout=(3600 * 48))
data = resp.json()
results = data.get("results", None)
if results:
for result in results:
try:
item_info = get_item_info_from_result(result)
if item_info:
items.append(item_info)
except Exception:
logger.warning(
"Skipping result from %s which did not match expected format:",
current_page_url,
exc_info=True,
extra={"data": {"result": result, "url": current_page_url}},
)
else:
logger.error('Expected URL %s to include "results"', current_page_url)
current_page_url = data.get("pagination", {}).get("next", None)
if not items:
logger.warning("No valid items found for collection url: %s", collection_url)
return items
================================================
FILE: importer/tasks/decorators.py
================================================
from functools import wraps
from logging import getLogger
from typing import Any, Callable, Concatenate, ParamSpec, TypeVar
from celery import Task
from django.utils.timezone import now
from importer import models
from importer.exceptions import ImageImportFailure
logger = getLogger(__name__)
P = ParamSpec("P")
R = TypeVar("R")
def update_task_status(
f: Callable[Concatenate[Task, Any, P], R],
) -> Callable[Concatenate[Task, Any, P], R]:
"""
Decorator to track lifecycle and failure state for a task-like function.
The wrapped function must take the Celery task self as the first argument
and a TaskStatusModel instance as the second argument. On entry records
last_started and task_id. On success sets completed and clears failure
fields. On exception updates status, marks failed, sets failure_reason for
known error types, saves the model, then attempts retry_if_possible before
re-raising.
Also guards against re-running a task already marked completed.
Args:
f: The function to wrap. Must accept
``(self, task_status_object, *args, **kwargs)``.
Returns:
A callable with the same signature as ``f``.
"""
@wraps(f)
def inner(
self: Task, task_status_object: Any, *args: P.args, **kwargs: P.kwargs
) -> R:
# Sanity guard: if another worker already completed this task, skip work.
guard_qs = task_status_object.__class__._default_manager.filter(
pk=task_status_object.pk, completed__isnull=False
)
if guard_qs.exists():
logger.warning(
"Task %s was already completed and will not be repeated",
task_status_object,
extra={
"data": {
"object": task_status_object,
"args": args,
"kwargs": kwargs,
}
},
)
return # noqa: RET504
task_status_object.last_started = now()
task_status_object.task_id = self.request.id
task_status_object.save()
try:
result = f(self, task_status_object, *args, **kwargs)
task_status_object.completed = now()
task_status_object.failed = None
task_status_object.failure_reason = ""
task_status_object.update_status("Completed")
return result
except Exception as exc:
new_status = "{}\n\nUnhandled exception: {}".format(
task_status_object.status, exc
).strip()
task_status_object.update_status(new_status, do_save=False)
task_status_object.failed = now()
if isinstance(exc, ImageImportFailure):
task_status_object.failure_reason = (
models.TaskStatusModel.FailureReason.IMAGE
)
task_status_object.save()
retry_result = task_status_object.retry_if_possible()
if retry_result:
task_status_object.last_started = now()
task_status_object.task_id = retry_result.id
task_status_object.save()
else:
logger.info("Retrying task %s was not possible", task_status_object)
raise
return inner
================================================
FILE: importer/tasks/images.py
================================================
from logging import getLogger
from typing import Any, Optional, Sequence
from uuid import UUID
from celery import Task, chord
from PIL import Image
from requests.exceptions import HTTPError
from concordia.models import Asset
from concordia.storage import ASSET_STORAGE
from importer import models
from importer.celery import app
from .assets import download_asset
from .decorators import update_task_status
logger = getLogger(__name__)
@app.task(
bind=True,
autoretry_for=(HTTPError,),
retry_backoff=60 * 60,
retry_backoff_max=8 * 60 * 60,
retry_jitter=True,
retry_kwargs={"max_retries": 3},
rate_limit=1,
)
def redownload_image_task(self: Task, asset_pk: int) -> None:
"""
Re-download an asset's image and persist it to storage, replacing any
existing image.
Looks up the Asset, creates a DownloadAssetImageJob to track work, then
delegates to download_asset.
Args:
asset_pk: Primary key of the Asset to re-download.
"""
asset = Asset.objects.get(pk=asset_pk)
logger.info("Redownloading %s to %s", asset.download_url, asset.get_absolute_url())
# Create a tracking job so download_asset can run under update_task_status.
job = models.DownloadAssetImageJob.objects.create(asset=asset, batch=None)
download_asset(self, job)
@app.task()
def batch_verify_asset_images_task_callback(
results: Sequence[bool],
batch: UUID,
concurrency: int,
failures_detected: bool,
) -> None:
"""
Callback after a chord of VerifyAssetImageJobs completes.
If no prior failure was noted and any result is False, mark failures as
detected. In all cases enqueue the next verification batch.
Args:
results: Verification outcomes for this chord (True or False).
batch: Identifier for the active batch.
concurrency: Number of jobs to run in the next batch.
failures_detected: Whether a failure was already seen.
"""
# We only care if there are any failures, not exactly which or how many, since we
# automatically create a DownliadImageAssetJob for each failure already, so here
# we skip this check if we already have a detected failure
if not failures_detected:
# No failures so far, so we need to check the results from this latest
# chord of tasks
if any(result is False for result in results):
logger.info(
"At least one verification failure detected for batch %s", batch
)
failures_detected = True
batch_verify_asset_images_task.delay(batch, concurrency, failures_detected)
@app.task(bind=True)
def batch_verify_asset_images_task(
self: Task, batch: UUID, concurrency: int = 2, failures_detected: bool = False
) -> None:
"""
Process VerifyAssetImageJobs in groups of size concurrency.
After processing:
- If any failure was detected, start a DownloadAssetImageJob batch.
- Otherwise, end cleanly.
Args:
batch: Identifier for the batch to process.
concurrency: Number of jobs to process at once. Defaults to 2.
failures_detected: Whether earlier groups reported a failure.
Defaults to False.
"""
logger.info(
"Processing next %s VerifyAssetImageJobs for batch %s", concurrency, batch
)
jobs_to_process = models.VerifyAssetImageJob.objects.filter(
batch=batch, completed__isnull=True, failed__isnull=True
).order_by("created")
if not jobs_to_process.exists():
logger.info("No VerifyAssetImageJobs remain for batch %s", batch)
if failures_detected:
logger.info(
"Failures in VerifyAssetImageJobs in batch %s detected, so "
"starting DownloadAssetImageJob batch",
batch,
)
batch_download_asset_images_task(batch, concurrency)
else:
logger.info(
"No failures in VerifyAssetImageJob batch %s. Ending task.", batch
)
return
task_group = [
verify_asset_image_task.s(job.asset_id, batch)
for job in jobs_to_process[:concurrency]
]
chord(task_group)(
batch_verify_asset_images_task_callback.s(batch, concurrency, failures_detected)
)
@app.task(
bind=True,
autoretry_for=(HTTPError,),
retry_backoff=60 * 60,
retry_backoff_max=8 * 60 * 60,
retry_jitter=True,
retry_kwargs={"max_retries": 3},
rate_limit=1,
)
def verify_asset_image_task(
self: Task, asset_pk: int, batch: Optional[UUID] = None, create_job: bool = False
) -> bool:
"""
Verify that an asset's storage image exists and is readable.
Creates or retrieves a VerifyAssetImageJob, runs verification and updates
status. Retries on HTTPError using exponential backoff.
Args:
asset_pk: Primary key of the Asset to verify.
batch: Identifier for the verification batch, if any.
create_job: If True, create a new job; otherwise fetch an existing one.
Returns:
True if verification succeeds, False otherwise.
Raises:
Asset.DoesNotExist: When the Asset cannot be found.
models.VerifyAssetImageJob.DoesNotExist: When fetching a job that does
not exist.
"""
try:
asset = Asset.objects.get(pk=asset_pk)
except Asset.DoesNotExist:
logger.exception(
"Asset %s could not be found while attempting to "
"spawn verify_asset_image task",
asset_pk,
)
raise
if create_job:
job = models.VerifyAssetImageJob.objects.create(asset=asset, batch=batch)
else:
try:
job = models.VerifyAssetImageJob.objects.get(
asset=asset, batch=batch, completed=None
)
except models.VerifyAssetImageJob.DoesNotExist:
logger.exception(
"Uncompleted VerifyAssetImageJob for asset %s and batch %s could not "
"be found while attempting to spawn verify_asset_image task",
asset,
batch,
)
raise
result = verify_asset_image(self, job)
if result is True:
job.update_status("Storage image verified")
return result
def create_download_asset_image_job(asset: Asset, batch: Optional[UUID]) -> None:
"""
Ensure a DownloadAssetImageJob exists for the given asset and batch.
Args:
asset: Asset to download.
batch: Batch identifier or None.
"""
existing_job = models.DownloadAssetImageJob.objects.filter(
asset=asset, batch=batch
).first()
if not existing_job:
models.DownloadAssetImageJob.objects.create(asset=asset, batch=batch)
@update_task_status
def verify_asset_image(task: Task, job: Any) -> bool:
"""
Verify the presence and integrity of an Asset's storage image.
Checks that a storage image is set, the object exists in storage, and that
the image bytes are not corrupt. On failure, updates job status and creates
a DownloadAssetImageJob.
Args:
task: Celery task instance.
job: VerifyAssetImageJob instance.
Returns:
True if verification succeeds, False otherwise.
"""
asset = job.asset
if not asset.storage_image or not asset.storage_image.name:
status = f"No storage image set on {asset} ({asset.id})"
logger.info(status)
job.update_status(status)
create_download_asset_image_job(asset, job.batch)
return False
else:
logger.info("Storage image set on %s (%s)", asset, asset.id)
if not ASSET_STORAGE.exists(asset.storage_image.name):
status = f"Storage image for {asset} ({asset.id}) missing from storage"
logger.info(status)
job.update_status(status)
create_download_asset_image_job(asset, job.batch)
return False
else:
logger.info("Storage image for %s (%s) found in storage", asset, asset.id)
try:
with ASSET_STORAGE.open(asset.storage_image.name, "rb") as image_file:
with Image.open(image_file) as image:
image.verify()
logger.info("Storage image for %s (%s) is not corrupt", asset, asset.id)
except Exception as exc:
status = (
f"Storage image for {asset} ({asset.id}), {asset.storage_image.name}, "
f"is corrupt. The exception raised was Type: {type(exc).__name__}, "
f"Message: {exc}"
)
logger.info(status)
job.update_status(status)
create_download_asset_image_job(asset, job.batch)
return False
logger.info(
"Storage image for %s (%s), %s, verified successfully",
asset,
asset.id,
asset.storage_image.name,
)
return True
@app.task()
def batch_download_asset_images_task_callback(
results: Sequence[Any], batch: UUID, concurrency: int
) -> None:
"""
Callback after a chord of DownloadAssetImageJobs completes.
Results are ignored. Enqueue the next download batch.
Args:
results: Ignored chord results.
batch: Identifier for the batch.
concurrency: Number of jobs to run in the next batch.
"""
# We do not care about the results of these tasks, so we simply call the
# original task again to continue processing the batch.
batch_download_asset_images_task.delay(batch, concurrency)
@app.task(bind=True)
def batch_download_asset_images_task(
self: Task, batch: UUID, concurrency: int = 10
) -> None:
"""
Process DownloadAssetImageJobs in groups of size concurrency.
Retrieves pending jobs for the batch, runs up to concurrency tasks, then
schedules the next group via a chord callback until none remain.
Args:
batch: Identifier for the batch to process.
concurrency: Number of concurrent tasks per group. Defaults to 10.
"""
logger.info(
"Processing next %s DownloadAssetImageJobs for batch %s", concurrency, batch
)
jobs_to_process = models.DownloadAssetImageJob.objects.filter(
batch=batch, completed__isnull=True, failed__isnull=True
).order_by("created")
if not jobs_to_process.exists():
logger.info("No DownloadAssetImageJobs found for batch %s", batch)
return
task_groups = [
download_asset_image_task.s(job.asset.pk, batch)
for job in jobs_to_process[:concurrency]
]
# Use a chord so when the tasks finish it calls the callback to start the
# remaining jobs until no more remain. The callback just re-invokes this
# task with the same batch and concurrency.
chord(task_groups)(batch_download_asset_images_task_callback.s(batch, concurrency))
@app.task(
bind=True,
autoretry_for=(HTTPError,),
retry_backoff=60 * 60,
retry_backoff_max=8 * 60 * 60,
retry_jitter=True,
retry_kwargs={"max_retries": 3},
rate_limit=1,
)
def download_asset_image_task(
self: Task, asset_pk: int, batch: Optional[UUID] = None, create_job: bool = False
) -> None:
"""
Download an asset's image and track it via DownloadAssetImageJob.
Creates or retrieves a job and delegates to download_asset. Retries on
HTTPError using exponential backoff.
Args:
asset_pk: Primary key of the Asset to download.
batch: Identifier for the batch, if any.
create_job: If True, create a new job; otherwise fetch an existing one.
Raises:
Asset.DoesNotExist: When the Asset cannot be found.
models.DownloadAssetImageJob.DoesNotExist: When fetching a job that
does not exist.
"""
try:
asset = Asset.objects.get(pk=asset_pk)
except Asset.DoesNotExist:
logger.exception(
"Asset %s could not be found while attempting to "
"spawn verify_asset_image task",
asset_pk,
)
raise
if create_job:
job = models.DownloadAssetImageJob.objects.create(asset=asset, batch=batch)
else:
try:
job = models.DownloadAssetImageJob.objects.get(
asset=asset, batch=batch, completed=None
)
except models.DownloadAssetImageJob.DoesNotExist:
logger.exception(
"Uncompleted DownloadAssetImageJob for asset %s and batch %s could not "
"be found while attempting to spawn download_asset_image task",
asset,
batch,
)
raise
return download_asset(self, job)
================================================
FILE: importer/tasks/items.py
================================================
import io
import mimetypes
import os
import re
from logging import getLogger
from typing import Any, List, Optional, Tuple
from urllib.parse import urljoin, urlparse
import requests
from celery import Task, group
from django.core.exceptions import ValidationError
from django.core.files.base import ContentFile
from django.db import transaction
from django.utils.text import slugify
from django.utils.timezone import now
from PIL import Image, UnidentifiedImageError
from requests.exceptions import HTTPError
from concordia.models import Asset, Item, MediaType
from importer import models
from importer.celery import app
from .assets import download_asset_task
from .decorators import update_task_status
#: P1 has generic search / item pages and a number of top-level format-specific
#: "context portals" which expose the same JSON interface.
#: jq 'to_entries[] | select(.value.type == "context-portal") | .key' < manifest.json
ACCEPTED_P1_URL_PREFIXES = [
"collections",
"search",
"item",
"audio",
"books",
"film-and-videos",
"manuscripts",
"maps",
"newspapers",
"notated-music",
"photos",
"websites",
]
logger = getLogger(__name__)
# Tasks
@app.task(
bind=True,
autoretry_for=(HTTPError,),
retry_backoff=60 * 60,
retry_backoff_max=8 * 60 * 60,
retry_jitter=True,
retry_kwargs={"max_retries": 3},
rate_limit=2,
)
def create_item_import_task(
self: Task, import_job_pk: int, item_url: str, redownload: bool = False
) -> Any:
"""
Create an ImportItem for the given job and item URL, then enqueue its
import.
Fetches item metadata from the remote URL, ensures the Item and
ImportItem exist, skips fully-imported items when not redownloading, and
finally schedules ``import_item_task``.
Args:
import_job_pk: Primary key of the ImportJob.
item_url: Absolute item URL on loc.gov.
redownload: Reprocess an existing item even if it has all assets.
Returns:
The AsyncResult returned by ``import_item_task.delay``.
"""
import_job = models.ImportJob.objects.get(pk=import_job_pk)
# Load the Item record with metadata from the remote URL:
resp = requests.get(item_url, params={"fo": "json"}, timeout=30)
resp.raise_for_status()
item_data = resp.json()
item, item_created = Item.objects.get_or_create(
item_id=get_item_id_from_item_url(item_data["item"]["id"]),
defaults={"item_url": item_url, "project": import_job.project},
)
import_item, import_item_created = import_job.items.get_or_create(
url=item_url, item=item
)
if not item_created and redownload is False:
# Item has already been imported and we are not redownloading all items.
asset_urls, item_resource_url = get_asset_urls_from_item_resources(
item.metadata.get("resources", [])
)
if item.asset_set.count() >= len(asset_urls):
# The item has all of its assets, so we can skip it.
logger.warning("Not reprocessing existing item with all assets: %s", item)
import_item.update_status(
f"Not reprocessing existing item with all assets: {item}",
do_save=False,
)
import_item.completed = import_item.last_started = now()
import_item.task_id = self.request.id
import_item.full_clean()
import_item.save()
return
else:
# The item is missing one or more of its assets, so reprocess it.
logger.warning("Reprocessing existing item %s that is missing assets", item)
import_item.item.metadata.update(item_data)
thumbnail_url = populate_item_from_data(import_item.item, item_data["item"])
try:
item.full_clean()
item.save()
except Exception as exc:
# We create the import jobs here, so we cannot rely on the decorator to
# update status. Update the ImportItem status manually then re-raise.
logger.exception("Unhandled exception when importing item %s", item)
new_status = "{}\n\nUnhandled exception: {}".format(
import_item.status, exc
).strip()
import_item.update_status(new_status, do_save=False)
import_item.failed = now()
import_item.task_id = self.request.id
import_item.save()
raise
download_and_set_item_thumbnail(item, thumbnail_url)
return import_item_task.delay(import_item.pk)
@app.task(bind=True)
def import_item_task(self: Task, import_item_pk: int) -> Any:
"""
Enqueue downloads for all assets of a previously created ImportItem.
Args:
import_item_pk: Primary key of the ImportItem to process.
Returns:
The result of the celery group that downloads assets.
"""
i = models.ImportItem.objects.select_related("item").get(pk=import_item_pk)
return import_item(self, i)
@update_task_status
def import_item(self: Task, import_item: Any) -> Any:
"""
Create Asset rows for an ImportItem, create ImportItemAsset rows, then
enqueue downloads for all assets.
Wrapped with ``update_task_status`` to keep job fields updated.
Args:
self: Celery Task instance.
import_item: ImportItem instance being processed.
Returns:
A celery group result for the scheduled download tasks.
"""
# Using transaction.atomic here ensures the data is available in the
# database for the download_asset_task calls. If we do not do this some
# tasks could execute before the transaction is committed, causing failures.
with transaction.atomic():
item_assets: List[Asset] = []
import_assets: List[Any] = []
item_resource_url: Optional[str] = None
asset_urls, item_resource_url = get_asset_urls_from_item_resources(
import_item.item.metadata.get("resources", [])
)
relative_asset_file_path = "/".join(
[
import_item.item.project.campaign.slug,
import_item.item.project.slug,
import_item.item.item_id,
]
)
for sequence, asset_url in enumerate(asset_urls, start=1):
asset_title = f"{import_item.item.item_id}-{sequence}"
file_extension = (
os.path.splitext(urlparse(asset_url).path)[1].lstrip(".").lower()
)
item_asset = Asset(
item=import_item.item,
campaign=import_item.item.project.campaign,
title=asset_title,
slug=slugify(asset_title, allow_unicode=True),
sequence=sequence,
media_type=MediaType.IMAGE,
download_url=asset_url,
resource_url=item_resource_url,
storage_image="/".join(
[relative_asset_file_path, f"{sequence}.{file_extension}"]
),
)
# Previously any asset that raised a validation error was ignored.
# We want validation errors to fail the import.
try:
item_asset.full_clean()
except ValidationError as exc:
raise ValidationError(
f"Importing asset with slug '{item_asset.slug}' for "
f"item '{item_asset.item}' with resource URL "
f"'{item_asset.resource_url}' failed with the following "
f"exception: {exc}"
) from exc
item_assets.append(item_asset)
Asset.objects.bulk_create(item_assets)
for asset in item_assets:
import_asset = models.ImportItemAsset(
import_item=import_item,
asset=asset,
url=asset.download_url,
sequence_number=asset.sequence,
)
import_asset.full_clean()
import_assets.append(import_asset)
import_item.assets.bulk_create(import_assets)
import_item.full_clean()
import_item.save()
download_asset_group = group(download_asset_task.s(i.pk) for i in import_assets)
return download_asset_group()
# End tasks
def import_item_count_from_url(import_url: str) -> Tuple[str, int]:
"""
Return a tuple of status string and asset count for a loc.gov item URL.
Args:
import_url: Absolute item URL.
Returns:
A pair of ``(status_message, count)``. On error returns a message and
count 0.
"""
try:
resp = requests.get(import_url, params={"fo": "json"}, timeout=30)
resp.raise_for_status()
item_data = resp.json()
output = len(item_data["resources"][0]["files"])
return f"{import_url} - Asset Count: {output}", output
except Exception as exc:
return f"Unhandled exception importing {import_url} {exc}", 0
def get_item_info_from_result(
result: dict,
) -> Optional[Tuple[str, str]]:
"""
Extract an item_id and item_url from a P1 search result.
Skips results with unsupported formats or without an image_url.
Args:
result: A single result object from the P1 JSON response.
Returns:
``(item_id, item_url)`` when supported, otherwise None.
"""
ignored_formats = {"collection", "web page"}
item_id = result["id"]
original_format = result["original_format"]
if ignored_formats.intersection(original_format):
logger.info(
"Skipping result %s because it contains an unsupported format: %s",
item_id,
original_format,
extra={"data": {"result": result}},
)
return None
image_url = result.get("image_url")
if not image_url:
logger.info(
"Skipping result %s because it lacks an image_url",
item_id,
extra={"data": {"result": result}},
)
return None
item_url = result["url"]
m = re.search(r"loc.gov/item/([^/]+)", item_url)
if not m:
logger.info(
"Skipping %s because the URL %s doesn't appear to be an item!",
item_id,
item_url,
extra={"data": {"result": result}},
)
return None
return m.group(1), item_url
def get_item_id_from_item_url(item_url: str) -> str:
"""
Extract the item_id component from a loc.gov item URL.
Args:
item_url: Absolute item URL.
Returns:
The item_id string.
"""
if item_url.endswith("/"):
item_id = item_url.split("/")[-2]
else:
item_id = item_url.split("/")[-1]
return item_id
def import_items_into_project_from_url(
requesting_user: Any, project: Any, import_url: str, redownload: bool = False
) -> Any:
"""
Create an ImportJob for the given URL and enqueue item or collection import.
Determines whether the URL is an item or a collection/search URL and
schedules the appropriate task.
Args:
requesting_user: User creating the ImportJob.
project: Project that will own the imported Items.
import_url: loc.gov item or collection/search URL.
redownload: Reprocess existing items even if they have all assets.
Returns:
The created ImportJob instance.
"""
parsed_url = urlparse(import_url)
m = re.match(
r"^/(%s)/" % "|".join(map(re.escape, ACCEPTED_P1_URL_PREFIXES)), parsed_url.path
)
if not m:
raise ValueError(
f"{import_url} doesn't match one of the known importable patterns"
)
url_type = m.group(1)
import_job = models.ImportJob(
project=project, created_by=requesting_user, url=import_url
)
import_job.full_clean()
import_job.save()
if url_type == "item":
create_item_import_task.delay(import_job.pk, import_url, redownload)
else:
# Both collections and search results return the same format JSON
# response so we can use the same code to process them.
from .collections import import_collection_task
import_collection_task.delay(import_job.pk, redownload)
return import_job
def populate_item_from_data(item: Item, item_info: dict) -> Optional[str]:
"""
Populate an Item from a loc.gov item JSON fragment.
Sets title and description when present. Chooses a JPG thumbnail URL if
available, stores it on the Item, and returns the resolved URL.
Args:
item: The Item instance to update.
item_info: The ``item`` object from the P1 response.
Returns:
The resolved thumbnail URL when found, otherwise None.
"""
for k in ("title", "description"):
v = item_info.get(k)
if v:
setattr(item, k, v)
# FIXME: this was never set before so we do not have selection logic.
thumb_urls = [i for i in item_info["image_url"] if ".jpg" in i]
if thumb_urls:
item.thumbnail_url = urljoin(item.item_url, thumb_urls[0])
try:
image_urls = item_info.get("image_url") or []
thumb_urls = [u for u in image_urls if ".jpg" in u]
except Exception:
thumb_urls = []
if thumb_urls:
resolved = urljoin(item.item_url, thumb_urls[0])
# TODO: remove setting thumbnail_url once field is removed.
item.thumbnail_url = resolved
return resolved
return None
def get_asset_urls_from_item_resources(
resources: List[dict],
) -> Tuple[List[str], str]:
"""
From a P1 resources list, pick best image URL per file.
Prefers the largest JPEG variant per file. If no JPEGs exist, falls back
to the largest GIF. Also returns the item resource URL when available.
Args:
resources: The ``resources`` array from the P1 response.
Returns:
A tuple of ``(asset_urls, item_resource_url)``.
"""
assets: List[str] = []
try:
item_resource_url = resources[0]["url"] or ""
except (IndexError, KeyError):
item_resource_url = ""
for resource in resources:
# Each "file" contains a set of variants. Select the largest preferred
# type per file.
for item_file in resource.get("files", []):
candidates: List[Tuple[str, int]] = []
backup_candidates: List[Tuple[str, int]] = []
for variant in item_file:
if any(i for i in ("url", "height", "width") if i not in variant):
continue
url = variant["url"]
height = variant["height"]
width = variant["width"]
mimetype = variant.get("mimetype")
# Prefer JPEG; if none exist use GIF.
if mimetype == "image/jpeg":
candidates.append((url, height * width))
elif mimetype == "image/gif":
backup_candidates.append((url, height * width))
if candidates:
candidates.sort(key=lambda i: i[1], reverse=True)
assets.append(candidates[0][0])
elif backup_candidates:
backup_candidates.sort(key=lambda i: i[1], reverse=True)
assets.append(backup_candidates[0][0])
return assets, item_resource_url
def _guess_extension(content_type: Optional[str], url_path: str) -> str:
"""Guess a safe extension from Content-Type or URL, defaulting to .bin."""
if content_type:
ext = mimetypes.guess_extension(content_type.split(";")[0].strip())
if ext:
return ext
_, ext = os.path.splitext(url_path)
if ext:
return ext.lower()
return ".bin"
def _safe_filename(item: Item, ext: str) -> str:
"""Build a filename for the item's thumbnail."""
base = slugify(item.item_id or f"item-{item.pk}") or f"item-{item.pk}"
return f"{base}{ext}"
def download_and_set_item_thumbnail(
item: Item,
url: str,
force: bool = False,
connect_timeout: float = 5.0,
read_timeout: float = 30.0,
) -> str:
"""
Download an image from url and save it to item.thumbnail_image.
The image is validated with Pillow. The function will not set a new
thumbnail_image if one already exists unless ``force=True``. Filename is
stable per item and inferred from Content-Type or URL with a safe fallback.
Args:
item: The Item instance to modify and save.
url: Absolute URL for the image to download.
force: Overwrite an existing thumbnail if True.
connect_timeout: Requests connect timeout in seconds.
read_timeout: Requests read timeout in seconds.
Returns:
The storage path of the saved image, or a message if skipped.
Raises:
ValueError: If the image is invalid.
requests.RequestException: Network errors during download.
"""
# Lock the row briefly to avoid pointless work if someone else is writing.
with transaction.atomic():
locked = (
Item.objects.select_for_update(of=("self",))
.only("id", "thumbnail_image")
.get(pk=item.pk)
)
if locked.thumbnail_image and not force:
msg = "Thumbnail already exists; skipping (use force=True to overwrite)."
logger.warning(
"download_and_set_item_thumbnail: %s item_pk=%s", msg, item.pk
)
return msg
timeout = (connect_timeout, read_timeout)
logger.info(
"download_and_set_item_thumbnail: downloading url=%s item_pk=%s",
url,
item.pk,
)
with requests.get(url, stream=True, timeout=timeout) as resp:
resp.raise_for_status()
content_type = (resp.headers.get("Content-Type") or "").lower()
buf = io.BytesIO()
for chunk in resp.iter_content(chunk_size=64 * 1024):
if not chunk:
continue
buf.write(chunk)
# Validate image integrity with Pillow.
try:
buf.seek(0)
with Image.open(buf) as img:
img.verify()
except UnidentifiedImageError as exc:
raise ValueError("Downloaded file is not a valid image.") from exc
# Decide file extension. Try header, URL, then Pillow.
url_path = urlparse(url).path
ext = _guess_extension(content_type, url_path)
# If we got a blank or .bin extension we could not infer it from headers
# or URL. Inspect bytes with Pillow, default to jpg.
if ext in (".bin", ""):
try:
buf.seek(0)
with Image.open(buf) as probe:
fmt = (probe.format or "").lower()
ext = {
"jpeg": ".jpg",
"jpg": ".jpg",
"png": ".png",
"gif": ".gif",
"webp": ".webp",
"tiff": ".tif",
"bmp": ".bmp",
}.get(fmt, ".jpg")
finally:
buf.seek(0)
filename = _safe_filename(item, ext)
content = ContentFile(buf.getvalue())
with transaction.atomic():
locked = Item.objects.select_for_update(of=("self",)).get(pk=item.pk)
if locked.thumbnail_image and not force:
msg = (
"Thumbnail already present after download; skipping save. "
"Use force=True to overwrite."
)
logger.warning(
"download_and_set_item_thumbnail: %s item_id=%s", msg, item.pk
)
return msg
locked.thumbnail_image.save(filename, content, save=True)
logger.info(
"download_and_set_item_thumbnail: saved as %s item_id=%s",
locked.thumbnail_image.name,
locked.pk,
)
return locked.thumbnail_image.name
================================================
FILE: importer/tests/README.md
================================================
# Importer Tests
This directory contains tests for the importer application. It has a
combination of Django TestCases (which will create a test database
before running each test), and pyunit tests.
## Pre-requisites
- Regarding Django TestCases, since these tests create a test database, the docker container with the db must be running — for example:
```console
$ docker-compose up -d db
```
- Use the settings module with defaults appropriate for testing:
```console
$ export DJANGO_SETTINGS_MODULE=concordia.settings_test
```
or
```console
$ pipenv run manage.py test --settings=concordia.settings_test
```
## Running the tests
- To run all tests:
```console
$ python manage.py test importer
```
- To run a single unittest module:
```console
$ python manage.py test importer.tests.test_importer
```
- To run a single unittest in a django unittest module:
```console
$ python manage.py test
importer.tests.test_importer.CreateCampaignViewTest.test_create_item_campaign
```
================================================
FILE: importer/tests/__init__.py
================================================
================================================
FILE: importer/tests/test_admin.py
================================================
import uuid
from unittest import mock
from django.contrib import messages
from django.test import RequestFactory, TestCase
from django.utils import timezone
from concordia.models import Campaign
from concordia.tests.utils import create_asset, create_campaign
from importer.admin import (
BatchFilter,
ImportCampaignListFilter,
TaskStatusModelAdmin,
retry_download_task,
)
from importer.models import ImportItemAsset, VerifyAssetImageJob
from .utils import create_import_asset, create_verify_asset_image_job
@mock.patch("importer.admin.download_asset_task.delay", autospec=True)
@mock.patch("importer.admin.messages.add_message", autospec=True)
class ActionTests(TestCase):
def test_retry_download_task(self, messages_mock, task_mock):
import_asset1 = create_import_asset(0)
import_assets = [import_asset1] + [
create_import_asset(i, import_item=import_asset1.import_item)
for i in range(1, 10)
]
import_asset_count = len(import_assets)
import_asset_args = [(import_asset.pk,) for import_asset in import_assets]
modeladmin_mock = mock.MagicMock()
request = RequestFactory().get("/")
retry_download_task(modeladmin_mock, request, ImportItemAsset.objects.all())
args_list = [arg for arg, kwargs in task_mock.call_args_list]
self.assertEqual(task_mock.call_count, import_asset_count)
self.assertEqual(args_list, import_asset_args)
self.assertEqual(messages_mock.call_count, 1)
self.assertEqual(
messages_mock.call_args.args,
(request, messages.INFO, f"Queued {import_asset_count} tasks"),
)
class ImportCampaignListFilterTest(TestCase):
def test_lookups(self):
class TestImportCampaignListFilter(ImportCampaignListFilter):
# We need a subclass because ImportCampaignListFilter itself
# isn't meant to be used directly, and can't be due
# to not having a parameter_name configured
parameter_name = "campaign"
campaigns = [create_campaign(slug=f"test-campaign-{i}") for i in range(5)]
campaigns += [
create_campaign(
slug="test-campaign-completed", status=Campaign.Status.COMPLETED
)
]
retired_campaign = create_campaign(
slug="test-campaign-retired",
title="Retired Campaign",
status=Campaign.Status.RETIRED,
)
philter = TestImportCampaignListFilter(
None, {}, mock.MagicMock(), mock.MagicMock()
)
values_list = philter.lookups(mock.MagicMock(), mock.MagicMock())
self.assertEqual(len(values_list), len(campaigns))
for idx, title in values_list:
self.assertNotEqual(idx, retired_campaign.id)
self.assertNotIn("Retired", title)
@mock.patch("importer.admin.naturaltime")
class TaskStatusModelAdminTest(TestCase):
def test_generate_natural_timestamp_display_property(self, naturaltime_mock):
inner = TaskStatusModelAdmin.generate_natural_timestamp_display_property(
"test_field"
)
obj = mock.MagicMock()
value = inner(obj)
self.assertTrue(naturaltime_mock.called)
naturaltime_mock.reset_mock()
obj = mock.MagicMock(spec=["test_field"])
obj.test_field = None
value = inner(obj)
self.assertEqual(value, None)
self.assertFalse(naturaltime_mock.called)
naturaltime_mock.reset_mock()
# Passing an empty list to spec means there are no
# attributes on the mock, so accessing any attribute
# will raise an AttributeError
obj = mock.MagicMock(spec=[])
value = inner(obj)
self.assertEqual(value, None)
self.assertFalse(naturaltime_mock.called)
class BatchFilterTests(TestCase):
def setUp(self):
self.request = mock.MagicMock()
self.model_admin = mock.MagicMock()
self.filter = BatchFilter(
self.request, {}, VerifyAssetImageJob, self.model_admin
)
self.batch1 = str(uuid.uuid4())
self.batch2 = str(uuid.uuid4())
self.batch3 = str(uuid.uuid4())
self.batch4 = str(uuid.uuid4())
self.batch5 = str(uuid.uuid4())
self.batch6 = str(uuid.uuid4())
asset1 = create_asset()
asset2 = create_asset(item=asset1.item, slug="test-asset-2")
asset3 = create_asset(item=asset1.item, slug="test-asset-3")
create_verify_asset_image_job(asset=asset1, batch=self.batch1, completed=None)
create_verify_asset_image_job(asset=asset2, batch=self.batch2, completed=None)
create_verify_asset_image_job(asset=asset3, batch=self.batch3, completed=None)
create_verify_asset_image_job(asset=asset3, batch=self.batch4, completed=None)
create_verify_asset_image_job(asset=asset3, batch=self.batch5, completed=None)
create_verify_asset_image_job(asset=asset3, batch=self.batch6, completed=None)
@mock.patch("importer.admin.BatchFilter.value", return_value=None)
def test_lookups_incomplete_batches(self, mock_value):
self.model_admin.get_queryset.return_value = VerifyAssetImageJob.objects.all()
lookups = self.filter.lookups(self.request, self.model_admin)
self.assertEqual(len(lookups), 5)
@mock.patch("importer.admin.BatchFilter.value", return_value=None)
def test_lookups_includes_current_batch(self, mock_value):
mock_value.return_value = self.batch2
self.model_admin.get_queryset.return_value = VerifyAssetImageJob.objects.all()
lookups = self.filter.lookups(self.request, self.model_admin)
batch_ids = [batch[0] for batch in lookups]
self.assertIn(self.batch2, batch_ids)
@mock.patch("importer.admin.BatchFilter.value", return_value=None)
def test_lookups_includes_recent_completed_batch(self, mock_value):
VerifyAssetImageJob.objects.filter(batch=self.batch6).update(
completed=timezone.now()
)
self.model_admin.get_queryset.return_value = VerifyAssetImageJob.objects.all()
lookups = self.filter.lookups(self.request, self.model_admin)
batch_ids = [batch[0] for batch in lookups]
self.assertIn(self.batch6, batch_ids)
@mock.patch("importer.admin.BatchFilter.value", return_value=None)
def test_lookups_fills_with_completed_batches(self, mock_value):
batch_list = [self.batch1, self.batch2, self.batch3, self.batch4, self.batch5]
VerifyAssetImageJob.objects.filter(batch__in=batch_list).update(
completed=timezone.now()
)
self.model_admin.get_queryset.return_value = VerifyAssetImageJob.objects.all()
lookups = self.filter.lookups(self.request, self.model_admin)
self.assertEqual(len(lookups), 5)
@mock.patch("importer.admin.BatchFilter.value", return_value=None)
def test_queryset_filters_correctly(self, mock_value):
mock_value.return_value = self.batch1
queryset = self.filter.queryset(self.request, VerifyAssetImageJob.objects.all())
batch_ids = queryset.values_list("batch", flat=True)
self.assertTrue(all(str(batch) == self.batch1 for batch in batch_ids))
@mock.patch("importer.admin.BatchFilter.value", return_value=None)
def test_queryset_returns_all_when_no_batch_selected(self, mock_value):
mock_value.return_value = None
queryset = self.filter.queryset(self.request, VerifyAssetImageJob.objects.all())
self.assertEqual(queryset.count(), VerifyAssetImageJob.objects.count())
================================================
FILE: importer/tests/test_celery.py
================================================
import tempfile
from types import SimpleNamespace
from unittest import mock
from django.test import TestCase
import importer.celery as celery_mod
from importer.celery import import_all_submodules
class ImporterCeleryTests(TestCase):
def test_returns_early_for_non_package(self):
mock_pkg = SimpleNamespace(__name__="not_a_pkg") # no __path__
with (
mock.patch.object(
celery_mod.importlib, "import_module", return_value=mock_pkg
) as mock_import,
mock.patch.object(celery_mod.pkgutil, "walk_packages") as mock_walk,
):
import_all_submodules("not_a_pkg")
mock_import.assert_called_once_with("not_a_pkg")
mock_walk.assert_not_called()
def test_imports_all_submodules_for_package(self):
sub1 = SimpleNamespace(name="dummy_pkg.sub1")
sub2 = SimpleNamespace(name="dummy_pkg.sub2")
with tempfile.TemporaryDirectory() as td:
mock_pkg = SimpleNamespace(__name__="dummy_pkg", __path__=[td])
with (
mock.patch.object(celery_mod.importlib, "import_module") as mock_import,
mock.patch.object(
celery_mod.pkgutil, "walk_packages", return_value=[sub1, sub2]
) as mock_walk,
):
def side_effect(name):
if name == "dummy_pkg":
return mock_pkg
return SimpleNamespace(__name__=name)
mock_import.side_effect = side_effect
import_all_submodules("dummy_pkg")
mock_walk.assert_called_once()
args, _kwargs = mock_walk.call_args
self.assertEqual(args[0], mock_pkg.__path__)
self.assertEqual(args[1], mock_pkg.__name__ + ".")
self.assertIn(mock.call("dummy_pkg"), mock_import.mock_calls)
self.assertIn(mock.call("dummy_pkg.sub1"), mock_import.mock_calls)
self.assertIn(mock.call("dummy_pkg.sub2"), mock_import.mock_calls)
def test_package_with_no_submodules(self):
with tempfile.TemporaryDirectory() as td:
mock_pkg = SimpleNamespace(__name__="empty_pkg", __path__=[td])
with (
mock.patch.object(celery_mod.importlib, "import_module") as mock_import,
mock.patch.object(
celery_mod.pkgutil, "walk_packages", return_value=[]
) as mock_walk,
):
mock_import.side_effect = lambda name: (
mock_pkg if name == "empty_pkg" else SimpleNamespace(__name__=name)
)
import_all_submodules("empty_pkg")
mock_walk.assert_called_once()
mock_import.assert_called_once_with("empty_pkg")
def test__load_all_task_modules_invokes_imports(self):
with mock.patch.object(celery_mod, "import_all_submodules") as mock_import_all:
celery_mod._load_all_task_modules(sender=celery_mod.app)
mock_import_all.assert_has_calls(
[
mock.call("concordia.tasks"),
mock.call("importer.tasks"),
],
any_order=False,
)
def test_on_after_finalize_signal_triggers_handler(self):
with mock.patch.object(celery_mod, "import_all_submodules") as mock_import_all:
celery_mod.app.on_after_finalize.send(sender=celery_mod.app)
mock_import_all.assert_has_calls(
[mock.call("concordia.tasks"), mock.call("importer.tasks")],
any_order=False,
)
self.assertEqual(mock_import_all.call_count, 2)
================================================
FILE: importer/tests/test_models.py
================================================
import uuid
from django.test import TestCase
from django.urls import reverse
from django.utils import timezone
from concordia.tests.utils import CreateTestUsers, create_asset, create_project
from importer.models import TaskStatusModel
from .utils import (
create_download_asset_image_job,
create_import_asset,
create_import_item,
create_import_job,
create_verify_asset_image_job,
)
class ImportJobTests(TestCase, CreateTestUsers):
def test_str(self):
user = self.create_test_user()
project = create_project()
url = "http://example.com"
job = create_import_job(project=project)
self.assertEqual(
str(job), f"ImportJob(created_by=None, project={project.title}, url=)"
)
job.created_by = user
job.url = url
self.assertEqual(
str(job),
f"ImportJob(created_by={user.username}, "
f"project={project.title}, url={url})",
)
def test_retry_if_possible(self):
# This method is just a placeholder for this model,
# so we're just testing to make sure it doesn't error
# and returns False, since any other value will cause issues
job = create_import_job()
self.assertFalse(job.retry_if_possible())
def test_update_failure_history(self):
job = create_import_job()
job.failed = timezone.now()
job.failure_reason = TaskStatusModel.FailureReason.IMAGE
job.status = "Test failure status"
job.failure_history = []
job.save()
job.update_failure_history()
failure_history = job.failure_history
self.assertEqual(len(failure_history), 1)
self.assertNotEqual(failure_history[0]["failed"], "")
self.assertEqual(
failure_history[0]["failure_reason"], TaskStatusModel.FailureReason.IMAGE
)
self.assertEqual(failure_history[0]["status"], "Test failure status")
class ImportItemTests(TestCase, CreateTestUsers):
def test_str(self):
job = create_import_job()
url = "http://example.com"
item = create_import_item(import_job=job)
self.assertEqual(str(item), f"ImportItem(job={job}, url=)")
item.url = url
self.assertEqual(str(item), f"ImportItem(job={job}, url={url})")
def test_retry_if_possible(self):
# This method is just a placeholder for this model,
# so we're just testing to make sure it doesn't error
# and returns False, since any other value will cause issues
item = create_import_item()
self.assertFalse(item.retry_if_possible())
class ImportItemAssetTests(TestCase, CreateTestUsers):
def test_str(self):
item = create_import_item()
url = "http://example.com"
asset = create_import_asset(import_item=item)
self.assertEqual(str(asset), f"ImportItemAsset(import_item={item}, url=)")
asset.url = url
self.assertEqual(str(asset), f"ImportItemAsset(import_item={item}, url={url})")
class VerifyAssetImageJobTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.batch_id = uuid.uuid4()
self.job = create_verify_asset_image_job(asset=self.asset, batch=self.batch_id)
def test_str_representation(self):
self.assertEqual(str(self.job), f"VerifyAssetImageJob for {self.asset}")
def test_batch_admin_url(self):
expected_url = (
reverse("admin:importer_verifyassetimagejob_changelist")
+ f"?batch={self.batch_id}"
)
self.assertEqual(self.job.batch_admin_url, expected_url)
def test_get_batch_admin_url(self):
expected_url = (
reverse("admin:importer_verifyassetimagejob_changelist")
+ f"?batch={self.batch_id}"
)
url = self.job.__class__.get_batch_admin_url(self.batch_id)
self.assertEqual(url, expected_url)
def test_get_batch_admin_url_error(self):
with self.assertRaises(ValueError):
self.job.__class__.get_batch_admin_url("")
def test_update_failure_history(self):
self.job.failed = timezone.now()
self.job.failure_reason = "Image"
self.job.status = "Failed due to image error"
self.job.update_failure_history()
self.assertEqual(len(self.job.failure_history), 1)
self.assertEqual(self.job.failure_history[0]["failure_reason"], "Image")
def test_update_status(self):
self.job.update_status("Processing")
self.assertEqual(self.job.status, "Processing")
self.assertEqual(len(self.job.status_history), 1)
self.assertEqual(self.job.status_history[0]["status"], "")
def test_reset_for_retry(self):
self.job.failed = timezone.now()
self.assertTrue(self.job.reset_for_retry())
self.assertIsNone(self.job.failed)
self.assertEqual(self.job.retry_count, 1)
def test_reset_for_retry_when_not_failed(self):
self.assertFalse(self.job.reset_for_retry())
self.assertEqual(
self.job.status,
"Task was not marked as failed, so it will not be reset for retrying.",
)
class DownloadAssetImageJobTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.batch_id = uuid.uuid4()
self.job = create_download_asset_image_job(
asset=self.asset, batch=self.batch_id
)
def test_str_representation(self):
self.assertEqual(str(self.job), f"DownloadAssetImageJob for {self.asset}")
def test_batch_admin_url(self):
expected_url = (
reverse("admin:importer_downloadassetimagejob_changelist")
+ f"?batch={self.batch_id}"
)
self.assertEqual(self.job.batch_admin_url, expected_url)
def test_get_batch_admin_url(self):
expected_url = (
reverse("admin:importer_downloadassetimagejob_changelist")
+ f"?batch={self.batch_id}"
)
url = self.job.__class__.get_batch_admin_url(self.batch_id)
self.assertEqual(url, expected_url)
def test_get_batch_admin_url_error(self):
with self.assertRaises(ValueError):
self.job.__class__.get_batch_admin_url("")
def test_update_failure_history(self):
self.job.failed = timezone.now()
self.job.failure_reason = "Image"
self.job.status = "Failed due to image error"
self.job.update_failure_history()
self.assertEqual(len(self.job.failure_history), 1)
self.assertEqual(self.job.failure_history[0]["failure_reason"], "Image")
def test_update_status(self):
self.job.update_status("Processing")
self.assertEqual(self.job.status, "Processing")
self.assertEqual(len(self.job.status_history), 1)
self.assertEqual(self.job.status_history[0]["status"], "")
def test_reset_for_retry(self):
self.job.failed = timezone.now()
self.assertTrue(self.job.reset_for_retry())
self.assertIsNone(self.job.failed)
self.assertEqual(self.job.retry_count, 1)
def test_reset_for_retry_when_not_failed(self):
self.assertFalse(self.job.reset_for_retry())
self.assertEqual(
self.job.status,
"Task was not marked as failed, so it will not be reset for retrying.",
)
================================================
FILE: importer/tests/test_tasks_assets.py
================================================
import uuid
from unittest import mock
import requests
from django.core.cache import caches
from django.db.models import Max
from django.test import TestCase, override_settings
from django.utils import timezone
from PIL import UnidentifiedImageError
from concordia.models import Asset
from concordia.tests.utils import create_asset
from configuration.models import Configuration
from importer import exceptions, tasks
from importer.models import (
DownloadAssetImageJob,
ImportItemAsset,
TaskStatusModel,
VerifyAssetImageJob,
)
from .utils import (
create_download_asset_image_job,
create_import_asset,
create_verify_asset_image_job,
)
class RedownloadImageTaskTests(TestCase):
@mock.patch("importer.tasks.images.download_asset")
def test_redownload_image_task(self, mock_download):
tasks.images.redownload_image_task(create_asset().pk)
self.assertTrue(mock_download.called)
class AssetImportTests(TestCase):
def setUp(self):
for cache in caches.all():
cache.clear()
self.import_asset = create_import_asset(url="http://example.com")
self.asset = self.import_asset.asset
self.job = create_download_asset_image_job(asset=self.asset)
# It's difficult/impossible to cleanly mock a decorator due to the way
# they're applied when the decorated object/function is evaluated on
# import, so we unfortunately have to handle the update_task_status
# decorator, so we need a mock object that can pass for a Celery task
# object so update_task_status doesn't error during the test
self.task_mock = mock.MagicMock()
self.task_mock.request.id = "f81d4fae-7dec-11d0-a765-00a0c91e6bf6"
self.get_return_value = [b"chunk1", b"chunk2"]
self.valid_hash = "097c42989a9e5d9dcced7b35ec4b0486"
self.invalid_hash = "bad-hash"
self.filename = self.asset.get_asset_image_filename()
self.head_object_mock = mock.MagicMock()
self.s3_client_mock = mock.MagicMock()
self.s3_client_mock.head_object = self.head_object_mock
def tearDown(self):
for cache in caches.all():
cache.clear()
def test_get_asset_urls_from_item_resources_empty(self):
self.assertEqual(tasks.items.get_asset_urls_from_item_resources([]), ([], ""))
def test_get_asset_urls_from_item_resources_url_only(self):
results = tasks.items.get_asset_urls_from_item_resources(
[{"url": "http://example.com"}]
)
self.assertEqual(results, ([], "http://example.com"))
def test_get_asset_urls_from_item_resources_valid(self):
results = tasks.items.get_asset_urls_from_item_resources(
[
{
"url": "http://example.com",
"files": [
[
{
"url": "http://example.com/1.jpg",
"height": 1,
"width": 1,
"mimetype": "image/jpeg",
},
{"url": "http://example.com/2.jpg"},
{
"url": "http://example.com/3.jpg",
"height": 2,
"width": 2,
"mimetype": "image/jpeg",
},
{
"url": "http://example.com/4.jpg",
"height": 100,
"width": 100,
"mimetype": "image/gif",
},
]
],
}
]
)
self.assertEqual(results, (["http://example.com/3.jpg"], "http://example.com"))
def test_get_asset_urls_from_item_resource_no_valid(self):
results = tasks.items.get_asset_urls_from_item_resources(
[
{
"url": "http://example.com",
"files": [
[
{
"url": "http://example.com/1.jpg",
"height": 1,
"width": 1,
"mimetype": "file/pdf",
},
{"url": "http://example.com/2.jpg"},
{
"url": "http://example.com/3.jpg",
"height": 2,
"width": 2,
"mimetype": "video/mov",
},
{
"url": "http://example.com/4.jpg",
"height": 100,
"width": 100,
"mimetype": "image/tiff",
},
]
],
}
]
)
self.assertEqual(results, ([], "http://example.com"))
def test_get_asset_urls_from_item_resource_no_jpgs(self):
results = tasks.items.get_asset_urls_from_item_resources(
[
{
"url": "http://example.com",
"files": [
[
{
"url": "http://example.com/1.jpg",
"height": 1,
"width": 1,
"mimetype": "file/pdf",
},
{"url": "http://example.com/2.jpg"},
{
"url": "http://example.com/3.gif",
"height": 2,
"width": 2,
"mimetype": "image/gif",
},
{
"url": "http://example.com/4.gif",
"height": 100,
"width": 100,
"mimetype": "image/gif",
},
]
],
}
]
)
self.assertEqual(results, (["http://example.com/4.gif"], "http://example.com"))
def test_download_asset_task(self):
with mock.patch("importer.tasks.assets.download_asset") as task_mock:
tasks.assets.download_asset_task(self.import_asset.pk)
self.assertTrue(task_mock.called)
task, called_import_asset = task_mock.call_args.args
self.assertTrue(called_import_asset, self.import_asset)
# Test sending a bad pk
task_mock.reset_mock()
max_pk = ImportItemAsset.objects.aggregate(Max("pk"))["pk__max"]
with self.assertRaises(ImportItemAsset.DoesNotExist):
tasks.assets.download_asset_task(max_pk + 1)
self.assertFalse(task_mock.called)
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_valid(self):
with (
mock.patch("importer.tasks.assets.requests.get") as get_mock,
mock.patch("importer.tasks.assets.boto3.client") as boto_mock,
mock.patch("importer.tasks.assets.flag_enabled") as flag_mock,
):
get_mock.return_value.iter_content.return_value = self.get_return_value
boto_mock.return_value = self.s3_client_mock
flag_mock.return_value = True
self.head_object_mock.return_value = {"ETag": f'"{self.valid_hash}"'}
tasks.assets.download_asset(self.task_mock, self.import_asset)
self.assertEqual(get_mock.call_args[0], ("http://example.com",))
self.assertTrue(get_mock.call_args[1]["stream"])
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_valid_checksum_fail(self):
with (
mock.patch("importer.tasks.assets.requests.get") as get_mock,
mock.patch("importer.tasks.assets.boto3.client") as boto_mock,
mock.patch("importer.tasks.assets.flag_enabled") as flag_mock,
):
get_mock.return_value.iter_content.return_value = self.get_return_value
boto_mock.return_value = self.s3_client_mock
flag_mock.return_value = True
self.head_object_mock.return_value = {"ETag": f'"{self.invalid_hash}"'}
with self.assertRaises(Exception) as assertion:
tasks.assets.download_asset(self.task_mock, self.import_asset)
self.assertEqual(
str(assertion.exception),
f"ETag {self.invalid_hash} for {self.filename} did not match "
f"calculated md5 hash {self.valid_hash}",
)
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_valid_checksum_fail_without_flag(self):
with (
mock.patch("importer.tasks.assets.requests.get") as get_mock,
mock.patch("importer.tasks.assets.boto3.client") as boto_mock,
self.assertLogs("importer.tasks", level="WARN") as log,
):
get_mock.return_value.iter_content.return_value = self.get_return_value
boto_mock.return_value = self.s3_client_mock
self.head_object_mock.return_value = {"ETag": f'"{self.invalid_hash}"'}
tasks.assets.download_asset(self.task_mock, self.import_asset)
self.assertEqual(
log.output[0],
f"WARNING:importer.tasks.assets:ETag ({self.invalid_hash}) for "
f"{self.filename} did not match calculated md5 hash "
f"({self.valid_hash}) but the IMPORT_IMAGE_CHECKSUM flag is disabled",
)
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_invalid(self):
with (
mock.patch("importer.tasks.assets.requests.get") as get_mock,
self.assertLogs("importer.tasks", level="ERROR") as log,
):
get_mock.return_value.raise_for_status.side_effect = AttributeError
with self.assertRaises(exceptions.ImageImportFailure):
tasks.assets.download_asset(self.task_mock, self.import_asset)
# Since the logging includes a stacktrace, we just check the
# beginning of the log entry with assertIn
self.assertIn(
"ERROR:importer.tasks.assets:"
"Unable to download http://example.com to "
"test-campaign/test-project/testitem.0123456789/1.jpg",
log.output[0],
)
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_retry_success(self):
import_asset = self.import_asset
import_asset.failed = timezone.now()
import_asset.completed = None
import_asset.failure_reason = TaskStatusModel.FailureReason.IMAGE
import_asset.status = "Test failed status"
import_asset.retry_count = 0
import_asset.failure_history = []
import_asset.save()
with mock.patch(
"importer.models.tasks.assets.download_asset_task"
) as task_mock:
response = import_asset.retry_if_possible()
self.assertNotEqual(response, False)
self.assertTrue(task_mock.apply_async.called)
self.assertEqual(len(import_asset.failure_history), 1)
self.assertEqual(import_asset.failed, None)
self.assertEqual(import_asset.retry_count, 1)
self.assertEqual(import_asset.failure_reason, "")
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_retry_maximum_exceeded(self):
try:
config = Configuration.objects.get(key="asset_image_import_max_retries")
config.value = "1"
config.data_type = Configuration.DataType.NUMBER
config.save()
except Configuration.DoesNotExist:
Configuration.objects.create(
key="asset_image_import_max_retries",
value="1",
data_type=Configuration.DataType.NUMBER,
)
import_asset = self.import_asset
import_asset.failed = timezone.now()
import_asset.completed = None
import_asset.failure_reason = TaskStatusModel.FailureReason.IMAGE
import_asset.status = "Test failed status"
import_asset.retry_count = 1
import_asset.failure_history = []
import_asset.save()
with mock.patch(
"importer.models.tasks.assets.download_asset_task"
) as task_mock:
response = import_asset.retry_if_possible()
self.assertFalse(response)
self.assertFalse(task_mock.apply_async.called)
self.assertEqual(len(import_asset.failure_history), 1)
self.assertNotEqual(import_asset.failed, None)
self.assertEqual(
import_asset.status,
"Maximum number of retries reached while retrying image download "
"for asset. The failure reason before retrying was Image and the "
"status was Test failed status",
)
self.assertEqual(import_asset.retry_count, 1)
self.assertEqual(
import_asset.failure_reason, TaskStatusModel.FailureReason.RETRIES
)
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_retry_cant_reset(self):
import_asset = self.import_asset
import_asset.completed = None
import_asset.failure_reason = TaskStatusModel.FailureReason.IMAGE
import_asset.status = "Test failed status"
import_asset.retry_count = 0
import_asset.failure_history = []
import_asset.save()
with mock.patch(
"importer.models.tasks.assets.download_asset_task"
) as task_mock:
response = import_asset.retry_if_possible()
self.assertFalse(response)
self.assertFalse(task_mock.apply_async.called)
self.assertNotEqual(import_asset.status, "Test failed status")
self.assertEqual(len(import_asset.failure_history), 0)
self.assertEqual(import_asset.failed, None)
self.assertEqual(import_asset.retry_count, 0)
self.assertEqual(
import_asset.failure_reason, TaskStatusModel.FailureReason.IMAGE
)
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_retry_invalid_failure_reason(self):
import_asset = self.import_asset
import_asset.failed = timezone.now()
import_asset.completed = None
import_asset.failure_reason = ""
import_asset.status = "Test failed status"
import_asset.retry_count = 0
import_asset.failure_history = []
import_asset.save()
with mock.patch(
"importer.models.tasks.assets.download_asset_task"
) as task_mock:
response = import_asset.retry_if_possible()
self.assertFalse(response)
self.assertFalse(task_mock.apply_async.called)
self.assertEqual(import_asset.status, "Test failed status")
self.assertEqual(len(import_asset.failure_history), 0)
self.assertNotEqual(import_asset.failed, None)
self.assertEqual(import_asset.retry_count, 0)
self.assertEqual(import_asset.failure_reason, "")
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
def test_download_asset_manual_retry_success(self):
# This mimics an admin manually retrying the task, rather than
# the automatic retry system (such as through an admin action).
# We want to be sure the failure information is correctly reset.
import_asset = self.import_asset
import_asset.failed = timezone.now()
import_asset.completed = None
import_asset.failure_reason = ""
import_asset.status = "Test failed status"
import_asset.retry_count = 0
import_asset.failure_history = []
import_asset.save()
with mock.patch(
"importer.models.tasks.assets.download_and_store_asset_image"
) as download_mock:
download_mock.return_value = "image.jpg"
tasks.assets.download_asset_task.delay(import_asset.pk)
import_asset.refresh_from_db()
self.assertTrue(download_mock.called)
self.assertEqual(import_asset.status, "Completed")
self.assertEqual(len(import_asset.failure_history), 0)
self.assertEqual(import_asset.failed, None)
self.assertEqual(import_asset.retry_count, 0)
self.assertEqual(import_asset.failure_reason, "")
@mock.patch("importer.tasks.assets.download_and_store_asset_image")
@mock.patch("importer.tasks.assets.logger.info")
def test_download_url_from_asset(self, mock_logger, mock_download):
self.asset.download_url = "https://example.com/image.png"
self.asset.save()
self.job.refresh_from_db()
mock_download.return_value = "stored_image.png"
tasks.assets.download_asset(self.task_mock, self.job)
mock_download.assert_called_once_with(self.asset.download_url, mock.ANY)
self.asset.refresh_from_db()
self.assertEqual(self.asset.storage_image, "stored_image.png")
mock_logger.assert_any_call(
"Download and storage of asset image %s complete. Setting storage_image "
"on asset %s (%s)",
"stored_image.png",
self.asset,
self.asset.id,
)
@mock.patch("importer.tasks.assets.download_and_store_asset_image")
@mock.patch("importer.tasks.assets.logger.info")
def test_valid_file_extension(self, mock_logger, mock_download):
self.asset.download_url = "https://example.com/image.png"
self.asset.save()
self.job.refresh_from_db()
mock_download.return_value = "stored_image.png"
tasks.assets.download_asset(self.task_mock, self.job)
asset_image_filename = self.asset.get_asset_image_filename("png")
mock_download.assert_called_once_with(
self.asset.download_url, asset_image_filename
)
self.asset.refresh_from_db()
self.assertEqual(self.asset.storage_image, "stored_image.png")
mock_logger.assert_any_call(
"Download and storage of asset image %s complete. Setting storage_image "
"on asset %s (%s)",
"stored_image.png",
self.asset,
self.asset.id,
)
class BatchVerifyAssetImagesTaskCallbackTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.concurrency = 5
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_no_failures_detected_no_failures_in_results(self, mock_task):
results = [True, True, True]
tasks.images.batch_verify_asset_images_task_callback(
results, self.batch_id, self.concurrency, False
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency, False)
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_no_failures_detected_some_failures_in_results(self, mock_task):
results = [True, False, True]
with self.assertLogs("importer.tasks", level="INFO") as log:
tasks.images.batch_verify_asset_images_task_callback(
results, self.batch_id, self.concurrency, False
)
self.assertIn(
"INFO:importer.tasks.images:At least one verification "
f"failure detected for batch {self.batch_id}",
log.output,
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency, True)
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_failures_already_detected(self, mock_task):
results = [True, False, True]
tasks.images.batch_verify_asset_images_task_callback(
results, self.batch_id, self.concurrency, True
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency, True)
class BatchVerifyAssetImagesTaskTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.concurrency = 2
asset1 = create_asset()
asset2 = create_asset(item=asset1.item, slug="test-asset-2")
self.job1 = create_verify_asset_image_job(batch=self.batch_id, asset=asset1)
self.job2 = create_verify_asset_image_job(batch=self.batch_id, asset=asset2)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.batch_download_asset_images_task")
def test_no_jobs_remaining_with_failures(self, mock_batch_download, mock_logger):
VerifyAssetImageJob.objects.all().delete()
tasks.images.batch_verify_asset_images_task(
self.batch_id, self.concurrency, True
)
mock_logger.assert_any_call(
"Failures in VerifyAssetImageJobs in batch %s detected, so starting "
"DownloadAssetImageJob batch",
self.batch_id,
)
mock_batch_download.assert_called_once_with(self.batch_id, self.concurrency)
@mock.patch("importer.tasks.images.logger.info")
def test_no_jobs_remaining_no_failures(self, mock_logger):
VerifyAssetImageJob.objects.all().delete()
tasks.images.batch_verify_asset_images_task(
self.batch_id, self.concurrency, False
)
mock_logger.assert_any_call(
"No failures in VerifyAssetImageJob batch %s. Ending task.", self.batch_id
)
@mock.patch("importer.tasks.images.chord")
@mock.patch("importer.tasks.images.verify_asset_image_task.s")
def test_jobs_remaining(self, mock_task_s, mock_chord):
tasks.images.batch_verify_asset_images_task(
self.batch_id, self.concurrency, False
)
self.assertEqual(mock_task_s.call_count, 2)
mock_chord.assert_called()
class VerifyAssetImageTaskTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.batch_id = uuid.uuid4()
@mock.patch("importer.tasks.images.logger.exception")
def test_asset_not_found(self, mock_logger):
with self.assertRaises(Asset.DoesNotExist):
tasks.images.verify_asset_image_task(999)
mock_logger.assert_called()
@mock.patch("importer.tasks.images.logger.exception")
def test_verify_job_not_found(self, mock_logger):
with self.assertRaises(VerifyAssetImageJob.DoesNotExist):
tasks.images.verify_asset_image_task(
self.asset.pk, self.batch_id, create_job=False
)
mock_logger.assert_called()
@mock.patch("importer.tasks.images.verify_asset_image")
def test_verify_asset_image_task_success(self, mock_verify):
job = create_verify_asset_image_job(asset=self.asset, batch=self.batch_id)
mock_verify.return_value = True
result = tasks.images.verify_asset_image_task(self.asset.pk, self.batch_id)
self.assertTrue(result)
job.refresh_from_db()
self.assertEqual(job.status, "Storage image verified")
@mock.patch("importer.tasks.images.verify_asset_image")
def test_verify_asset_image_task_failure(self, mock_verify):
job = create_verify_asset_image_job(asset=self.asset, batch=self.batch_id)
mock_verify.return_value = False
result = tasks.images.verify_asset_image_task(self.asset.pk, self.batch_id)
self.assertFalse(result)
job.refresh_from_db()
self.assertNotEqual(job.status, "Storage image verified")
@mock.patch("importer.tasks.images.verify_asset_image")
def test_create_verify_asset_image_job(self, mock_verify):
mock_verify.return_value = True
result = tasks.images.verify_asset_image_task(
self.asset.pk, self.batch_id, create_job=True
)
self.assertTrue(result)
self.assertTrue(
VerifyAssetImageJob.objects.filter(
asset=self.asset, batch=self.batch_id
).exists()
)
@mock.patch("importer.tasks.images.verify_asset_image")
def test_http_error_retries(self, mock_verify):
create_verify_asset_image_job(asset=self.asset, batch=self.batch_id)
mock_verify.side_effect = requests.exceptions.HTTPError("HTTP Error Occurred")
with self.assertRaises(requests.exceptions.HTTPError):
tasks.images.verify_asset_image_task(self.asset.pk, self.batch_id)
class CreateDownloadAssetImageJobTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.batch_id = uuid.uuid4()
def test_create_new_job(self):
tasks.images.create_download_asset_image_job(self.asset, self.batch_id)
self.assertTrue(
DownloadAssetImageJob.objects.filter(
asset=self.asset, batch=self.batch_id
).exists()
)
def test_existing_uncompleted_job_not_duplicated(self):
create_download_asset_image_job(asset=self.asset, batch=self.batch_id)
tasks.images.create_download_asset_image_job(self.asset, self.batch_id)
job_count = DownloadAssetImageJob.objects.filter(
asset=self.asset, batch=self.batch_id
).count()
self.assertEqual(job_count, 1)
def test_create_new_job_if_previous_failed(self):
failed_job = create_download_asset_image_job(
asset=self.asset, batch=self.batch_id
)
failed_job.failed = timezone.now()
failed_job.save()
new_batch = uuid.uuid4()
tasks.images.create_download_asset_image_job(self.asset, new_batch)
job_count = DownloadAssetImageJob.objects.filter(asset=self.asset).count()
self.assertEqual(job_count, 2)
class VerifyAssetImageTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.job = create_verify_asset_image_job(asset=self.asset)
self.mock_task = mock.MagicMock()
self.mock_task.request.id = uuid.uuid4()
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.create_download_asset_image_job")
def test_no_storage_image(self, mock_create_job, mock_logger):
# Use update in order to avoid the validation of storage_image, since this is
# an invalid value, but we have to account for it
Asset.objects.filter(id=self.asset.id).update(storage_image="")
# We need to update the job from the database to get rid of the cached asset
self.job.refresh_from_db()
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertFalse(result)
mock_create_job.assert_called_once_with(self.asset, self.job.batch)
mock_logger.assert_any_call(
f"No storage image set on {self.asset} ({self.asset.id})"
)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.create_download_asset_image_job")
@mock.patch("importer.tasks.images.ASSET_STORAGE.exists", return_value=False)
def test_storage_image_missing(self, mock_exists, mock_create_job, mock_logger):
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertFalse(result)
mock_create_job.assert_called_once_with(self.asset, self.job.batch)
mock_logger.assert_any_call(
f"Storage image for {self.asset} ({self.asset.id}) missing from storage"
)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.create_download_asset_image_job")
@mock.patch("importer.tasks.images.ASSET_STORAGE.exists", return_value=True)
@mock.patch("importer.tasks.images.ASSET_STORAGE.open")
@mock.patch(
"importer.tasks.images.Image.open",
side_effect=UnidentifiedImageError("Invalid image format"),
)
def test_storage_image_invalid(
self, mock_image_open, mock_open, mock_exists, mock_create_job, mock_logger
):
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertFalse(result)
mock_create_job.assert_called_once_with(self.asset, self.job.batch)
mock_logger.assert_any_call(
f"Storage image for {self.asset} ({self.asset.id}), "
f"{self.asset.storage_image.name}, is corrupt. The exception "
"raised was Type: UnidentifiedImageError, Message: Invalid image format"
)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.create_download_asset_image_job")
@mock.patch("importer.tasks.images.ASSET_STORAGE.exists", return_value=True)
@mock.patch("importer.tasks.images.ASSET_STORAGE.open")
@mock.patch("importer.tasks.images.Image.open")
def test_storage_image_verify_fail(
self, mock_image_open, mock_open, mock_exists, mock_create_job, mock_logger
):
mock_image = mock.MagicMock()
mock_image.verify.side_effect = UnidentifiedImageError("Invalid image format")
mock_image_open.return_value.__enter__.return_value = mock_image
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertFalse(result)
mock_create_job.assert_called_once_with(self.asset, self.job.batch)
mock_logger.assert_any_call(
f"Storage image for {self.asset} ({self.asset.id}), "
f"{self.asset.storage_image.name}, is corrupt. The exception "
"raised was Type: UnidentifiedImageError, Message: Invalid image format"
)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.ASSET_STORAGE.exists", return_value=True)
@mock.patch("importer.tasks.images.ASSET_STORAGE.open")
@mock.patch("importer.tasks.images.Image.open")
def test_storage_image_verification_success(
self, mock_image_open, mock_open, mock_exists, mock_logger
):
mock_image = mock.MagicMock()
mock_image.verify.return_value = None
mock_image_open.return_value.__enter__.return_value = mock_image
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertTrue(result)
mock_logger.assert_any_call(
"Storage image for %s (%s), %s, verified successfully",
self.asset,
self.asset.id,
self.asset.storage_image.name,
)
class BatchDownloadAssetImagesTaskCallbackTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.concurrency = 5
@mock.patch("importer.tasks.images.batch_download_asset_images_task.delay")
def test_callback_triggers_next_batch(self, mock_task):
results = [True, False, True]
tasks.images.batch_download_asset_images_task_callback(
results, self.batch_id, self.concurrency
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency)
@mock.patch("importer.tasks.images.batch_download_asset_images_task.delay")
def test_callback_with_no_results(self, mock_task):
results = []
tasks.images.batch_download_asset_images_task_callback(
results, self.batch_id, self.concurrency
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency)
@mock.patch("importer.tasks.images.batch_download_asset_images_task.delay")
def test_callback_with_all_successful_results(self, mock_task):
results = [True, True, True]
tasks.images.batch_download_asset_images_task_callback(
results, self.batch_id, self.concurrency
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency)
class BatchDownloadAssetImagesTaskTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.concurrency = 3
asset1 = create_asset()
asset2 = create_asset(item=asset1.item, slug="test-asset-2")
asset3 = create_asset(item=asset1.item, slug="test-asset-3")
self.job1 = create_download_asset_image_job(batch=self.batch_id, asset=asset1)
self.job2 = create_download_asset_image_job(batch=self.batch_id, asset=asset2)
self.job3 = create_download_asset_image_job(batch=self.batch_id, asset=asset3)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.chord")
@mock.patch("importer.tasks.images.download_asset_image_task.s")
def test_jobs_remaining(self, mock_task_s, mock_chord, mock_logger):
tasks.images.batch_download_asset_images_task(self.batch_id, self.concurrency)
self.assertEqual(mock_task_s.call_count, 3)
mock_chord.assert_called()
mock_logger.assert_any_call(
"Processing next %s DownloadAssetImageJobs for batch %s",
self.concurrency,
self.batch_id,
)
@mock.patch("importer.tasks.images.logger.info")
def test_no_jobs_remaining(self, mock_logger):
DownloadAssetImageJob.objects.all().delete()
tasks.images.batch_download_asset_images_task(self.batch_id, self.concurrency)
mock_logger.assert_any_call(
"No DownloadAssetImageJobs found for batch %s", self.batch_id
)
class DownloadAssetImageTaskTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.batch_id = uuid.uuid4()
@mock.patch("importer.tasks.images.logger.exception")
def test_asset_not_found(self, mock_logger):
with self.assertRaises(Asset.DoesNotExist):
tasks.images.download_asset_image_task(999)
mock_logger.assert_called()
@mock.patch("importer.tasks.images.logger.exception")
def test_download_job_not_found(self, mock_logger):
with self.assertRaises(DownloadAssetImageJob.DoesNotExist):
tasks.images.download_asset_image_task(
self.asset.pk, self.batch_id, create_job=False
)
mock_logger.assert_called()
@mock.patch("importer.tasks.images.download_asset")
def test_download_asset_image_task_success(self, mock_download):
create_download_asset_image_job(asset=self.asset, batch=self.batch_id)
mock_download.return_value = "Download successful"
result = tasks.images.download_asset_image_task(self.asset.pk, self.batch_id)
self.assertEqual(result, "Download successful")
@mock.patch("importer.tasks.images.download_asset")
def test_create_download_asset_image_job(self, mock_download):
mock_download.return_value = "Download successful"
result = tasks.images.download_asset_image_task(
self.asset.pk, self.batch_id, create_job=True
)
self.assertEqual(result, "Download successful")
self.assertTrue(
DownloadAssetImageJob.objects.filter(
asset=self.asset, batch=self.batch_id
).exists()
)
@mock.patch("importer.tasks.images.download_asset")
def test_http_error_retries(self, mock_download):
mock_download.side_effect = requests.exceptions.HTTPError("HTTP Error Occurred")
with self.assertRaises(requests.exceptions.HTTPError):
tasks.images.download_asset_image_task(
self.asset.pk, self.batch_id, create_job=True
)
================================================
FILE: importer/tests/test_tasks_collections.py
================================================
import sys
from unittest import mock
import requests
from django.core.cache.backends.base import BaseCache
from django.test import TestCase, override_settings
from concordia.tests.utils import CreateTestUsers
from importer import tasks
from importer.tasks.collections import (
import_collection_task,
normalize_collection_url,
)
from importer.tests.utils import create_import_job
class MockResponse:
def __init__(self, original_format="item"):
self.original_format = original_format
def json(self):
url = "https://www.loc.gov/item/%s/" % "mss859430021"
return {
"results": [
{
"id": 1,
"image_url": "https://www.loc.gov/resource/mss85943.000212/",
"original_format": {self.original_format},
"url": url,
},
],
"pagination": {},
}
class MockCache(BaseCache):
def __init__(self, host, *args, **kwargs):
params = {}
super().__init__(params, **kwargs)
def get(self, key, default=None, version=None):
resp = MockResponse()
return resp
# Ensure dotted path used in override_settings still resolves after splitting.
# The original tests referenced "importer.tests.test_tasks.MockCache".
# Point that module name at this module so the cache backend can import it.
sys.modules.setdefault("importer.tests.test_tasks", sys.modules[__name__])
class GetCollectionItemsTests(TestCase):
@mock.patch.object(requests.Session, "get")
@override_settings(
CACHES={
"default": {
"BACKEND": "django.core.cache.backends.dummy.DummyCache",
}
}
)
def test_cache_miss(self, mock_get):
mock_get.return_value = MockResponse()
mock_get.return_value.url = "https://www.loc.gov/collections/example/"
items = tasks.collections.get_collection_items(
"https://www.loc.gov/collections/example/"
)
self.assertEqual(len(items), 1)
@override_settings(
CACHES={
"default": {
"BACKEND": "importer.tests.test_tasks.MockCache",
}
}
)
def test_cache_hit(self):
items = tasks.collections.get_collection_items(
"https://www.loc.gov/collections/example/"
)
self.assertEqual(len(items), 1)
@mock.patch.object(requests.Session, "get")
@override_settings(
CACHES={
"default": {
"BACKEND": "django.core.cache.backends.dummy.DummyCache",
}
}
)
def test_ignored_format(self, mock_get):
mock_get.return_value = MockResponse(original_format="collection")
mock_get.return_value.url = "https://www.loc.gov/collections/example/"
with self.assertLogs("importer.tasks", level="INFO") as log:
items = tasks.collections.get_collection_items(
"https://www.loc.gov/collections/example/"
)
self.assertEqual(
log.output[0],
"INFO:importer.tasks.items:"
"Skipping result 1 because it contains an "
"unsupported format: {'collection'}",
)
self.assertEqual(len(items), 0)
def test_multiple_items(self):
with (
mock.patch("importer.tasks.collections.cache") as cache_mock,
mock.patch(
"importer.tasks.collections.requests_retry_session"
) as requests_mock,
mock.patch(
"importer.tasks.collections.get_item_info_from_result"
) as result_mock,
):
cache_mock.get.return_value = None
requests_mock.return_value.get.return_value.json.return_value = {
"results": [1, 2, 3]
}
# Each time this mock is called, the next value in the list
# is returned
result_mock.side_effect = [4, 5, None]
items = tasks.collections.get_collection_items("http://example.com")
self.assertEqual(items, [4, 5])
self.assertEqual(result_mock.call_count, 3)
def test_no_results(self):
with (
mock.patch("importer.tasks.collections.cache") as cache_mock,
mock.patch(
"importer.tasks.collections.requests_retry_session"
) as requests_mock,
self.assertLogs("importer.tasks", level="ERROR") as log,
):
cache_mock.get.return_value = None
requests_mock.return_value.get.return_value.json.return_value = {}
items = tasks.collections.get_collection_items("http://example.com")
self.assertEqual(items, [])
self.assertEqual(
log.output,
[
"ERROR:importer.tasks.collections:"
'Expected URL http://example.com to include "results"'
],
)
def test_get_info_exception(self):
with (
mock.patch("importer.tasks.collections.cache") as cache_mock,
mock.patch(
"importer.tasks.collections.requests_retry_session"
) as requests_mock,
mock.patch("importer.tasks.items.get_item_info_from_result") as result_mock,
self.assertLogs("importer.tasks", level="WARNING") as log,
):
cache_mock.get.return_value = None
requests_mock.return_value.get.return_value.json.return_value = {
"results": [1]
}
result_mock.side_effect = AttributeError
items = tasks.collections.get_collection_items("http://example.com")
self.assertEqual(items, [])
# The first log entry contains a stack trace, so we use assertIn
# rather than assertEqual here
self.assertIn(
"WARNING:importer.tasks.collections:"
"Skipping result from http://example.com which did not match "
"expected format:",
log.output[0],
)
self.assertEqual(
log.output[1],
"WARNING:importer.tasks.collections:"
"No valid items found for collection url: http://example.com",
)
class ImportCollectionTests(CreateTestUsers, TestCase):
def setUp(self):
self.login_user()
@mock.patch("importer.tasks.collections.get_collection_items")
@mock.patch("importer.tasks.collections.normalize_collection_url")
def test_import_collection(self, mock_get, mock_normalize):
magic_mock = mock.MagicMock()
magic_mock.request = mock.MagicMock()
magic_mock.request.id = 1
import_job = create_import_job(created_by=self.user)
mock_get.return_value = ((None, None),)
import_collection_task(import_job.pk)
self.assertTrue(mock_get.called)
@mock.patch("importer.tasks.collections.create_item_import_task.delay")
@mock.patch("importer.tasks.collections.get_collection_items")
@mock.patch("importer.tasks.collections.normalize_collection_url")
def test_import_collection_enqueues_item_tasks(
self, mock_normalize, mock_get, mock_delay
):
import_job = create_import_job(created_by=self.user)
mock_normalize.return_value = "https://www.loc.gov/collections/example/?fo=json"
mock_get.return_value = [
("mss1", "https://www.loc.gov/item/mss1/"),
("mss2", "https://www.loc.gov/item/mss2/"),
]
# redownload=True so we can assert the third arg is propagated
import_collection_task(import_job.pk, redownload=True)
self.assertEqual(mock_delay.call_count, 2)
self.assertEqual(
mock_delay.call_args_list,
[
mock.call(import_job.pk, "https://www.loc.gov/item/mss1/", True),
mock.call(import_job.pk, "https://www.loc.gov/item/mss2/", True),
],
)
class CollectionURLNormalizationTests(TestCase):
def test_basic_normalization(self):
self.assertEqual(
normalize_collection_url(
"https://www.loc.gov/collections/branch-rickey-papers/"
),
"https://www.loc.gov/collections/branch-rickey-papers/?fo=json",
)
def test_extra_querystring_parameters(self):
self.assertEqual(
normalize_collection_url(
"https://www.loc.gov/collections/branch-rickey-papers/?foo=bar"
),
"https://www.loc.gov/collections/branch-rickey-papers/?fo=json&foo=bar",
)
def test_conflicting_querystring_parameters(self):
self.assertEqual(
normalize_collection_url(
"https://www.loc.gov/collections/branch-rickey-papers/?foo=bar&fo=xml&sp=99&at=item" # NOQA
),
"https://www.loc.gov/collections/branch-rickey-papers/?fo=json&foo=bar",
)
================================================
FILE: importer/tests/test_tasks_core.py
================================================
import concurrent.futures
from unittest import mock
from django.test import TestCase
from importer.tasks import fetch_all_urls
class FetchAllUrlsTests(TestCase):
@mock.patch.object(concurrent.futures.ThreadPoolExecutor, "map")
def test_fetch_all_urls(self, mock_map):
output = "https://www.loc.gov/item/mss859430021/ - Asset Count: 0"
mock_map.return_value = ((output, 0),)
finals, totals = fetch_all_urls(
[
"https://www.loc.gov/item/mss859430021/",
]
)
self.assertEqual(finals, [output])
self.assertEqual(totals, 0)
================================================
FILE: importer/tests/test_tasks_decorators.py
================================================
from unittest import mock
from django.test import TestCase
from django.utils import timezone
from importer.exceptions import ImageImportFailure
from importer.models import ImportJob, TaskStatusModel
from importer.tasks.decorators import update_task_status
from importer.tests.utils import create_import_job
class TaskDecoratorTests(TestCase):
def test_update_task_status(self):
def test_function(self, task_status_object, raise_exception=False):
task_status_object.test_function_ran = True
if raise_exception:
raise Exception("Test Exception")
task_status_object.test_function_finished = True
wrapped_test_function = update_task_status(test_function)
# We create this non-mocked completed job here to use in a later test
# because we can't easily do this once we mock ImportJob.save
test_job = create_import_job(completed=timezone.now())
# We can't just mock the entire model here or use easily use a custom
# class because update_task_status depends on Django model internals,
# particularly __class__._default_manager. __class__ cannot be overriden
# (it points to MagicMock), Model._default_manager cannot be set directly
# and mocking Model.objects does not cause called on Model._default_manager
# to properly use the mock--it continues to use the actual Model.objects
with mock.patch.multiple(
ImportJob,
save=mock.MagicMock(),
__str__=mock.MagicMock(return_value="Mock Job"),
):
job = ImportJob()
wrapped_test_function(mock.MagicMock(), job)
self.assertTrue(hasattr(job, "test_function_ran"))
self.assertTrue(job.test_function_ran)
self.assertTrue(hasattr(job, "test_function_finished"))
self.assertTrue(job.test_function_finished)
self.assertNotEqual(job.last_started, None)
self.assertNotEqual(job.task_id, None)
self.assertTrue(job.completed)
self.assertTrue(job.save.called)
ImportJob.save.reset_mock()
job2 = ImportJob()
job2.status = "Original Status"
with self.assertRaisesRegex(Exception, "Test Exception"):
wrapped_test_function(mock.MagicMock(), job2, True)
self.assertTrue(hasattr(job2, "test_function_ran"))
self.assertTrue(job2.test_function_ran)
self.assertFalse(hasattr(job2, "test_function_finished"))
self.assertNotEqual(job2.last_started, None)
self.assertNotEqual(job2.task_id, None)
self.assertFalse(job2.completed)
self.assertTrue(job2.save.called)
self.assertEqual(
job2.status, "Original Status\n\nUnhandled exception: Test Exception"
)
ImportJob.save.reset_mock()
job3 = ImportJob()
job3.id = test_job.id
with self.assertLogs("importer.tasks", level="WARNING") as log:
wrapped_test_function(mock.MagicMock(), job3)
self.assertEqual(
log.output,
[
"WARNING:importer.tasks.decorators:Task Mock Job was "
"already completed and will not be repeated"
],
)
self.assertFalse(hasattr(job3, "test_function_ran"))
self.assertFalse(hasattr(job3, "test_function_finished"))
self.assertEqual(job3.last_started, None)
self.assertEqual(job3.task_id, None)
self.assertFalse(job3.completed)
self.assertFalse(job3.save.called)
@mock.patch.multiple(
ImportJob,
save=mock.MagicMock(),
__str__=mock.MagicMock(return_value="Mock Job"),
retry_if_possible=mock.MagicMock(),
)
def test_update_task_status_retry_path_sets_last_started_and_task_id(self):
def test_function(self, task_status_object):
raise Exception("boom")
wrapped = update_task_status(test_function)
job = ImportJob()
# Simulate Celery task self with a request.id
task_self = mock.MagicMock()
task_self.request.id = "orig-task-id"
# Make retry_if_possible return an object with an id, like an AsyncResult
retry_result = mock.MagicMock()
retry_result.id = "retry-123"
ImportJob.retry_if_possible.return_value = retry_result
with self.assertRaisesRegex(Exception, "boom"):
wrapped(task_self, job)
# After a retriable exception, the decorator should set these from retry_result
self.assertEqual(job.task_id, "retry-123")
self.assertIsNotNone(job.last_started)
# Saves: one before calling f(), one after exception handling, one after retry
self.assertGreaterEqual(ImportJob.save.call_count, 3)
ImportJob.retry_if_possible.assert_called_once_with()
@mock.patch.multiple(
ImportJob,
save=mock.MagicMock(),
__str__=mock.MagicMock(return_value="Mock Job"),
retry_if_possible=mock.MagicMock(return_value=False),
)
def test_update_task_status_sets_image_failure_reason(self):
def test_function(self, task_status_object):
# Raising ImageImportFailure should set failure_reason to IMAGE.
raise ImageImportFailure("bad image")
wrapped = update_task_status(test_function)
job = ImportJob()
task_self = mock.MagicMock()
task_self.request.id = "task-123"
with self.assertRaises(ImageImportFailure):
wrapped(task_self, job)
self.assertEqual(job.failure_reason, TaskStatusModel.FailureReason.IMAGE)
self.assertIsNotNone(job.failed)
# save() should have been called at least twice (pre & post exception path)
self.assertGreaterEqual(ImportJob.save.call_count, 2)
================================================
FILE: importer/tests/test_tasks_images.py
================================================
import uuid
from unittest import mock
import requests
from django.test import TestCase
from django.utils import timezone
from PIL import Image
from concordia.models import Asset
from concordia.tests.utils import (
create_asset,
)
from importer import tasks
from importer.models import (
DownloadAssetImageJob,
VerifyAssetImageJob,
)
from importer.tasks.images import redownload_image_task
from .utils import (
create_download_asset_image_job,
create_verify_asset_image_job,
)
class RedownloadImageTaskTests(TestCase):
@mock.patch("importer.tasks.images.download_asset")
def test_redownload_image_task(self, mock_download):
redownload_image_task(create_asset().pk)
self.assertTrue(mock_download.called)
class BatchVerifyAssetImagesTaskCallbackTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.concurrency = 5
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_no_failures_detected_no_failures_in_results(self, mock_task):
results = [True, True, True]
tasks.images.batch_verify_asset_images_task_callback(
results, self.batch_id, self.concurrency, False
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency, False)
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_no_failures_detected_some_failures_in_results(self, mock_task):
results = [True, False, True]
with self.assertLogs("importer.tasks", level="INFO") as log:
tasks.images.batch_verify_asset_images_task_callback(
results, self.batch_id, self.concurrency, False
)
self.assertIn(
"INFO:importer.tasks.images:At least one verification "
f"failure detected for batch {self.batch_id}",
log.output,
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency, True)
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_failures_already_detected(self, mock_task):
results = [True, False, True]
tasks.images.batch_verify_asset_images_task_callback(
results, self.batch_id, self.concurrency, True
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency, True)
class BatchVerifyAssetImagesTaskTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.concurrency = 2
asset1 = create_asset()
asset2 = create_asset(item=asset1.item, slug="test-asset-2")
self.job1 = create_verify_asset_image_job(batch=self.batch_id, asset=asset1)
self.job2 = create_verify_asset_image_job(batch=self.batch_id, asset=asset2)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.batch_download_asset_images_task")
def test_no_jobs_remaining_with_failures(self, mock_batch_download, mock_logger):
VerifyAssetImageJob.objects.all().delete()
tasks.images.batch_verify_asset_images_task(
self.batch_id, self.concurrency, True
)
mock_logger.assert_any_call(
"Failures in VerifyAssetImageJobs in batch %s detected, so starting "
"DownloadAssetImageJob batch",
self.batch_id,
)
mock_batch_download.assert_called_once_with(self.batch_id, self.concurrency)
@mock.patch("importer.tasks.images.logger.info")
def test_no_jobs_remaining_no_failures(self, mock_logger):
VerifyAssetImageJob.objects.all().delete()
tasks.images.batch_verify_asset_images_task(
self.batch_id, self.concurrency, False
)
mock_logger.assert_any_call(
"No failures in VerifyAssetImageJob batch %s. Ending task.", self.batch_id
)
@mock.patch("importer.tasks.images.chord")
@mock.patch("importer.tasks.images.verify_asset_image_task.s")
def test_jobs_remaining(self, mock_task_s, mock_chord):
tasks.images.batch_verify_asset_images_task(
self.batch_id, self.concurrency, False
)
self.assertEqual(mock_task_s.call_count, 2)
mock_chord.assert_called()
class VerifyAssetImageTaskTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.batch_id = uuid.uuid4()
@mock.patch("importer.tasks.images.logger.exception")
def test_asset_not_found(self, mock_logger):
with self.assertRaises(Asset.DoesNotExist):
tasks.images.verify_asset_image_task(999)
mock_logger.assert_called()
@mock.patch("importer.tasks.images.logger.exception")
def test_verify_job_not_found(self, mock_logger):
with self.assertRaises(VerifyAssetImageJob.DoesNotExist):
tasks.images.verify_asset_image_task(
self.asset.pk, self.batch_id, create_job=False
)
mock_logger.assert_called()
@mock.patch("importer.tasks.images.verify_asset_image")
def test_verify_asset_image_task_success(self, mock_verify):
job = create_verify_asset_image_job(asset=self.asset, batch=self.batch_id)
mock_verify.return_value = True
result = tasks.images.verify_asset_image_task(self.asset.pk, self.batch_id)
self.assertTrue(result)
job.refresh_from_db()
self.assertEqual(job.status, "Storage image verified")
@mock.patch("importer.tasks.images.verify_asset_image")
def test_verify_asset_image_task_failure(self, mock_verify):
job = create_verify_asset_image_job(asset=self.asset, batch=self.batch_id)
mock_verify.return_value = False
result = tasks.images.verify_asset_image_task(self.asset.pk, self.batch_id)
self.assertFalse(result)
job.refresh_from_db()
self.assertNotEqual(job.status, "Storage image verified")
@mock.patch("importer.tasks.images.verify_asset_image")
def test_create_verify_asset_image_job(self, mock_verify):
mock_verify.return_value = True
result = tasks.images.verify_asset_image_task(
self.asset.pk, self.batch_id, create_job=True
)
self.assertTrue(result)
self.assertTrue(
VerifyAssetImageJob.objects.filter(
asset=self.asset, batch=self.batch_id
).exists()
)
@mock.patch("importer.tasks.images.verify_asset_image")
def test_http_error_retries(self, mock_verify):
create_verify_asset_image_job(asset=self.asset, batch=self.batch_id)
mock_verify.side_effect = requests.exceptions.HTTPError("HTTP Error Occurred")
with self.assertRaises(requests.exceptions.HTTPError):
tasks.images.verify_asset_image_task(self.asset.pk, self.batch_id)
class CreateDownloadAssetImageJobTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.batch_id = uuid.uuid4()
def test_create_new_job(self):
tasks.images.create_download_asset_image_job(self.asset, self.batch_id)
self.assertTrue(
DownloadAssetImageJob.objects.filter(
asset=self.asset, batch=self.batch_id
).exists()
)
def test_existing_uncompleted_job_not_duplicated(self):
create_download_asset_image_job(asset=self.asset, batch=self.batch_id)
tasks.images.create_download_asset_image_job(self.asset, self.batch_id)
job_count = DownloadAssetImageJob.objects.filter(
asset=self.asset, batch=self.batch_id
).count()
self.assertEqual(job_count, 1)
def test_create_new_job_if_previous_failed(self):
failed_job = create_download_asset_image_job(
asset=self.asset, batch=self.batch_id
)
failed_job.failed = timezone.now()
failed_job.save()
new_batch = uuid.uuid4()
tasks.images.create_download_asset_image_job(self.asset, new_batch)
job_count = DownloadAssetImageJob.objects.filter(asset=self.asset).count()
self.assertEqual(job_count, 2)
class VerifyAssetImageTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.job = create_verify_asset_image_job(asset=self.asset)
self.mock_task = mock.MagicMock()
self.mock_task.request.id = uuid.uuid4()
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.create_download_asset_image_job")
def test_no_storage_image(self, mock_create_job, mock_logger):
# Use update to avoid validation of storage_image with invalid value
Asset.objects.filter(id=self.asset.id).update(storage_image="")
self.job.refresh_from_db()
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertFalse(result)
mock_create_job.assert_called_once_with(self.asset, self.job.batch)
mock_logger.assert_any_call(
f"No storage image set on {self.asset} ({self.asset.id})"
)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.create_download_asset_image_job")
@mock.patch("importer.tasks.images.ASSET_STORAGE.exists", return_value=False)
def test_storage_image_missing(self, mock_exists, mock_create_job, mock_logger):
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertFalse(result)
mock_create_job.assert_called_once_with(self.asset, self.job.batch)
mock_logger.assert_any_call(
f"Storage image for {self.asset} ({self.asset.id}) missing from storage"
)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.create_download_asset_image_job")
@mock.patch("importer.tasks.images.ASSET_STORAGE.exists", return_value=True)
@mock.patch("importer.tasks.images.ASSET_STORAGE.open")
@mock.patch(
"importer.tasks.images.Image.open",
side_effect=Image.UnidentifiedImageError("Invalid image format"),
)
def test_storage_image_invalid(
self, mock_image_open, mock_open, mock_exists, mock_create_job, mock_logger
):
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertFalse(result)
mock_create_job.assert_called_once_with(self.asset, self.job.batch)
mock_logger.assert_any_call(
f"Storage image for {self.asset} ({self.asset.id}), "
f"{self.asset.storage_image.name}, is corrupt. The exception "
"raised was Type: UnidentifiedImageError, Message: Invalid image format"
)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.create_download_asset_image_job")
@mock.patch("importer.tasks.images.ASSET_STORAGE.exists", return_value=True)
@mock.patch("importer.tasks.images.ASSET_STORAGE.open")
@mock.patch("importer.tasks.images.Image.open")
def test_storage_image_verify_fail(
self, mock_image_open, mock_open, mock_exists, mock_create_job, mock_logger
):
mock_image = mock.MagicMock()
mock_image.verify.side_effect = Image.UnidentifiedImageError(
"Invalid image format"
)
mock_image_open.return_value.__enter__.return_value = mock_image
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertFalse(result)
mock_create_job.assert_called_once_with(self.asset, self.job.batch)
mock_logger.assert_any_call(
f"Storage image for {self.asset} ({self.asset.id}), "
f"{self.asset.storage_image.name}, is corrupt. The exception "
"raised was Type: UnidentifiedImageError, Message: Invalid image format"
)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.ASSET_STORAGE.exists", return_value=True)
@mock.patch("importer.tasks.images.ASSET_STORAGE.open")
@mock.patch("importer.tasks.images.Image.open")
def test_storage_image_verification_success(
self, mock_image_open, mock_open, mock_exists, mock_logger
):
mock_image = mock.MagicMock()
mock_image.verify.return_value = None
mock_image_open.return_value.__enter__.return_value = mock_image
result = tasks.images.verify_asset_image(self.mock_task, self.job)
self.assertTrue(result)
mock_logger.assert_any_call(
"Storage image for %s (%s), %s, verified successfully",
self.asset,
self.asset.id,
self.asset.storage_image.name,
)
class BatchDownloadAssetImagesTaskCallbackTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.concurrency = 5
@mock.patch("importer.tasks.images.batch_download_asset_images_task.delay")
def test_callback_triggers_next_batch(self, mock_task):
results = [True, False, True]
tasks.images.batch_download_asset_images_task_callback(
results, self.batch_id, self.concurrency
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency)
@mock.patch("importer.tasks.images.batch_download_asset_images_task.delay")
def test_callback_with_no_results(self, mock_task):
results = []
tasks.images.batch_download_asset_images_task_callback(
results, self.batch_id, self.concurrency
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency)
@mock.patch("importer.tasks.images.batch_download_asset_images_task.delay")
def test_callback_with_all_successful_results(self, mock_task):
results = [True, True, True]
tasks.images.batch_download_asset_images_task_callback(
results, self.batch_id, self.concurrency
)
mock_task.assert_called_once_with(self.batch_id, self.concurrency)
class BatchDownloadAssetImagesTaskTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.concurrency = 3
asset1 = create_asset()
asset2 = create_asset(item=asset1.item, slug="test-asset-2")
asset3 = create_asset(item=asset1.item, slug="test-asset-3")
self.job1 = create_download_asset_image_job(batch=self.batch_id, asset=asset1)
self.job2 = create_download_asset_image_job(batch=self.batch_id, asset=asset2)
self.job3 = create_download_asset_image_job(batch=self.batch_id, asset=asset3)
@mock.patch("importer.tasks.images.logger.info")
@mock.patch("importer.tasks.images.chord")
@mock.patch("importer.tasks.images.download_asset_image_task.s")
def test_jobs_remaining(self, mock_task_s, mock_chord, mock_logger):
tasks.images.batch_download_asset_images_task(self.batch_id, self.concurrency)
self.assertEqual(mock_task_s.call_count, 3)
mock_chord.assert_called()
mock_logger.assert_any_call(
"Processing next %s DownloadAssetImageJobs for batch %s",
self.concurrency,
self.batch_id,
)
@mock.patch("importer.tasks.images.logger.info")
def test_no_jobs_remaining(self, mock_logger):
DownloadAssetImageJob.objects.all().delete()
tasks.images.batch_download_asset_images_task(self.batch_id, self.concurrency)
mock_logger.assert_any_call(
"No DownloadAssetImageJobs found for batch %s", self.batch_id
)
class DownloadAssetImageTaskTests(TestCase):
def setUp(self):
self.asset = create_asset()
self.batch_id = uuid.uuid4()
@mock.patch("importer.tasks.images.logger.exception")
def test_asset_not_found(self, mock_logger):
with self.assertRaises(Asset.DoesNotExist):
tasks.images.download_asset_image_task(999)
mock_logger.assert_called()
@mock.patch("importer.tasks.images.logger.exception")
def test_download_job_not_found(self, mock_logger):
with self.assertRaises(DownloadAssetImageJob.DoesNotExist):
tasks.images.download_asset_image_task(
self.asset.pk, self.batch_id, create_job=False
)
mock_logger.assert_called()
@mock.patch("importer.tasks.images.download_asset")
def test_download_asset_image_task_success(self, mock_download):
create_download_asset_image_job(asset=self.asset, batch=self.batch_id)
mock_download.return_value = "Download successful"
result = tasks.images.download_asset_image_task(self.asset.pk, self.batch_id)
self.assertEqual(result, "Download successful")
@mock.patch("importer.tasks.images.download_asset")
def test_create_download_asset_image_job(self, mock_download):
mock_download.return_value = "Download successful"
result = tasks.images.download_asset_image_task(
self.asset.pk, self.batch_id, create_job=True
)
self.assertEqual(result, "Download successful")
self.assertTrue(
DownloadAssetImageJob.objects.filter(
asset=self.asset, batch=self.batch_id
).exists()
)
@mock.patch("importer.tasks.images.download_asset")
def test_http_error_retries(self, mock_download):
mock_download.side_effect = requests.exceptions.HTTPError("HTTP Error Occurred")
with self.assertRaises(requests.exceptions.HTTPError):
tasks.images.download_asset_image_task(
self.asset.pk, self.batch_id, create_job=True
)
================================================
FILE: importer/tests/test_tasks_items.py
================================================
import io
import shutil
import tempfile
from unittest import mock
import requests
from django.core.exceptions import ValidationError
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.test import TestCase, override_settings
from PIL import Image
from concordia.models import Item
from concordia.tests.utils import (
CreateTestUsers,
create_asset,
create_item,
create_project,
)
from importer import tasks
from importer.models import ImportItem
from importer.tasks.items import (
_guess_extension,
download_and_set_item_thumbnail,
get_item_id_from_item_url,
get_item_info_from_result,
import_items_into_project_from_url,
)
from importer.tests.utils import (
create_import_item,
create_import_job,
)
class ImportItemCountFromUrlTests(TestCase):
def mocked_requests_get(*args, **kwargs):
class MockResponse:
def json(self):
item_data = {
"resources": [
{"files": []},
]
}
return item_data
def raise_for_status(self):
pass
return MockResponse()
@mock.patch("requests.get", side_effect=mocked_requests_get)
@override_settings(
CACHES={
"default": {
"BACKEND": "django.core.cache.backends.dummy.DummyCache",
}
}
)
def test_import_item_count_from_url(self, mock_get):
self.assertEqual(
tasks.items.import_item_count_from_url(None),
("None - Asset Count: 0", 0),
)
def test_unhandled_exception_importing(self):
with mock.patch("importer.tasks.items.requests.get") as get_mock:
get_mock.side_effect = AttributeError("Error message")
self.assertEqual(
tasks.items.import_item_count_from_url("http://example.com"),
(
"Unhandled exception importing http://example.com " "Error message",
0,
),
)
@override_settings(
STORAGES={
"default": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
"assets": {"BACKEND": "django.core.files.storage.InMemoryStorage"},
},
AWS_STORAGE_BUCKET_NAME="test-bucket",
)
class ImportItemsIntoProjectFromUrlTests(CreateTestUsers, TestCase):
def setUp(self):
self.login_user()
self.project = create_project()
@mock.patch("importer.tasks.items.create_item_import_task.delay")
def test_no_match(self, mock_task):
with self.assertRaises(ValueError):
import_items_into_project_from_url(
None, None, "https://www.loc.gov/resource/mss859430021/"
)
self.assertFalse(mock_task.called)
@mock.patch("importer.tasks.items.create_item_import_task.delay")
def test_item(self, mock_task):
import_job = import_items_into_project_from_url(
self.user, self.project, "https://www.loc.gov/item/mss859430021/"
)
self.assertEqual(import_job.project, self.project)
self.assertTrue(mock_task.called)
@mock.patch("importer.tasks.collections.import_collection_task.delay")
def test_other_url_type(self, mock_task):
import_job = import_items_into_project_from_url(
self.user,
self.project,
"https://www.loc.gov/collections/branch-rickey-papers/",
)
self.assertEqual(import_job.project, self.project)
self.assertTrue(mock_task.called)
mock_task.assert_called_with(import_job.pk, False)
class GetItemIdFromItemURLTests(TestCase):
def test_get_item_id_from_item_url_with_slash(self):
"""
Testing get item id from item url if ends with /
"""
url = "https://www.loc.gov/item/mss859430021/"
resp = get_item_id_from_item_url(url)
self.assertEqual(resp, "mss859430021")
def test_get_item_id_from_item_url_without_slash(self):
"""
Testing get item id from item url if ends without /
"""
url = "https://www.loc.gov/item/mss859430021"
resp = get_item_id_from_item_url(url)
self.assertEqual(resp, "mss859430021")
class GetItemInfoFromResultTests(TestCase):
def test_no_image_url(self):
item_info = get_item_info_from_result(
{
"id": 1,
"image_url": False,
"original_format": {"item"},
}
)
self.assertEqual(item_info, None)
def test_no_match(self):
item_info = get_item_info_from_result(
{
"id": 1,
"image_url": "https://www.loc.gov/resource/mss85943.000212/",
"original_format": {"item"},
"url": "https://www.loc.com/item/mss859430021/",
},
)
self.assertEqual(item_info, None)
def test_match(self):
url = "https://www.loc.gov/item/%s/" % "mss859430021"
item_info = get_item_info_from_result(
{
"id": 1,
"image_url": "https://www.loc.gov/resource/mss85943.000212/",
"original_format": {"item"},
"url": url,
},
)
self.assertEqual(item_info[0], "mss859430021")
self.assertEqual(item_info[1], url)
def test_ignored_format(self):
result = {
"id": 42,
"image_url": "https://www.loc.gov/resource/foo/",
"original_format": {"collection"},
"url": "https://www.loc.gov/item/abc123/",
}
with self.assertLogs("importer.tasks", level="INFO") as log:
out = get_item_info_from_result(result)
self.assertIsNone(out)
self.assertEqual(
log.output[0],
"INFO:importer.tasks.items:Skipping result 42 because it contains an "
"unsupported format: {'collection'}",
)
@mock.patch("importer.tasks.items.requests.get")
class CreateItemImportTaskTests(TestCase):
def setUp(self):
self.job = create_import_job()
self.item_url = "http://example.com"
self.response_mock = mock.MagicMock()
self.item_id = "testid1"
self.item_title = "Test Title"
self.image_url = []
self.item_data = {
"item": {
"id": self.item_id,
"title": self.item_title,
"image_url": self.image_url,
}
}
def test_create_item_import_task_http_error(self, get_mock):
get_mock.return_value = self.response_mock
self.response_mock.raise_for_status.side_effect = requests.exceptions.HTTPError
with self.assertRaises(requests.exceptions.HTTPError):
tasks.items.create_item_import_task(self.job.pk, self.item_url)
def test_create_item_import_task_new_item(self, get_mock):
get_mock.return_value = self.response_mock
self.response_mock.json.return_value = self.item_data
with (
mock.patch("importer.tasks.items.import_item_task.delay") as task_mock,
mock.patch("importer.tasks.items.download_and_set_item_thumbnail"),
):
tasks.items.create_item_import_task(self.job.pk, self.item_url)
self.assertTrue(task_mock.called)
self.assertEqual(Item.objects.count(), 1)
self.assertTrue(Item.objects.filter(item_id=self.item_id).exists())
def test_create_item_import_task_existing_item_missing_assets(self, get_mock):
item = create_item(item_id="testid1", project=self.job.project)
get_mock.return_value = self.response_mock
self.response_mock.json.return_value = self.item_data
with (
self.assertLogs("importer.tasks", level="WARNING") as log,
mock.patch(
"importer.tasks.items.get_asset_urls_from_item_resources"
) as asset_url_mock,
mock.patch("importer.tasks.items.import_item_task.delay") as task_mock,
mock.patch("importer.tasks.items.download_and_set_item_thumbnail"),
):
asset_url_mock.return_value = [
["http://example.com/test.jpg"],
self.item_url,
]
tasks.items.create_item_import_task(self.job.pk, self.item_url)
self.assertEqual(
log.output,
[
f"WARNING:importer.tasks.items:"
f"Reprocessing existing item {item} that is missing assets"
],
)
self.assertEqual(Item.objects.count(), 1)
self.assertTrue(task_mock.called)
def test_create_item_import_task_existing_item_no_missing_assets(self, get_mock):
item = create_item(item_id="testid1", project=self.job.project)
# Ensure at least one asset exists for the item
create_asset(item=item)
get_mock.return_value = self.response_mock
self.response_mock.json.return_value = self.item_data
with (
self.assertLogs("importer.tasks", level="WARNING") as log,
mock.patch(
"importer.tasks.items.get_asset_urls_from_item_resources"
) as asset_url_mock,
mock.patch("importer.tasks.items.import_item_task.delay") as task_mock,
mock.patch("importer.tasks.items.download_and_set_item_thumbnail"),
):
asset_url_mock.return_value = [
["http://example.com/test.jpg"],
self.item_url,
]
tasks.items.create_item_import_task(self.job.pk, self.item_url)
self.assertEqual(
log.output,
[
f"WARNING:importer.tasks.items:"
f"Not reprocessing existing item with all assets: {item}"
],
)
self.assertEqual(
ImportItem.objects.get(item=item).status,
f"Not reprocessing existing item with all assets: {item}",
)
self.assertFalse(task_mock.called)
def test_create_item_import_task_existing_item_redownload(self, get_mock):
item = create_item(item_id="testid1", project=self.job.project)
create_asset(item=item)
get_mock.return_value = self.response_mock
self.response_mock.json.return_value = {
"item": {"id": "testid1", "title": "Test Title", "image_url": []}
}
with (
mock.patch(
"importer.tasks.items.get_asset_urls_from_item_resources"
) as asset_url_mock,
mock.patch("importer.tasks.items.import_item_task.delay") as task_mock,
mock.patch("importer.tasks.items.download_and_set_item_thumbnail"),
):
asset_url_mock.return_value = [
["http://example.com/test.jpg"],
self.item_url,
]
tasks.items.create_item_import_task(
self.job.pk, self.item_url, redownload=True
)
self.assertTrue(task_mock.called)
def test_create_item_import_task_full_clean_exception_updates_status_and_reraises(
self, get_mock
):
get_mock.return_value = self.response_mock
self.response_mock.json.return_value = self.item_data
with (
self.assertLogs("importer.tasks", level="ERROR") as log,
mock.patch("importer.tasks.items.Item.full_clean") as full_clean_mock,
mock.patch("importer.tasks.items.import_item_task.delay") as task_mock,
mock.patch(
"importer.tasks.items.download_and_set_item_thumbnail"
) as thumb_mock,
):
full_clean_mock.side_effect = RuntimeError("boom")
with self.assertRaises(RuntimeError):
tasks.items.create_item_import_task(self.job.pk, self.item_url)
self.assertTrue(
any("Unhandled exception when importing item" in m for m in log.output)
)
thumb_mock.assert_not_called()
task_mock.assert_not_called()
item = Item.objects.get(item_id=self.item_id)
import_item = ImportItem.objects.get(item=item)
self.assertIsNotNone(import_item.failed)
self.assertIn("Unhandled exception: boom", import_item.status)
def test_create_item_import_task_save_exception_updates_status_and_reraises(
self, get_mock
):
get_mock.return_value = self.response_mock
self.response_mock.json.return_value = self.item_data
# Grab the real save before patching so we can wrap it.
from importer.tasks.items import Item as _Item
real_save = _Item.save
call_count = {"n": 0}
def save_side_effect(self, *args, **kwargs):
call_count["n"] += 1
# First call is from Item.objects.get_or_create(...) -> allow it to
# persist.
if call_count["n"] == 1:
return real_save(self, *args, **kwargs)
# Second call is the one under test -> raise.
raise RuntimeError("save failed")
with (
self.assertLogs("importer.tasks", level="ERROR") as log,
mock.patch("importer.tasks.items.Item.full_clean") as full_clean_mock,
mock.patch(
"importer.tasks.items.Item.save",
side_effect=save_side_effect,
autospec=True,
),
mock.patch("importer.tasks.items.import_item_task.delay") as task_mock,
mock.patch(
"importer.tasks.items.download_and_set_item_thumbnail"
) as thumb_mock,
):
# Ensure full_clean does not fail so we reach save().
full_clean_mock.return_value = None
with self.assertRaises(RuntimeError):
tasks.items.create_item_import_task(self.job.pk, self.item_url)
self.assertTrue(
any("Unhandled exception when importing item" in m for m in log.output)
)
thumb_mock.assert_not_called()
task_mock.assert_not_called()
item = Item.objects.get(item_id=self.item_id)
import_item = ImportItem.objects.get(item=item)
self.assertIsNotNone(import_item.failed)
self.assertIn("Unhandled exception: save failed", import_item.status)
class ItemImportTests(TestCase):
def setUp(self):
self.item_url = "http://example.com"
self.job = create_import_job()
self.import_item = create_import_item(import_job=self.job, url=self.item_url)
def test_import_item_task(self):
with mock.patch("importer.tasks.items.import_item") as task_mock:
tasks.items.import_item_task(self.import_item.pk)
self.assertTrue(task_mock.called)
task, called_import_item = task_mock.call_args.args
self.assertTrue(called_import_item, self.import_item)
def test_import_item(self):
with (
mock.patch(
"importer.tasks.items.get_asset_urls_from_item_resources"
) as asset_url_mock,
mock.patch("importer.tasks.assets.download_asset_task.s") as download_mock,
mock.patch("importer.tasks.items.group") as group_mock,
):
# It's difficult/impossible to cleanly mock a decorator due to the way
# they're applied when the decorated object/function is evaluated on
# import, so we unfortunately have to handle the update_task_status
# decorator, so we need a mock object that can pass for a Celery task
# object so update_task_status doesn't error during the test
task_mock = mock.MagicMock()
task_mock.request.id = "f81d4fae-7dec-11d0-a765-00a0c91e6bf6"
asset_url_mock.return_value = [
["http://example.com/test.jpg"],
self.item_url,
]
tasks.items.import_item(task_mock, self.import_item)
self.assertFalse(download_mock.called)
self.assertTrue(group_mock.called)
# Test that it properly errors if we try to import the same item again
self.import_item.completed = None
self.import_item.save()
with self.assertRaises(ValidationError):
tasks.items.import_item(task_mock, self.import_item)
asset_url_mock.return_value = [
[],
"",
]
self.import_item.completed = None
self.import_item.save()
tasks.items.import_item(task_mock, self.import_item)
self.assertFalse(download_mock.called)
self.assertTrue(group_mock.called)
def test_populate_item_from_data(self):
item = Item(item_url="http://example.com")
item_info = {
"title": "Test Title",
"description": "Test description",
"image_url": ["image.gif", "image.jpg", "image2.jpg"],
}
tasks.items.populate_item_from_data(item, item_info)
self.assertEqual(item.item_url, "http://example.com")
self.assertEqual(item.title, "Test Title")
self.assertEqual(item.description, "Test description")
self.assertEqual(item.thumbnail_url, "http://example.com/image.jpg")
def test_populate_item_from_data_handles_exception_and_returns_none(self):
# Proxy dict that explodes only when .get("image_url") is called,
# but still works with indexing for the earlier code path.
class ExplodingImageInfo(dict):
def get(self, key, default=None):
if key == "image_url":
raise RuntimeError("error")
return super().get(key, default)
item = Item(item_url="http://example.com")
info = ExplodingImageInfo(
{
"title": "T",
"description": "D",
"image_url": ["image.jpg"], # used by the earlier indexing path
}
)
result = tasks.items.populate_item_from_data(item, info)
# Early indexing still sets thumbnail_url, but the try/except branch
# should swallow the error and return None.
self.assertIsNone(result)
self.assertEqual(item.thumbnail_url, "http://example.com/image.jpg")
@override_settings(DEFAULT_FILE_STORAGE="django.core.files.storage.FileSystemStorage")
class DownloadItemThumbnailTests(TestCase):
class FakeResponse:
"""Minimal streamable response for mocking requests.get(...)."""
def __init__(self, content, content_type="image/png", on_iter=None):
self.headers = {"Content-Type": content_type} if content_type else {}
self._content = content
self._on_iter = on_iter
self._iter_called = False
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return False
def raise_for_status(self):
return
def iter_content(self, chunk_size=64 * 1024):
if self._on_iter and not self._iter_called:
self._on_iter()
self._iter_called = True
yield self._content
def setUp(self):
self.temp_media = tempfile.mkdtemp(prefix="test-media-")
self._override = override_settings(MEDIA_ROOT=self.temp_media)
self._override.enable()
def tearDown(self):
self._override.disable()
shutil.rmtree(self.temp_media, ignore_errors=True)
def make_image_bytes(self, fmt="PNG", size=(2, 2), color=(1, 2, 3)):
buf = io.BytesIO()
img = Image.new("RGB", size, color)
img.save(buf, format=fmt)
return buf.getvalue()
def test_skip_when_already_present_and_not_force(self):
item = create_item()
# Seed an existing thumbnail
item.thumbnail_image.save("existing.jpg", ContentFile(b"old"), save=True)
with mock.patch("importer.tasks.items.requests.get") as get_mock:
msg = download_and_set_item_thumbnail(item, "https://example.com/test.jpg")
self.assertIn("skipping", msg.lower())
self.assertFalse(get_mock.called)
item.refresh_from_db()
self.assertTrue(item.thumbnail_image.name.endswith("existing.jpg"))
self.assertTrue(default_storage.exists(item.thumbnail_image.name))
def test_success_with_content_type_extension(self):
item = create_item()
payload = self.make_image_bytes(fmt="PNG")
url = "https://example.com/path/name.png"
with mock.patch(
"importer.tasks.items.requests.get",
return_value=type(self).FakeResponse(payload, "image/png"),
):
saved = download_and_set_item_thumbnail(item, url)
item.refresh_from_db()
self.assertEqual(saved, item.thumbnail_image.name)
self.assertTrue(saved.endswith(".png"))
self.assertTrue(default_storage.exists(saved))
with default_storage.open(saved, "rb") as fh:
self.assertEqual(fh.read(), payload)
def test_fallback_extension_via_pillow_sniff_when_guess_is_bin(self):
item = create_item()
payload = self.make_image_bytes(fmt="PNG")
url = "https://example.com/noext" # no extension to force sniff path
with (
mock.patch("importer.tasks.items._guess_extension", return_value=".bin"),
mock.patch(
"importer.tasks.items.requests.get",
return_value=type(self).FakeResponse(payload, content_type=""),
),
):
saved = download_and_set_item_thumbnail(item, url)
item.refresh_from_db()
self.assertEqual(saved, item.thumbnail_image.name)
# Pillow sniff sees PNG, so .png via the mapping in the function
self.assertTrue(saved.endswith(".png"))
self.assertTrue(default_storage.exists(saved))
def test_invalid_image_raises_value_error(self):
item = create_item()
bad_bytes = b"not-an-image"
with mock.patch(
"importer.tasks.items.requests.get",
return_value=type(self).FakeResponse(bad_bytes, "application/octet-stream"),
):
with self.assertRaises(ValueError):
download_and_set_item_thumbnail(item, "https://example.com/notimg")
item.refresh_from_db()
self.assertFalse(bool(item.thumbnail_image))
def test_requests_exception_propagates(self):
item = create_item()
with mock.patch(
"importer.tasks.items.requests.get",
side_effect=requests.RequestException("error"),
):
with self.assertRaises(requests.RequestException):
download_and_set_item_thumbnail(item, "https://example.com/error")
def test_race_present_after_download_skips_final_save(self):
"""Simulate another writer saving the thumbnail mid-download."""
item = create_item()
def _concurrent_writer():
# Another process writes a thumbnail before the second transaction.
item.refresh_from_db()
item.thumbnail_image.save("pre.jpg", ContentFile(b"pre"), save=True)
payload = self.make_image_bytes(fmt="PNG")
with mock.patch(
"importer.tasks.items.requests.get",
return_value=type(self).FakeResponse(
payload, "image/png", on_iter=_concurrent_writer
),
):
msg = download_and_set_item_thumbnail(item, "https://example.com/new.png")
self.assertIn("skipping save", msg.lower())
item.refresh_from_db()
self.assertTrue(item.thumbnail_image.name.endswith("pre.jpg"))
self.assertTrue(default_storage.exists(item.thumbnail_image.name))
def test_force_overwrite_path_runs_and_sets_thumbnail(self):
item = create_item()
# Seed an existing thumbnail
item.thumbnail_image.save("existing.jpg", ContentFile(b"old"), save=True)
payload = self.make_image_bytes(fmt="PNG")
with mock.patch(
"importer.tasks.items.requests.get",
return_value=type(self).FakeResponse(payload, "image/png"),
):
saved = download_and_set_item_thumbnail(
item, "https://example.com/new.png", force=True
)
item.refresh_from_db()
self.assertEqual(saved, item.thumbnail_image.name)
self.assertTrue(saved.endswith(".png"))
self.assertTrue(default_storage.exists(saved))
def test_stream_with_empty_chunk_is_skipped(self):
item = create_item()
payload = self.make_image_bytes(fmt="PNG")
url = "https://example.com/streamed.png"
class TwoChunkResponse:
def __init__(self, content, content_type="image/png"):
self.headers = {"Content-Type": content_type}
self._chunks = [b"", content] # first empty, then real data
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return False
def raise_for_status(self):
return
def iter_content(self, chunk_size=64 * 1024):
for c in self._chunks:
yield c
with mock.patch(
"importer.tasks.items.requests.get",
return_value=TwoChunkResponse(payload, "image/png"),
):
saved = download_and_set_item_thumbnail(item, url)
item.refresh_from_db()
self.assertEqual(saved, item.thumbnail_image.name)
self.assertTrue(saved.endswith(".png"))
with default_storage.open(saved, "rb") as fh:
self.assertEqual(fh.read(), payload)
def test_guess_extension_uses_url_path_extension_lowercases(self):
self.assertEqual(
_guess_extension("", "/path/TO/NAME.JPG"),
".jpg",
)
def test_guess_extension_returns_bin_when_no_ext_and_no_content_type(self):
self.assertEqual(
_guess_extension("", "/noext"),
".bin",
)
@mock.patch("importer.tasks.items.mimetypes.guess_extension", return_value=None)
def test_header_guess_none_uses_url_extension(self, _guess):
item = create_item()
payload = self.make_image_bytes(fmt="JPEG")
# Upper-case extension to assert lower-casing behavior
url = "https://example.com/path/name.JPEG"
with mock.patch(
"importer.tasks.items.requests.get",
return_value=type(self).FakeResponse(payload, "image/unknown"),
):
saved = download_and_set_item_thumbnail(item, url)
item.refresh_from_db()
self.assertTrue(saved.endswith(".jpeg"))
class GetAssetUrlsFromItemResourcesTests(TestCase):
def test_empty_resources(self):
assets, resource_url = tasks.items.get_asset_urls_from_item_resources([])
self.assertEqual(assets, [])
self.assertEqual(resource_url, "")
def test_missing_item_resource_url_key(self):
resources = [
{
# 'url' intentionally omitted to hit KeyError path
"files": [
[
{
"url": "http://example.com/ok.jpg",
"height": 2,
"width": 2,
"mimetype": "image/jpeg",
},
{"url": "http://example.com/missing_dims.jpg"}, # skipped
]
],
}
]
assets, resource_url = tasks.items.get_asset_urls_from_item_resources(resources)
self.assertEqual(resource_url, "")
self.assertEqual(assets, ["http://example.com/ok.jpg"])
def test_files_key_missing(self):
resources = [{"url": "http://example.com"}] # no 'files' key
assets, resource_url = tasks.items.get_asset_urls_from_item_resources(resources)
self.assertEqual(assets, [])
self.assertEqual(resource_url, "http://example.com")
def test_picks_largest_jpeg_when_present(self):
resources = [
{
"url": "http://example.com",
"files": [
[
{
"url": "http://example.com/small.jpg",
"height": 1,
"width": 1,
"mimetype": "image/jpeg",
},
{
"url": "http://example.com/large.jpg",
"height": 3,
"width": 3,
"mimetype": "image/jpeg",
},
]
],
}
]
assets, resource_url = tasks.items.get_asset_urls_from_item_resources(resources)
self.assertEqual(resource_url, "http://example.com")
self.assertEqual(assets, ["http://example.com/large.jpg"])
def test_falls_back_to_largest_gif_when_no_jpeg(self):
resources = [
{
"url": "http://example.com",
"files": [
[
{
"url": "http://example.com/small.gif",
"height": 2,
"width": 2,
"mimetype": "image/gif",
},
{
"url": "http://example.com/large.gif",
"height": 5,
"width": 5,
"mimetype": "image/gif",
},
# unacceptable types are ignored
{
"url": "http://example.com/file.tif",
"height": 100,
"width": 100,
"mimetype": "image/tiff",
},
]
],
}
]
assets, resource_url = tasks.items.get_asset_urls_from_item_resources(resources)
self.assertEqual(resource_url, "http://example.com")
self.assertEqual(assets, ["http://example.com/large.gif"])
def test_variants_missing_required_keys_are_ignored(self):
resources = [
{
"url": "http://example.com",
"files": [
[
{"url": "http://example.com/nw.jpg", "height": 2}, # no width
{"height": 2, "width": 2, "mimetype": "image/jpeg"}, # no url
{
"url": "http://example.com/valid.jpg",
"height": 2,
"width": 3,
"mimetype": "image/jpeg",
},
]
],
}
]
assets, resource_url = tasks.items.get_asset_urls_from_item_resources(resources)
self.assertEqual(resource_url, "http://example.com")
self.assertEqual(assets, ["http://example.com/valid.jpg"])
def test_no_candidates_or_backups_skips_appending(self):
resources = [
{
"url": "http://example.com",
"files": [
[
{
"url": "http://example.com/file1.tif",
"height": 10,
"width": 10,
"mimetype": "image/tiff", # unsupported
},
{
"url": "http://example.com/file2",
"height": 5,
"width": 5,
# no mimetype -> not added to candidates/backups
},
]
],
}
]
assets, resource_url = tasks.items.get_asset_urls_from_item_resources(resources)
self.assertEqual(resource_url, "http://example.com")
self.assertEqual(assets, [])
================================================
FILE: importer/tests/test_utils.py
================================================
import uuid
from unittest import mock
from django.test import TestCase
from concordia.tests.utils import create_asset
from importer.models import VerifyAssetImageJob
from importer.utils import create_verify_asset_image_job_batch
from importer.utils.excel import clean_cell_value, slurp_excel
class CreateVerifyAssetImageJobBatchTests(TestCase):
def setUp(self):
self.batch_id = uuid.uuid4()
self.asset = create_asset()
self.assets = [self.asset] + [
create_asset(item=self.asset.item, slug=f"test-asset-{i}")
for i in range(1, 5)
]
self.asset_pks = [asset.pk for asset in self.assets]
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_create_jobs_single_batch(self, mock_task):
job_count, batch_url = create_verify_asset_image_job_batch(
self.asset_pks, self.batch_id
)
self.assertEqual(job_count, 5)
self.assertEqual(
VerifyAssetImageJob.objects.filter(batch=self.batch_id).count(), 5
)
mock_task.assert_called_once_with(batch=self.batch_id)
self.assertEqual(
batch_url, VerifyAssetImageJob.get_batch_admin_url(self.batch_id)
)
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_create_jobs_multiple_batches(self, mock_task):
asset_pks = self.asset_pks + [
asset.pk
for asset in [
create_asset(item=self.asset.item, slug=f"test-asset-{i}")
for i in range(5, 150)
]
]
job_count, _ = create_verify_asset_image_job_batch(asset_pks, self.batch_id)
self.assertEqual(job_count, 150)
self.assertEqual(
VerifyAssetImageJob.objects.filter(batch=self.batch_id).count(), 150
)
mock_task.assert_called_once_with(batch=self.batch_id)
@mock.patch("importer.tasks.images.batch_verify_asset_images_task.delay")
def test_no_assets_provided(self, mock_task):
job_count, batch_url = create_verify_asset_image_job_batch([], self.batch_id)
self.assertEqual(job_count, 0)
self.assertEqual(
VerifyAssetImageJob.objects.filter(batch=self.batch_id).count(), 0
)
mock_task.assert_called_once_with(batch=self.batch_id)
self.assertEqual(
batch_url, VerifyAssetImageJob.get_batch_admin_url(self.batch_id)
)
class ExcelUtilsTests(TestCase):
class _Cell:
def __init__(self, data_type, value):
self.data_type = data_type
self.value = value
class _Worksheet:
def __init__(self, rows):
# rows is a list of tuples of _Cell
self._rows = rows
@property
def rows(self):
return iter(self._rows)
class _Workbook:
def __init__(self, worksheets):
self.worksheets = worksheets
@mock.patch("importer.utils.excel.load_workbook")
def test_slurp_excel_single_worksheet_single_row(self, load_mock):
ws_rows = [
(
type(self)._Cell("s", " Name "),
type(self)._Cell("s", "Age"),
),
(
type(self)._Cell("s", " Alice "),
type(self)._Cell("n", 30),
),
]
wb = type(self)._Workbook([type(self)._Worksheet(ws_rows)])
load_mock.return_value = wb
out = slurp_excel("ignored.xlsx")
self.assertEqual(out, [{"Name": "Alice", "Age": 30}])
@mock.patch("importer.utils.excel.load_workbook")
def test_slurp_excel_multiple_worksheets_multiple_rows(self, load_mock):
ws1_rows = [
(type(self)._Cell("s", "H1"),),
(type(self)._Cell("s", "v1"),),
(type(self)._Cell("s", " v2 "),),
]
ws2_rows = [
(
type(self)._Cell("s", " H2 "),
type(self)._Cell("s", "H3"),
),
(
type(self)._Cell("n", 1),
type(self)._Cell("s", " x "),
),
]
wb = type(self)._Workbook(
[type(self)._Worksheet(ws1_rows), type(self)._Worksheet(ws2_rows)]
)
load_mock.return_value = wb
out = slurp_excel("ignored.xlsx")
# Order is by worksheet, then row order within each worksheet.
self.assertEqual(
out,
[
{"H1": "v1"},
{"H1": "v2"},
{"H2": 1, "H3": "x"},
],
)
def test_clean_cell_value_trims_strings(self):
c = type(self)._Cell("s", " padded ")
self.assertEqual(clean_cell_value(c), "padded")
def test_clean_cell_value_passthrough_non_strings(self):
c_num = type(self)._Cell("n", 42)
c_bool = type(self)._Cell("b", True)
self.assertEqual(clean_cell_value(c_num), 42)
self.assertTrue(clean_cell_value(c_bool))
================================================
FILE: importer/tests/utils.py
================================================
from django.utils.text import slugify
from concordia.tests.utils import create_asset, create_item, create_project
from importer.models import (
DownloadAssetImageJob,
ImportItem,
ImportItemAsset,
ImportJob,
VerifyAssetImageJob,
)
def create_import_job(*, project=None, **kwargs):
# project is a concordia.models.Project instance
if project is None:
project = create_project()
import_job = ImportJob(project=project, **kwargs)
import_job.save()
return import_job
def create_import_item(item=None, project=None, import_job=None, **kwargs):
# item is a concordia.models.Item instance
# project is a concordia.models.Project instance
# import_job is an importer.models.ImportJob instance
if import_job is None:
import_job = create_import_job(project=project)
if item is None:
item = create_item(project=import_job.project)
import_item = ImportItem(item=item, job=import_job, **kwargs)
import_item.save()
return import_item
def create_import_asset(
sequence_number=1,
asset=None,
item=None,
import_item=None,
project=None,
import_job=None,
**kwargs,
):
# sequence_number has to be unique to a particular import_item
# asset is a concordia.models.Asset instance
# item is a concordia.models.Item instance
# import_item is an importer.models.ImportItem instance
# project is a concordia.models.Project instance
# import_job is an importer.models.ImportJob instance
if import_item is None:
import_item = create_import_item(
item=item, import_job=import_job, project=project
)
if asset is None:
item_slug = slugify(import_item.item.title)
slug = f"{item_slug}-{sequence_number}"
asset = create_asset(item=import_item.item, slug=slug)
import_asset = ImportItemAsset(
sequence_number=sequence_number, asset=asset, import_item=import_item, **kwargs
)
import_asset.save()
return import_asset
def create_verify_asset_image_job(asset=None, batch=None, **kwargs):
"""
Create a VerifyAssetImageJob instance.
If no asset is provided, a new one is created.
"""
if asset is None:
asset = create_asset()
job = VerifyAssetImageJob.objects.create(asset=asset, batch=batch, **kwargs)
return job
def create_download_asset_image_job(asset=None, batch=None, **kwargs):
"""
Create a DownloadAssetImageJob instance.
If no asset is provided, a new one is created.
"""
if asset is None:
asset = create_asset()
job = DownloadAssetImageJob.objects.create(asset=asset, batch=batch, **kwargs)
return job
================================================
FILE: importer/utils/__init__.py
================================================
from .excel import slurp_excel
from .verify_images import create_verify_asset_image_job_batch
__all__ = [
"slurp_excel",
"create_verify_asset_image_job_batch",
]
================================================
FILE: importer/utils/excel.py
================================================
from typing import Any
from openpyxl import load_workbook
from openpyxl.cell.cell import Cell
def slurp_excel(filename: str) -> list[dict[str, Any]]:
"""
Parse an Excel workbook into a list of row dictionaries.
Each worksheet is read in order. The first row of each sheet is treated
as the header; subsequent rows become dictionaries mapping header names
to cell values (after basic cleaning via `clean_cell_value`).
Args:
filename (str): Path to the XLSX file.
Returns:
list[dict[str, Any]]: One dict per non-header row across all sheets.
"""
wb = load_workbook(filename=filename)
cells: list[dict[str, Any]] = []
for worksheet in wb.worksheets:
rows = worksheet.rows
headers = [clean_cell_value(i) for i in next(rows)]
for row in rows:
values: list[Any] = []
for cell in row:
values.append(clean_cell_value(cell))
cells.append(dict(zip(headers, values, strict=True)))
return cells
def clean_cell_value(cell: Cell) -> Any:
"""
Return a normalized Python value for an openpyxl cell.
If the cell is a string type ('s'), leading/trailing whitespace is stripped;
otherwise the raw value is returned.
Args:
cell (Cell): openpyxl cell to normalize.
Returns:
Any: Cleaned value suitable for serialization.
"""
if cell.data_type in ("s",):
return cell.value.strip()
else:
return cell.value
================================================
FILE: importer/utils/verify_images.py
================================================
from collections.abc import Iterable
from itertools import islice
from uuid import UUID
from importer.models import VerifyAssetImageJob
from importer.tasks.images import batch_verify_asset_images_task
BATCH_SIZE: int = 100
def create_verify_asset_image_job_batch(
asset_pks: Iterable[int],
batch: UUID,
) -> tuple[int, str]:
"""
Create verification jobs in chunks and enqueue a single batch task.
Iterates through the provided asset primary keys in chunks of
`BATCH_SIZE`, creating `VerifyAssetImageJob` rows via `bulk_create`.
After all jobs are created, schedules the Celery task that verifies
the images for the given batch. Returns the number of jobs created
and the admin URL prefiltered to the batch.
Args:
asset_pks (Iterable[int]): Asset primary keys to generate jobs for.
batch (UUID): Identifier to group jobs; unrelated to chunk size.
Returns:
tuple[int, str]: A pair of `(job_count, batch_admin_url)`.
"""
job_count = 0
# Make sure asset_pks is an iterator, for proper use with islice
# Not doing this causes an infinite loop if asset_pks is not an iterator/generator
asset_pks = iter(asset_pks)
while True:
asset_batch = list(islice(asset_pks, BATCH_SIZE))
if not asset_batch:
break
job_count += len(
VerifyAssetImageJob.objects.bulk_create(
[
VerifyAssetImageJob(asset_id=asset_pk, batch=batch)
for asset_pk in asset_batch
],
batch_size=BATCH_SIZE,
)
)
batch_verify_asset_images_task.delay(batch=batch)
return job_count, VerifyAssetImageJob.get_batch_admin_url(batch)
================================================
FILE: load_test.sh
================================================
#!/bin/bash
set -euo pipefail
LOCUST_USERS="${LOCUST_USERS:-100}"
LOCUST_SPAWN_RATE="${LOCUST_SPAWN_RATE:-2}"
LOCUST_RUN_TIME="${LOCUST_RUN_TIME:-1m30s}"
LOCUST_HOST="${LOCUST_HOST:-https://crowd-dev.loc.gov}"
exec locust \
--headless \
-u "${LOCUST_USERS}" \
-r "${LOCUST_SPAWN_RATE}" \
--run-time "${LOCUST_RUN_TIME}" \
--host "${LOCUST_HOST}"
================================================
FILE: locustfile.py
================================================
import logging
import random
import string
import time
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import urlencode, urljoin, urlparse
from gevent import sleep
from gevent.event import Event
from locust import HttpUser, between, events, runners, task
from locust.exception import StopUser
ABORT_WHEN_NO_WORK = True # stop the run if a next-* page has no work
NO_WORK_DUMP_HTML = False # set True to write an HTML dump for debugging
HOMEPAGE_PATH = "/"
NEXT_ASSET_PATH = "/next-transcribable-asset/"
NEXT_REVIEWABLE_ASSET_PATH = "/next-reviewable-asset/"
AJAX_STATUS_PATH = "/account/ajax-status/"
AJAX_MSG_PATH = "/account/ajax-messages/"
LOGIN_PATH = "/account/login/"
CSRF_COOKIE_NAME = "csrftoken"
SESSION_COOKIE_NAME = "sessionid"
CSRF_SEED_PATH = HOMEPAGE_PATH
POST_FIELD_NAME = "text"
POST_MIN_CHARS = 10
POST_MAX_CHARS = 200
SAME_PAGE_REPEAT_PROB = 0.75
REDIRECT_RETRIES = 3
REDIRECT_BACKOFF = 0.25
TEST_USER_PREFIX = "locusttest"
TEST_USER_DOMAIN = "example.test"
TEST_USER_COUNT = 10_000
TEST_USER_PASSWORD = "locustpass123" # nosec B105
LOGIN_BAD_PASSWORD_PROB = 0.10
LOGIN_MAX_ATTEMPTS = 5
REVIEWER_SHARE = 0.20
REVIEW_EDIT_PROB = 0.50
NO_WORK_ERROR_MESSAGE = (
"Did you need to refresh the load test database? "
"Try running the 'prepare_load_test_db' command or "
"'create_load_test_fixtures' if you need fixtures first."
)
logger = logging.getLogger(__name__)
# ---------- global abort plumbing ----------
GLOBAL_ABORT_EVENT: Event = Event()
GLOBAL_ABORT_REASON: str | None = None
@events.init.add_listener
def _on_locust_init(environment, **_):
# stop immediately; don’t wait for graceful wind down
environment.stop_timeout = 0
# Register a message handler so both master and workers react to global abort
runner = getattr(environment, "runner", None)
if not runner:
return
def _handle_global_abort(env, msg, **kwargs):
reason = ""
try:
data = getattr(msg, "data", {}) or {}
reason = data.get("reason") or ""
except Exception:
pass
_trigger_global_abort(
env, f"Global abort requested. {reason}", dump_html=None, broadcast=True
)
try:
runner.register_message("global-abort", _handle_global_abort)
except Exception as e:
logger.debug("register_message failed (non-distributed run is fine): %s", e)
@events.quitting.add_listener
def _on_quitting(environment, **_):
"""Print a final, unmissable banner at shutdown."""
if not (GLOBAL_ABORT_EVENT.is_set() or GLOBAL_ABORT_REASON):
return
reason = GLOBAL_ABORT_REASON or "Aborted"
banner = (
"\n" + "=" * 80 + "\n"
" LOAD TEST ABORTED\n" + "-" * 80 + "\n"
f"{reason}\n\n{NO_WORK_ERROR_MESSAGE}\n" + "=" * 80 + "\n"
)
# Print to stdout and log as error so it's visible in any context
try:
print(banner, flush=True)
except Exception:
pass
logger.error(banner)
def _trigger_global_abort(
environment, reason: str, dump_html: str | None = None, *, broadcast: bool = True
) -> None:
"""
Set a global flag so all users bail, set a failing exit code,
and in distributed mode coordinate master<->workers via custom messages.
"""
global GLOBAL_ABORT_REASON
if GLOBAL_ABORT_EVENT.is_set():
return
GLOBAL_ABORT_REASON = reason
GLOBAL_ABORT_EVENT.set()
logger.error("Aborting load test: %s", reason)
logger.error(NO_WORK_ERROR_MESSAGE)
if dump_html:
try:
ts = int(time.time())
out = Path(f"no_work_{ts}.html").resolve()
out.write_text(dump_html, encoding="utf-8")
logger.error("No-work HTML dumped to %s", out)
except Exception as e:
logger.error("Failed to dump no-work HTML (%s)", e)
try:
if hasattr(environment, "process_exit_code"):
environment.process_exit_code = 2
except Exception:
pass
runner = getattr(environment, "runner", None)
if not runner:
return
try:
# Worker that discovers the problem -> tell master
if isinstance(runner, runners.WorkerRunner):
runner.send_message("global-abort", {"reason": reason})
# Master -> broadcast to all workers
if broadcast and isinstance(runner, runners.MasterRunner):
runner.send_message("global-abort", {"reason": reason})
runner.quit()
except Exception as e:
logger.error("Error quitting runner: %s", e)
# ---------- helpers ----------
def _is_local(path_or_url: str, base: str) -> bool:
if not path_or_url:
return False
if path_or_url.startswith("/"):
return True
parsed = urlparse(path_or_url)
if not parsed.scheme:
return True
return urlparse(base).netloc == parsed.netloc
class _ResourceParser(HTMLParser):
"""Extract local script and stylesheet URLs from the page."""
def __init__(self, base_url: str):
super().__init__()
self.base_url = base_url
self.resources = []
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
if tag == "script":
src = attrs.get("src")
if src and _is_local(src, self.base_url):
self.resources.append(urljoin(self.base_url, src))
elif tag == "link":
rel = (attrs.get("rel") or "").lower()
href = attrs.get("href")
if "stylesheet" in rel and href and _is_local(href, self.base_url):
self.resources.append(urljoin(self.base_url, href))
class _AssetPageParser(HTMLParser):
"""
Extract form action, supersedes, reserve URL
and review endpoints from an asset page.
"""
def __init__(self, base_url: str):
super().__init__()
self.base_url = base_url
self.in_transcription_form = False
self.form_action = None
self.supersedes = None
self.reserve_url = None
self.review_url = None
self.submit_url = None
def handle_starttag(self, tag, attrs):
a = dict(attrs)
if tag == "form":
if a.get("id") == "transcription-editor":
self.in_transcription_form = True
action = a.get("action")
if action is not None:
resolved = (
self.base_url
if action.strip() == ""
else urljoin(self.base_url, action)
)
self.form_action = resolved
review_attr = a.get("data-review-url")
if review_attr:
self.review_url = urljoin(self.base_url, review_attr)
submit_attr = a.get("data-submit-url")
if submit_attr:
self.submit_url = urljoin(self.base_url, submit_attr)
elif tag == "input":
if a.get("name") == "supersedes" and a.get("value"):
self.supersedes = a["value"]
elif tag == "script":
if a.get("id") == "asset-reservation-data":
reserve = a.get("data-reserve-asset-url")
if reserve:
self.reserve_url = urljoin(self.base_url, reserve)
def handle_endtag(self, tag):
if tag == "form" and self.in_transcription_form:
self.in_transcription_form = False
def _random_text(min_len=10, max_len=200) -> str:
n = random.randint(min_len, max_len)
alphabet = string.ascii_letters + string.digits + " "
s = "".join(random.choice(alphabet) for _ in range(n))
return " ".join(s.split())
# ---------- users ----------
class BaseBrowsingUser(HttpUser):
"""
Shared browse/post behavior. Subclasses provide their own on_start.
"""
abstract = True
wait_time = between(3.0, 8.0)
current_target_path: str | None = None
current_form_action_path: str | None = None
current_supersedes: str | None = None
current_reserve_path: str | None = None
current_review_url_path: str | None = None
current_submit_url_path: str | None = None
next_redirect_path: str = NEXT_ASSET_PATH
next_redirect_label: str = "next asset (redirect)"
_fatal_already_triggered = False
def _fatal_dump_and_quit(self, page_url: str, html: str) -> None:
if self.__class__._fatal_already_triggered:
return
self.__class__._fatal_already_triggered = True
ts = int(time.time())
out = Path(f"asset_parse_failure_{ts}.html").resolve()
try:
out.write_text(html or "", encoding="utf-8")
logger.error(
"FATAL: transcription form not found. Page URL=%s ; HTML dumped to %s",
page_url,
out,
)
except Exception as e:
logger.error(
"FATAL: failed to write HTML dump (%s). Page URL=%s", e, page_url
)
try:
self.environment.runner.quit()
except Exception as e:
logger.error("Error calling runner.quit(): %s", e)
def _after_request_ajax(self):
# simulate normal page load
self.client.get(AJAX_STATUS_PATH, name="AJAX status")
self.client.get(AJAX_MSG_PATH, name="AJAX messaging")
def _get(self, path_or_url: str, *, page: bool = True, **kwargs):
r = self.client.get(path_or_url, **kwargs)
if page:
self._after_request_ajax()
return r
def _post(self, path_or_url: str, **kwargs):
return self.client.post(path_or_url, **kwargs)
def _load_homepage_and_resources(self, *, name_suffix: str = ""):
base = self.environment.host.rstrip("/")
r_home = self._get(HOMEPAGE_PATH, page=True)
parser = _ResourceParser(base_url=base + "/")
try:
parser.feed(r_home.text or "")
except Exception:
parser.resources = []
for res_url in parser.resources:
label = "resource " + urlparse(res_url).path
if name_suffix:
label = f"{label} {name_suffix}"
self._get(res_url, name=label, page=False)
def _parse_asset_page_and_reserve(self, target_path: str) -> None:
base = self.environment.host.rstrip("/")
r = self._get(target_path, name="target page", page=True)
parser = _AssetPageParser(base_url=r.url)
try:
parser.feed(r.text or "")
except Exception:
self._fatal_dump_and_quit(r.url, r.text or "")
return
if parser.form_action:
fa = urlparse(parser.form_action)
self.current_form_action_path = fa.path + (
("?" + fa.query) if fa.query else ""
)
else:
self.current_form_action_path = None
self.current_supersedes = parser.supersedes
if parser.reserve_url:
ru = urlparse(parser.reserve_url)
self.current_reserve_path = ru.path + (("?" + ru.query) if ru.query else "")
else:
self.current_reserve_path = None
if parser.review_url:
rvu = urlparse(parser.review_url)
self.current_review_url_path = rvu.path + (
("?" + rvu.query) if rvu.query else ""
)
else:
self.current_review_url_path = None
if parser.submit_url:
su = urlparse(parser.submit_url)
self.current_submit_url_path = su.path + (
("?" + su.query) if su.query else ""
)
else:
self.current_submit_url_path = None
if not self.current_form_action_path:
if ABORT_WHEN_NO_WORK:
_trigger_global_abort(
self.environment,
f"No work available (no transcription form) on {r.url}",
(r.text or "") if NO_WORK_DUMP_HTML else None,
broadcast=True,
)
else:
logger.info("No transcription form on %s; treating as no work", r.url)
self.current_target_path = None
self.current_review_url_path = None
self.current_submit_url_path = None
return
if self.current_reserve_path:
csrftoken = self.client.cookies.get(CSRF_COOKIE_NAME)
referer = urljoin(base + "/", target_path.lstrip("/"))
self._post(
self.current_reserve_path,
headers={"X-CSRFToken": csrftoken or "", "Referer": referer},
name="reserve asset",
)
def _ensure_csrf(self, target_path: str | None) -> str | None:
if not target_path:
return None
if (
self.current_form_action_path is None
and self.current_review_url_path is None
):
self._parse_asset_page_and_reserve(target_path)
csrftoken = self.client.cookies.get(CSRF_COOKIE_NAME)
if not csrftoken and CSRF_SEED_PATH:
self._get(CSRF_SEED_PATH, name="csrf seed", page=True)
self._parse_asset_page_and_reserve(target_path)
csrftoken = self.client.cookies.get(CSRF_COOKIE_NAME)
return csrftoken
def _follow_next(self, redirect_path: str, label: str) -> str | None:
"""
Follow the next-* redirect. If it lands on the homepage, treat that as no work.
"""
last_body = None
for attempt in range(1, REDIRECT_RETRIES + 1):
with self.client.get(
redirect_path,
name=label,
allow_redirects=True,
catch_response=True,
) as resp:
try:
last_body = (resp.text or "")[:10000]
except Exception:
last_body = None
if 200 <= resp.status_code < 400:
final_path = urlparse(resp.url).path or "/"
if final_path == HOMEPAGE_PATH:
msg = f"{label} landed on homepage -> no work"
resp.failure(msg)
logger.error(msg)
if ABORT_WHEN_NO_WORK:
_trigger_global_abort(
self.environment,
f"No work available from {label} ({redirect_path})",
last_body if NO_WORK_DUMP_HTML else None,
broadcast=True,
)
return None
return final_path
msg = (
f"redirect failed (status={resp.status_code}) "
f"attempt={attempt}/{REDIRECT_RETRIES}"
)
resp.failure(msg)
logger.warning("%s retry: %s", label, msg)
sleep(REDIRECT_BACKOFF * attempt)
logger.error("%s: all %d retries failed", label, REDIRECT_RETRIES)
return None
def _post_then_get_same_page(
self, target_path: str | None, csrftoken: str, name_prefix: str
):
if not target_path:
return
base = self.environment.host.rstrip("/")
referer = urljoin(base + "/", target_path.lstrip("/"))
post_path = self.current_form_action_path
if not post_path:
logger.warning("No form action parsed for %s; skipping POST", target_path)
return
data = {POST_FIELD_NAME: _random_text(POST_MIN_CHARS, POST_MAX_CHARS)}
if self.current_supersedes:
data["supersedes"] = self.current_supersedes
self._post(
post_path,
data=data,
headers={"X-CSRFToken": csrftoken, "Referer": referer},
name=f"{name_prefix} POST",
)
self._parse_asset_page_and_reserve(target_path)
def _review_decision(self, target_path: str, decision: str) -> None:
if not self.current_review_url_path:
return
base = self.environment.host.rstrip("/")
referer = urljoin(base + "/", target_path.lstrip("/"))
csrftoken = self.client.cookies.get(CSRF_COOKIE_NAME) or ""
form = {"csrfmiddlewaretoken": csrftoken, "decision": decision}
name = "review accept" if decision == "accept" else "review reject"
self._post(
self.current_review_url_path,
data=form,
headers={"X-CSRFToken": csrftoken, "Referer": referer},
name=name,
)
@task
def browse_and_submit(self):
# if someone already pulled the plug, stop this user
if GLOBAL_ABORT_EVENT.is_set():
raise StopUser()
if not self.current_target_path:
new_path = self._follow_next(
self.next_redirect_path, self.next_redirect_label
)
if new_path is None:
return
self.current_target_path = new_path
self.current_form_action_path = None
self.current_supersedes = None
self.current_reserve_path = None
self.current_review_url_path = None
self.current_submit_url_path = None
else:
maybe_switch = getattr(self, "is_reviewer", False) is False
if maybe_switch and random.random() >= SAME_PAGE_REPEAT_PROB:
new_path = self._follow_next(
self.next_redirect_path, self.next_redirect_label
)
if new_path is None:
return
self.current_target_path = new_path
self.current_form_action_path = None
self.current_supersedes = None
self.current_reserve_path = None
self.current_review_url_path = None
self.current_submit_url_path = None
csrftoken = self._ensure_csrf(self.current_target_path)
if not csrftoken:
if self.current_target_path:
self._get(
self.current_target_path, name="target page (no CSRF)", page=True
)
return
if getattr(self, "is_reviewer", False):
do_edit = random.random() < REVIEW_EDIT_PROB
if do_edit:
self._review_decision(self.current_target_path, "reject")
self._parse_asset_page_and_reserve(self.current_target_path)
csrftoken = self._ensure_csrf(self.current_target_path) or ""
if csrftoken:
self._post_then_get_same_page(
self.current_target_path, csrftoken, "review edit save"
)
else:
self._review_decision(self.current_target_path, "accept")
self.current_target_path = None
self.current_form_action_path = None
self.current_supersedes = None
self.current_reserve_path = None
self.current_review_url_path = None
self.current_submit_url_path = None
return
# Transcriber branch
self._post_then_get_same_page(self.current_target_path, csrftoken, "target")
if random.random() < SAME_PAGE_REPEAT_PROB:
csrftoken = self.client.cookies.get(CSRF_COOKIE_NAME) or self._ensure_csrf(
self.current_target_path
)
if csrftoken:
self._post_then_get_same_page(
self.current_target_path, csrftoken, "target (repeat)"
)
class AnonUser(BaseBrowsingUser):
"""Anonymous user flow."""
def on_start(self):
self._load_homepage_and_resources()
class AuthUser(BaseBrowsingUser):
"""Authenticated user flow."""
chosen_username: str | None = None
chosen_email: str | None = None
is_reviewer: bool = False
def _pick_fixture_user(self):
index = random.randint(1, TEST_USER_COUNT)
username = f"{TEST_USER_PREFIX}{index:05d}"
email = f"{username}@{TEST_USER_DOMAIN}"
self.chosen_username = username
self.chosen_email = email
def _login_once(self, login_url: str, referer: str) -> bool:
csrftoken = self.client.cookies.get(CSRF_COOKIE_NAME) or ""
if not csrftoken:
self._get(login_url, name="login page", page=True)
csrftoken = self.client.cookies.get(CSRF_COOKIE_NAME) or ""
assert self.chosen_username and self.chosen_email
identifier = (
self.chosen_username if random.random() < 0.5 else self.chosen_email
)
wrong = random.random() < LOGIN_BAD_PASSWORD_PROB
password = TEST_USER_PASSWORD if not wrong else TEST_USER_PASSWORD + "x"
form = {
"username": identifier,
"password": password,
"csrfmiddlewaretoken": csrftoken,
"next": "/",
}
self._post(
login_url,
data=form,
headers={"X-CSRFToken": csrftoken, "Referer": referer},
name="login POST",
)
has_session = bool(self.client.cookies.get(SESSION_COOKIE_NAME))
if has_session:
return True
self._get("/", name="post-login home probe", page=True)
has_session = bool(self.client.cookies.get(SESSION_COOKIE_NAME))
return has_session
def on_start(self):
self._get(HOMEPAGE_PATH, page=True)
self._pick_fixture_user()
query = urlencode({"next": "/"})
login_url = f"{LOGIN_PATH}?{query}"
base = self.environment.host.rstrip("/")
referer = urljoin(base + "/", LOGIN_PATH.lstrip("/"))
self._get(login_url, name="login page", page=True)
success = False
for _ in range(LOGIN_MAX_ATTEMPTS):
if self._login_once(login_url, referer):
success = True
break
self._get(login_url, name="login page (retry)", page=True)
if not success:
logger.error(
"AuthUser failed to authenticate after %d attempts (user=%s / %s)",
LOGIN_MAX_ATTEMPTS,
self.chosen_username,
self.chosen_email,
)
self.is_reviewer = random.random() < REVIEWER_SHARE
if self.is_reviewer:
self.next_redirect_path = NEXT_REVIEWABLE_ASSET_PATH
self.next_redirect_label = "next reviewable (redirect)"
else:
self.next_redirect_path = NEXT_ASSET_PATH
self.next_redirect_label = "next asset (redirect)"
self._load_homepage_and_resources(name_suffix="(authed)")
================================================
FILE: manage.py
================================================
#!/usr/bin/env python3
import sys
if __name__ == "__main__":
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)
================================================
FILE: package.json
================================================
{
"devDependencies": {
"@axe-core/cli": "^4.11.3",
"@puppeteer/browsers": "^2.10.13",
"child_process": "^1.0.2",
"sass-embedded": "^1.99.0",
"stylelint-value-no-unknown-custom-properties": "^6.1.1",
"vite": "^8.0.10",
"vite-plugin-compression2": "^2.5.2"
},
"dependencies": {
"@duetds/date-picker": "^1.4.0",
"@fortawesome/fontawesome-free": "^7.1.0",
"@popperjs/core": "^2.11.8",
"@sentry/browser": "^10.49.0",
"@sentry/core": "^10.48.0",
"@sentry/tracing": "^7.120.4",
"bootstrap": "^5.3.8",
"chart.js": "^4.5.1",
"chroma-js": "^3.2.0",
"codemirror": "^5.65.19",
"fancy-log": "^2.0.0",
"jquery": "^3.5.1",
"js-cookie": "^3.0.5",
"openseadragon": "^6.0.2",
"openseadragon-filters": "^2.2.0",
"prettier": "^2.8.8",
"remarkable": "^2.0.1",
"screenfull": "^6.0.0",
"split.js": "^1.6.2",
"urijs": "^1.19.11"
},
"name": "concordia",
"private": true,
"version": "1.0.0",
"directories": {
"doc": "docs"
},
"repository": {
"type": "git",
"url": "git+https://github.com/LibraryOfCongress/concordia.git"
},
"license": "CC0-1.0",
"bugs": {
"url": "https://github.com/LibraryOfCongress/concordia/issues"
},
"homepage": "https://github.com/LibraryOfCongress/concordia",
"scripts": {
"dev": "vite",
"copy-vendor": "mkdir -p concordia/static/openseadragon/images && cp -R node_modules/openseadragon/build/openseadragon/images/* concordia/static/openseadragon/images/",
"build": "npm run copy-vendor && vite build",
"preview": "vite preview",
"postinstall": "npm run copy-vendor"
}
}
================================================
FILE: postgresql/create-multiple-postgresql-databases.sh
================================================
#!/bin/bash
set -e
set -u
function create_user_and_database() {
local database=$1
echo " Creating user and database '$database'"
psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<-EOSQL
CREATE USER $database;
CREATE DATABASE $database;
GRANT ALL PRIVILEGES ON DATABASE $database TO $database;
EOSQL
}
if [ -n "$POSTGRES_MULTIPLE_DATABASES" ]; then
echo "Multiple database creation requested: $POSTGRES_MULTIPLE_DATABASES"
for db in $(echo $POSTGRES_MULTIPLE_DATABASES | tr ',' ' '); do
create_user_and_database $db
if [ $db = "sentry" ]
then
echo " Giving sentry superuser powers!!"
psql -v ON_ERROR_STOP=1 --username postgres -c "ALTER ROLE sentry superuser;"
fi
done
echo "Multiple databases created"
fi
================================================
FILE: prometheus_metrics/LICENSE
================================================
MIT License
Copyright (c) 2017 Jimdo GmbH
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: prometheus_metrics/__init__.py
================================================
================================================
FILE: prometheus_metrics/apps.py
================================================
from django.apps import AppConfig
class PrometheusMetricsConfig(AppConfig):
name = "prometheus_metrics"
verbose_name = "Prometheus Metrics"
================================================
FILE: prometheus_metrics/middleware.py
================================================
from timeit import default_timer
from django.utils.deprecation import MiddlewareMixin
from prometheus_client import Counter, Histogram
requests_total = Counter(
"django_http_requests_total",
"Total count of requests",
["status_code", "method", "view"],
)
requests_latency = Histogram(
"django_http_requests_latency_seconds",
"Histogram of requests processing time",
["status_code", "method", "view"],
)
class PrometheusBeforeMiddleware(MiddlewareMixin):
def process_request(self, request):
request.prometheus_middleware_request_start = default_timer()
def process_response(self, request, response):
resolver_match = request.resolver_match
if resolver_match:
handler = resolver_match.url_name
if not handler:
handler = resolver_match.view_name
handler = handler.replace("-", "_")
else:
handler = ""
requests_total.labels(response.status_code, request.method, handler).inc()
if hasattr(request, "prometheus_middleware_request_start"):
requests_latency.labels(
response.status_code, request.method, handler
).observe(default_timer() - request.prometheus_middleware_request_start)
return response
================================================
FILE: prometheus_metrics/models.py
================================================
from prometheus_client import Counter
model_inserts_total = Counter(
"django_model_inserts_total", "Number of inserts on a certain model", ["model"]
)
model_updates_total = Counter(
"django_model_updates_total", "Number of updates on a certain model", ["model"]
)
model_deletes_total = Counter(
"django_model_deletes_total", "Number of deletes on a certain model", ["model"]
)
def MetricsModelMixin(name):
class Mixin(object):
def _do_insert(self, *args, **kwargs):
model_inserts_total.labels(name).inc()
return super(Mixin, self)._do_insert(*args, **kwargs)
def _do_update(self, *args, **kwargs):
model_updates_total.labels(name).inc()
return super(Mixin, self)._do_update(*args, **kwargs)
def _do_delete(self, *args, **kwargs):
model_deletes_total.labels(name).inc()
return super(Mixin, self).delete(*args, **kwargs)
return Mixin
================================================
FILE: prometheus_metrics/views.py
================================================
import prometheus_client
from django.http import HttpResponse
from django.views import View
class MetricsView(View):
def get(self, request, *args, **kwargs):
metrics_page = prometheus_client.generate_latest()
return HttpResponse(
metrics_page, content_type=prometheus_client.CONTENT_TYPE_LATEST
)
================================================
FILE: pylenium.json
================================================
{
"driver": {
"browser": "chrome",
"remote_url": "",
"wait_time": 10,
"page_load_wait_time": 0,
"options": [
"headless",
"no-sandbox",
"disable-gpu"
],
"capabilities": {},
"experimental_options": null,
"extension_paths": [],
"webdriver_kwargs": {},
"seleniumwire_enabled": false,
"seleniumwire_options": {},
"local_path": ""
},
"logging": {
"screenshots_on": true
},
"viewport": {
"maximize": true,
"width": 1440,
"height": 900,
"orientation": "portrait"
},
"customer": {}
}
================================================
FILE: pyproject.toml
================================================
[tool.black]
target_version = ["py312"]
exclude = '''
(
/(
| \.git
| node_modules
)/
)
'''
[tool.ruff]
target-version = "py310"
select = [
"E",
"F",
"W",
"A", # flake8-builtins
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"ERA", # flake8-eradicate
"G", # flake8-logging-format
"I", # isort
]
ignore-init-module-imports = true # Prevents removing imports from __init__.py
extend-exclude = [
"concordia/settings_dev_*.py"
]
# Ignore line length in migrations
[tool.ruff.per-file-ignores]
"*/migrations/*" = ["E501"]
# v8.0.4 broke original setup config to produce git version - required configuration
[tool.setuptools_scm]
================================================
FILE: setup.cfg
================================================
[pycodestyle]
exclude = .venv,docs/conf.py
ignore =
max-line-length = 88
[tool:pytest]
DJANGO_SETTINGS_MODULE = concordia.settings_test
addopts = -rf
[isort]
default_section = THIRDPARTY
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = concordia,importer,exporter
line_length = 88
multi_line_output = 3
skip = .venv
use_parentheses = True
[flake8]
exclude = .venv,node_modules,concordia/settings_dev_*.py
max-line-length = 88
enable-extensions = G
per-file-ignores =
*/migrations/*:E501
[readme_check]
readmes =
concordia/views/README.md
================================================
FILE: setup.py
================================================
#!/usr/bin/env python
from setuptools import find_packages, setup
VERSION = __import__("concordia").get_version()
INSTALL_REQUIREMENTS = ["boto3", "Django>=4.2"]
SCRIPTS = ["manage.py"]
DESCRIPTION = "Transcription crowdsourcing"
CLASSIFIERS = """\
Environment :: Web Environment
Framework :: Django
Programming Language :: Python
Programming Language :: Python :: 3.12
""".splitlines()
with open("README.md", "r") as f:
LONG_DESCRIPTION = f.read()
setup(
name="concordia",
version=VERSION,
description=DESCRIPTION,
long_description=LONG_DESCRIPTION,
packages=find_packages(),
include_package_data=True,
scripts=SCRIPTS,
install_requires=INSTALL_REQUIREMENTS,
classifiers=CLASSIFIERS,
use_scm_version={
"write_to": "version.txt",
"tag_regex": r"^(?Pv)?(?P[^\+]+)(?P.*)?$",
},
setup_requires=["setuptools_scm"],
)
================================================
FILE: src/about.js
================================================
import '../concordia/static/js/src/modules/concordia-visualization.js';
import '../concordia/static/js/src/visualizations/asset-status-overview.js';
import '../concordia/static/js/src/visualizations/daily-activity.js';
================================================
FILE: src/main.js
================================================
import $ from 'jquery';
window.$ = window.jQuery = $;
import 'bootstrap';
import 'bootstrap/dist/css/bootstrap.min.css';
/* local scripts */
import '../concordia/static/js/src/about-accordions.js';
import '../concordia/static/js/src/asset-reservation.js';
import '../concordia/static/js/src/banner.js';
import '../concordia/static/js/src/contribute.js';
import '../concordia/static/js/src/filter-assets.js';
import '../concordia/static/js/src/guide.js';
import '../concordia/static/js/src/homepage-carousel.js';
import '../concordia/static/js/src/ocr.js';
import {setTutorialHeight} from '../concordia/static/js/src/modules/quick-tips.js';
import '../concordia/static/js/src/quick-tips-setup.js';
import '../concordia/static/js/src/viewer.js';
import '../concordia/static/js/src/viewer-split.js';
/*- Third-party */
import OpenSeadragon from 'openseadragon';
window.OpenSeadragon = OpenSeadragon;
if (setTutorialHeight) {
window.setTutorialHeight = setTutorialHeight;
}
================================================
FILE: src/profile.js
================================================
import '../concordia/static/js/src/campaign-selection.js';
import '../concordia/static/js/src/recent-pages.js';
import '../concordia/static/js/src/profile-fields.js';
================================================
FILE: static/.gitignore
================================================
css
js
sourcemaps
frontend
================================================
FILE: tools/readme_symbol_check.py
================================================
#!/usr/bin/env python3
"""
README Symbol Checker
This script verifies that all top-level class and function names defined in Python
files under the directory containing a given README.md file are mentioned somewhere
in the README.
To use it, configure `setup.cfg` with a [readme_check] section like:
[readme_check]
readmes =
concordia/views/README.md
This will recursively scan all `.py` files in `concordia/views/` and ensure every
class/function defined in them appears by name (case-sensitive) somewhere in the
corresponding README.md.
"""
import ast
import configparser
import sys
from pathlib import Path
from typing import List
def collect_defined_symbols(py_path: Path) -> List[str]:
"""
Parse a Python file and return all top-level class and function names.
"""
with py_path.open(encoding="utf-8") as f:
tree = ast.parse(f.read(), filename=str(py_path))
return [
node.name
for node in tree.body
if isinstance(node, (ast.FunctionDef, ast.ClassDef))
]
def read_readme_text(readme_path: Path) -> str:
return readme_path.read_text(encoding="utf-8")
def check_readme(readme_path: Path) -> int:
"""
Check that each symbol defined in the Python files under the same directory
as the README appears in the README text. Returns exit code (0 or 1).
"""
readme_text = read_readme_text(readme_path)
search_dir = readme_path.parent
exit_code = 0
for py_file in search_dir.rglob("*.py"):
defined = collect_defined_symbols(py_file)
for name in defined:
if name not in readme_text:
print(f"V001 Symbol '{name}' is not documented in {readme_path.name}")
exit_code = 1
return exit_code
def load_readmes_from_config() -> List[Path]:
"""
Read the list of README.md files from setup.cfg under the [readme_check] section.
"""
cfg_path = Path("setup.cfg")
if not cfg_path.exists():
sys.stderr.write("ERROR: setup.cfg not found\n")
sys.exit(2)
config = configparser.ConfigParser()
config.read(cfg_path)
try:
section = config["readme_check"]
readmes = [
Path(p.strip())
for p in section.get("readmes", "").splitlines()
if p.strip()
]
if not readmes:
raise ValueError
return readmes
except (KeyError, ValueError):
sys.stderr.write("ERROR: No [readme_check] readmes configured in setup.cfg\n")
sys.exit(2)
def main() -> None:
exit_code = 0
readmes = load_readmes_from_config()
for readme in readmes:
if not readme.exists():
print(f"ERROR: README file not found: {readme}", file=sys.stderr)
exit_code = 2
else:
exit_code = max(exit_code, check_readme(readme))
sys.exit(exit_code)
if __name__ == "__main__":
main()
================================================
FILE: vite.config.js
================================================
import {defineConfig} from 'vite';
import {compression} from 'vite-plugin-compression2';
import path from 'node:path';
import {fileURLToPath} from 'node:url';
// Define __dirname for ES Modules
const __dirname = path.dirname(fileURLToPath(import.meta.url));
export default defineConfig({
base: '/static/',
resolve: {
alias: {
// Map the custom name to its actual directory
// Adjust the path below to where your visualization logic actually lives
'concordia-visualization': path.resolve(
__dirname,
'./concordia/static/js/src/modules/concordia-visualization.js',
),
},
},
optimizeDeps: {
include: ['openseadragon', 'openseadragon-filters'],
},
build: {
// collectstatic ignores hidden files - so 'true' not enough
manifest: 'manifest.json',
// Using 'dist' prevents Vite from writing into your source folders
outDir: 'concordia/static/dist', // where the compiled files go
emptyOutDir: true,
rollupOptions: {
input: {
// Existing entry points
main: './src/main.js',
about: './src/about.js',
profile: './src/profile.js',
// ADD the new standalone JS files
admin_custom: './concordia/static/admin/custom-inline.js',
admin_editor: './concordia/static/admin/editor-preview.js',
js_base: './concordia/static/js/src/base.js',
accessible_colors:
'./concordia/static/js/src/modules/accessible-colors.js',
chroma_esm: './concordia/static/js/src/modules/chroma-esm.js',
turnstile: './concordia/static/js/src/modules/turnstile.js',
viz_errors:
'./concordia/static/js/src/modules/visualization-errors.js',
password_validation:
'./concordia/static/js/src/password-validation.js',
viz_asset_status:
'./concordia/static/js/src/visualizations/asset-status-by-campaign.js',
jquery_cookie: './concordia/static/vendor/jquery.cookie.js',
// The SCSS entry point
base_styles: './concordia/static/scss/base.scss',
},
output: {
// 1. Enable hashing so Vite handles versioning
entryFileNames: 'js/[name]-[hash].js',
chunkFileNames: 'js/[name]-[hash].js',
assetFileNames: 'assets/[name]-[hash][extname]',
},
},
},
plugins: [
// 2. Pre-compress files so WhiteNoise doesn't have to at startup
compression({algorithm: 'gzip'}),
compression({algorithm: 'brotliCompress'}),
],
});
|