Repository: UB-Mannheim/ocr-fileformat
Branch: master
Commit: 69a917e4db9a
Files: 41
Total size: 83.6 KB
Directory structure:
gitextract_kh4rpdh8/
├── .dockerignore
├── .eslintrc.google.js
├── .eslintrc.js
├── .github/
│ └── workflows/
│ ├── ci.yml
│ └── codeql.yml
├── .gitignore
├── .gitmodules
├── .zipignore
├── CITATION.cff
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── bin/
│ ├── ocr-transform.sh
│ └── ocr-validate.sh
├── docker.config.php
├── example/
│ ├── .gitignore
│ ├── Makefile
│ └── README.md
├── lib.sh
├── script/
│ ├── transform/
│ │ ├── README.md
│ │ ├── alto__page
│ │ ├── gcv__alto
│ │ ├── gcv__hocr
│ │ ├── gcv__page
│ │ ├── page__alto
│ │ ├── page__alto_legacy
│ │ └── textract__page
│ └── validate/
│ ├── README.md
│ └── hocr
├── vendor/
│ └── Makefile
├── web/
│ ├── config.php
│ ├── index.html
│ ├── ocr-fileformat.css
│ ├── ocr-fileformat.js
│ └── ocr-fileformat.php
├── xsd/
│ └── .gitignore
└── xslt/
├── .gitignore
├── alto2.0__alto3.0.xsl
├── page__text.xsl
└── tei__hocr.xsl
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
Dockerfile
example
test
README.md
xsd
xslt
!xslt/alto2.0__alto3.0.xsl
!xslt/page__text.xsl
!xslt/tei__hocr.xsl
vendor/*
!vendor/Makefile
!vendor/saxon*.jar
================================================
FILE: .eslintrc.google.js
================================================
/**
* Copyright 2016 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'use strict';
module.exports = {
rules: {
// The rules below are listed in the order they appear on the eslint
// rules page. All rules are listed to make it easier to keep in sync
// as new ESLint rules are added.
// http://eslint.org/docs/rules/
// - Rules in the `eslint:recommended` ruleset that aren't specifically
// mentioned by the google styleguide are listed but commented out (so
// they don't override a base ruleset).
// - Rules that are recommended but contradict the Google styleguide
// are explicitly set to the Google styleguide value.
// Possible Errors
// http://eslint.org/docs/rules/#possible-errors
// ---------------------------------------------
// 'for-direction': 0,
// 'no-await-in-loop': 0,
// 'no-compare-neg-zero': 2, // eslint:recommended
'no-cond-assign': 0, // eslint:recommended
// 'no-console': 2, // eslint:recommended
// 'no-constant-condition': 2, // eslint:recommended
// 'no-control-regex': 2, // eslint:recommended
// 'no-debugger': 2, // eslint:recommended
// 'no-dupe-args': 2, // eslint:recommended
// 'no-dupe-keys': 2, // eslint:recommended
// 'no-duplicate-case': 2, // eslint:recommended
// 'no-empty': 2, // eslint:recommended
// 'no-empty-character-class': 2, // eslint:recommended
// 'no-ex-assign': 2, // eslint:recommended
// 'no-extra-boolean-cast': 2, // eslint:recommended
// 'no-extra-parens': 0,
// 'no-extra-semi': 2, // eslint:recommended
// 'no-func-assign': 2, // eslint:recommended
// 'no-inner-declarations': 2, // eslint:recommended
// 'no-invalid-regexp': 2, // eslint:recommended
'no-irregular-whitespace': 2, // eslint:recommended
// 'no-obj-calls': 2, // eslint:recommended
// 'no-prototype-builtins': 0,
// 'no-regex-spaces': 2, // eslint:recommended
// 'no-sparse-arrays': 2, // eslint:recommended
// 'no-template-curly-in-string': 0,
'no-unexpected-multiline': 2, // eslint:recommended
// 'no-unreachable': 2, // eslint:recommended
// 'no-unsafe-finally': 2, // eslint:recommended
// 'no-unsafe-negation': 0,
// 'use-isnan': 2 // eslint:recommended
'valid-jsdoc': [2, {
requireParamDescription: false,
requireReturnDescription: false,
requireReturn: false,
prefer: {returns: 'return'},
}],
// 'valid-typeof': 2 // eslint:recommended
// Best Practices
// http://eslint.org/docs/rules/#best-practices
// --------------------------------------------
// 'accessor-pairs': 0,
// 'array-callback-return': 0,
// 'block-scoped-var': 0,
// 'class-methods-use-this': 0,
// 'complexity': 0,
// 'consistent-return': 0
'curly': [2, 'multi-line'], // TODO(philipwalton): add an option to enforce
// braces with the exception of simple,
// single-line if statements.
// 'default-case': 0,
// 'dot-location': 0,
// 'dot-notation': 0,
// 'eqeqeq': 0,
'guard-for-in': 2,
// 'no-alert': 0,
'no-caller': 2,
// 'no-case-declarations': 2, // eslint:recommended
// 'no-div-regex': 0,
// 'no-else-return': 0,
// 'no-empty-function': 0,
// 'no-empty-pattern': 2, // eslint:recommended
// 'no-eq-null': 0,
// 'no-eval': 0,
'no-extend-native': 2,
'no-extra-bind': 2,
// 'no-extra-label': 0,
// 'no-fallthrough': 2, // eslint:recommended
// 'no-floating-decimal': 0,
// 'no-global-assign': 0,
// 'no-implicit-coercion': 0,
// 'no-implicit-globals': 0,
// 'no-implied-eval': 0,
'no-invalid-this': 2,
// 'no-iterator': 0,
// 'no-labels': 0,
// 'no-lone-blocks': 0,
// 'no-loop-func': 0,
// 'no-magic-numbers': 0,
'no-multi-spaces': 2,
'no-multi-str': 2,
// 'no-new': 0,
// 'no-new-func': 0,
'no-new-wrappers': 2,
// 'no-octal': 2, // eslint:recommended
// 'no-octal-escape': 0,
// 'no-param-reassign': 0,
// 'no-proto': 0,
// 'no-redeclare': 2, // eslint:recommended
// 'no-restricted-properties': 0,
// 'no-return-assign': 0,
// 'no-script-url': 0,
// 'no-self-assign': 2, // eslint:recommended
// 'no-self-compare': 0,
// 'no-sequences': 0,
'no-throw-literal': 2, // eslint:recommended
// 'no-unmodified-loop-condition': 0,
// 'no-unused-expressions': 0,
// 'no-unused-labels': 2, // eslint:recommended
// 'no-useless-call': 0,
// 'no-useless-concat': 0,
// 'no-useless-escape': 0,
// 'no-void': 0,
// 'no-warning-comments': 0,
'no-with': 2,
// 'prefer-promise-reject-errors': 0,
// 'radix': 0,
// 'require-await': 0,
// 'vars-on-top': 0,
// 'wrap-iife': 0,
// 'yoda': 0,
// Strict Mode
// http://eslint.org/docs/rules/#strict-mode
// -----------------------------------------
// 'strict': 0,
// Variables
// http://eslint.org/docs/rules/#variables
// ---------------------------------------
// 'init-declarations': 0,
// 'no-catch-shadow': 0,
// 'no-delete-var': 2, // eslint:recommended
// 'no-label-var': 0,
// 'no-restricted-globals': 0,
// 'no-shadow': 0,
// 'no-shadow-restricted-names': 0,
// 'no-undef': 2, // eslint:recommended
// 'no-undef-init': 0,
// 'no-undefined': 0,
'no-unused-vars': [2, {args: 'none'}], // eslint:recommended
// 'no-use-before-define': 0,
// Node.js and CommonJS
// http://eslint.org/docs/rules/#nodejs-and-commonjs
// -------------------------------------------------
// 'callback-return': 0,
// 'global-require': 0,
// 'handle-callback-err': 0,
// 'no-buffer-constructor': 0,
// 'no-mixed-requires': 0,
// 'no-new-require': 0,
// 'no-path-concat': 0,
// 'no-process-env': 0,
// 'no-process-exit': 0,
// 'no-restricted-modules': 0,
// 'no-sync': 0,
// Stylistic Issues
// http://eslint.org/docs/rules/#stylistic-issues
// ----------------------------------------------
'array-bracket-newline': 0, // eslint:recommended
'array-bracket-spacing': [2, 'never'],
'array-element-newline': 0, // eslint:recommended
'block-spacing': [2, 'never'],
'brace-style': 2,
'camelcase': [2, {properties: 'never'}],
// 'capitalized-comments': 0,
'comma-dangle': [2, 'always-multiline'],
'comma-spacing': 2,
'comma-style': 2,
'computed-property-spacing': 2,
// 'consistent-this': 0,
'eol-last': 2,
'func-call-spacing': 2,
// 'func-name-matching': 0,
// 'func-names': 0,
// 'func-style': 0,
// 'id-blacklist': 0,
// 'id-length': 0,
// 'id-match': 0,
// 'indent': 0, // TODO(philipwalton): this rule isn't compatible with
// Google's 4-space indent for line continuations.
// 'jsx-quotes': 0,
'key-spacing': 2,
'keyword-spacing': 2,
// 'line-comment-position': 0,
'linebreak-style': 2,
// 'lines-around-comment': 0,
// 'max-depth': 0,
'max-len': [2, {
code: 80,
tabWidth: 2,
ignoreUrls: true,
ignorePattern: '^goog\.(module|require)',
}],
// 'max-lines': 0,
// 'max-nested-callbacks': 0,
// 'max-params': 0,
// 'max-statements': 0,
// 'max-statements-per-line': 0,
// 'multiline-ternary': 0, // TODO(philipwalton): add a rule to enforce the
// operator appearing at the end of the line.
'new-cap': 2,
// 'new-parens': 0,
// 'newline-per-chained-call': 0,
'no-array-constructor': 2,
// 'no-bitwise': 0,
// 'no-continue': 0,
// 'no-inline-comments': 0,
// 'no-lonely-if': 0,
// 'no-mixed-operators': 0,
'no-mixed-spaces-and-tabs': 2, // eslint:recommended
// 'no-multi-assign': 0,
'no-multiple-empty-lines': [2, {max: 2}],
// 'no-negated-condition': 0,
// 'no-nested-ternary': 0,
'no-new-object': 2,
// 'no-plusplus': 0,
// 'no-restricted-syntax': 0,
'no-tabs': 2,
// 'no-ternary': 0,
'no-trailing-spaces': 2,
// 'no-underscore-dangle': 0,
// 'no-unneeded-ternary': 0,
// 'no-whitespace-before-property': 0,
// 'nonblock-statement-body-position': 0,
// 'object-curly-newline': 0,
'object-curly-spacing': 2,
// 'object-property-newline': 0,
'one-var': [2, {
var: 'never',
let: 'never',
const: 'never',
}],
// 'one-var-declaration-per-line': 0,
// 'operator-assignment': 0,
// 'operator-linebreak': 0,
'padded-blocks': [2, 'never'],
// 'padding-line-between-statements': 0,
'quote-props': [2, 'consistent'],
'quotes': [2, 'single', {allowTemplateLiterals: true}],
'require-jsdoc': [2, {
require: {
FunctionDeclaration: true,
MethodDefinition: true,
ClassDeclaration: true,
},
}],
'semi': 2,
'semi-spacing': 2,
// 'semi-style': 0,
// 'sort-keys': 0,
// 'sort-vars': 0,
'space-before-blocks': 2,
'space-before-function-paren': [2, {
asyncArrow: 'always',
anonymous: 'never',
named: 'never',
}],
// 'space-in-parens': 0,
// 'space-infix-ops': 0,
// 'space-unary-ops': 0,
'spaced-comment': [2, 'always'],
// 'switch-colon-spacing': 2,
// 'template-tag-spacing': 0,
// 'unicode-bom': 0,
// 'wrap-regex': 0,
// ECMAScript 6
// http://eslint.org/docs/rules/#ecmascript-6
// ------------------------------------------
// 'arrow-body-style': 0,
'arrow-parens': [2, 'always'], // TODO(philipwalton): technically arrow
// parens are optional but recommended.
// ESLint doesn't support a *consistent*
// setting so "always" is used.
// 'arrow-spacing': 0,
'constructor-super': 2, // eslint:recommended
'generator-star-spacing': [2, 'after'],
// 'no-class-assign': 0,
// 'no-confusing-arrow': 0,
// 'no-const-assign': 0, // eslint:recommended
// 'no-dupe-class-members': 0, // eslint:recommended
// 'no-duplicate-imports': 0,
'no-new-symbol': 2, // eslint:recommended
// 'no-restricted-imports': 0,
'no-this-before-super': 2, // eslint:recommended
// 'no-useless-computed-key': 0,
// 'no-useless-constructor': 0,
// 'no-useless-rename': 0,
'no-var': 2,
// 'object-shorthand': 0,
// 'prefer-arrow-callback': 0,
// 'prefer-const': 0,
// 'prefer-destructuring': 0,
// 'prefer-numeric-literals': 0,
'prefer-rest-params': 2,
'prefer-spread': 2,
// 'prefer-template': 0,
// 'require-yield': 2, // eslint:recommended
'rest-spread-spacing': 2,
// 'sort-imports': 0,
// 'symbol-description': 0,
// 'template-curly-spacing': 0,
'yield-star-spacing': [2, 'after'],
},
};
================================================
FILE: .eslintrc.js
================================================
/**
 * Project ESLint configuration.
 *
 * Starts from the Google ruleset in ./.eslintrc.google.js and then switches
 * off or re-tunes individual rules. Severities follow ESLint's convention:
 * 0 = off, 1 = warning, 2 = error.
 */
module.exports = {
extends: './.eslintrc.google.js',
// Parse sources as ES2017 ECMAScript modules.
parserOptions: {
"ecmaVersion": 2017,
"sourceType": "module",
},
// Enable the ES6 environment.
env: {
es6: true,
},
// Overrides applied on top of the inherited Google ruleset.
rules: {
'arrow-parens': 0,
'block-spacing': 0,
'brace-style': 0,
'camelcase': 0,
'comma-dangle': 0,
'comma-style': [2, 'last'],
'curly': 0,
// Severity 0 disables the rule, so the accompanying option has no effect.
'indent': [0, 4],
'key-spacing': 0,
'linebreak-style': 2,
'max-len': 0,
'new-cap': 0,
'no-invalid-this': 0,
'no-multi-spaces': 0,
'no-undef': 2,
'no-unused-vars': 1,
'object-curly-spacing': 0,
// Severity 0 disables the rule, so the accompanying option has no effect.
'padded-blocks': [0, 'never'],
'quote-props': 0,
'quotes': 0,
'require-jsdoc': 0,
// Missing semicolons are reported as warnings, not errors.
'semi': [1, 'always'],
// Severity 0 disables the rule, so the accompanying option has no effect.
'space-before-function-paren': [0, {"anonymous": "never"}],
'valid-jsdoc': 0,
},
// Identifiers ESLint should treat as predefined, so that 'no-undef' (set to
// error above) does not report them. `true` marks a global as writable.
globals: {
// $: true,
_: true,
rdfstore: true,
FormData: true,
Backbone: true,
document: true,
require: true,
define: true,
console: true,
window: true,
process: true,
module: true,
Image: true,
exports: true,
parent: true,
setTimeout: true,
setInterval: true,
clearTimeout: true,
clearInterval: true,
__dirname: true,
GM_registerMenuCommand: true,
__filename: true,
Buffer: true,
fetch: true,
},
}
================================================
FILE: .github/workflows/ci.yml
================================================
name: Continuous Integration
# Continuous integration test for ocr-fileformat.
on:
# pull_request:
# push:
# schedule:
# - cron: 0 20 * * *
workflow_dispatch:
jobs:
linux:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Install tesseract and other dependencies
run: |
sudo apt-get update
sudo make -C example deps
- name: Run make all
run: |
make all PREFIX=$HOME
- name: Run tests
run: |
make -C example roundtrip diff
================================================
FILE: .github/workflows/codeql.yml
================================================
name: "CodeQL"
on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]
schedule:
- cron: "46 17 * * 3"
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: [ javascript ]
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
queries: +security-and-quality
- name: Autobuild
uses: github/codeql-action/autobuild@v2
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
with:
category: "/language:${{ matrix.language }}"
================================================
FILE: .gitignore
================================================
/Saxon*
*.jar
/*.alto
vendor/*
!vendor/Makefile
ocr-fileformat_*
*~
================================================
FILE: .gitmodules
================================================
[submodule "vendor/alto-schema"]
path = vendor/alto-schema
url = https://github.com/altoxml/schema.git
[submodule "vendor/format-converters"]
path = vendor/format-converters
url = https://github.com/OCR-D/format-converters.git
[submodule "vendor/gcv2hocr"]
path = vendor/gcv2hocr
url = https://github.com/dinosauria123/gcv2hocr.git
[submodule "vendor/hocr-spec-python"]
path = vendor/hocr-spec-python
url = https://github.com/kba/hocr-spec-python.git
[submodule "vendor/hOCR-to-ALTO"]
path = vendor/hOCR-to-ALTO
url = https://github.com/filak/hOCR-to-ALTO.git
[submodule "vendor/im2alto"]
path = vendor/im2alto
url = https://github.com/karkraeg/im2alto.git
[submodule "vendor/page-to-alto"]
path = vendor/page-to-alto
url = https://github.com/kba/page-to-alto.git
[submodule "vendor/xsd-validator"]
path = vendor/xsd-validator
url = https://github.com/kba/xsd-validator.git
[submodule "vendor/textract2page"]
path = vendor/textract2page
url = https://github.com/slub/textract2page.git
================================================
FILE: .zipignore
================================================
.git
.zipignore
.gitignore
example
ocr-fileformat_*
*.pdf
*.zip
================================================
FILE: CITATION.cff
================================================
# This CITATION.cff file was generated with cffinit.
# Visit https://bit.ly/cffinit to generate yours today!
cff-version: 1.2.0
title: ocr-fileformat
message: >-
You may cite this software using the metadata from this file.
type: software
authors:
- name: Universitätsbibliothek Mannheim
country: DE
city: Mannheim
website: 'https://www.bib.uni-mannheim.de/'
- given-names: Konstantin
family-names: Baierer
orcid: 'https://orcid.org/0000-0003-2397-242X'
- given-names: Stefan
family-names: Weil
affiliation: Universitätsbibliothek Mannheim
orcid: 'https://orcid.org/0000-0002-0524-9898'
- family-names: Zumstein
given-names: Philipp
affiliation: Universitätsbibliothek Mannheim
orcid: 'https://orcid.org/0000-0002-6485-9434'
- given-names: Robert
family-names: Sachunsky
- given-names: Jörg
orcid: 'https://orcid.org/0000-0002-6406-4906'
family-names: Mechnich
affiliation: Universitätsbibliothek Mannheim
- given-names: Uwe
family-names: Hartwig
orcid: 'https://orcid.org/0000-0001-7164-6376'
- given-names: Mike
family-names: Gerber
- given-names: Clemens
orcid: 'https://orcid.org/0000-0001-5293-8322'
family-names: Neudecker
================================================
FILE: Dockerfile
================================================
# Image providing the ocr-fileformat command line scripts and PHP web interface.
FROM alpine:edge
# Port the PHP built-in web server started by CMD listens on.
EXPOSE 8080
COPY . /ocr-fileformat
WORKDIR /ocr-fileformat
# Single layer: install runtime and build packages, run `make install`, use
# docker.config.php as the web app's local config, raise PHP's upload size
# limit to 100M, move the web app to /ocr-fileformat-web, then remove the
# source tree and the build-only packages again.
RUN apk add --no-cache openjdk8-jre php7 php7-json php7-openssl python3 py-lxml py-future git make ca-certificates wget bash gcc libc-dev \
&& update-ca-certificates \
&& make install \
&& cp docker.config.php web/config.local.php \
&& sed -i '/^upload_max_filesize/ s/=.*$/= 100M/' /etc/php7/php.ini \
&& sed -i 's/;extension=php_openssl.dll/extension=php_openssl.dll/' /etc/php7/php.ini \
&& mv web /ocr-fileformat-web \
&& rm -rf /ocr-fileformat \
&& apk del git make wget gcc libc-dev
# Disable POST upload limit
RUN sed -i 's,post_max_size = 8M,post_max_size = 0,' /etc/php7/php.ini
# /data is the working directory; the README mounts the directory holding the
# input files here when running the command line scripts.
VOLUME /data
WORKDIR /data
# Shell-form CMD so that $(hostname -i) is expanded when the container starts;
# serves /ocr-fileformat-web on that address, port 8080.
CMD php7 -S $(hostname -i):8080 -t /ocr-fileformat-web
================================================
FILE: LICENSE
================================================
The MIT License (MIT)
Copyright (c) 2016 Universitätsbibliothek Mannheim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Makefile
================================================
# Makefile for ocr-fileformat
PKG_NAME = ocr-fileformat
PKG_VERSION = 0.5.0
DOCKER_IMAGE = ubma/ocr-fileformat
# Either get the version from Git (if available) or use PKG_VERSION.
ROOTDIR = $(abspath $(dir $(MAKEFILE_LIST)))
VERSION = $(shell [ -d "$(ROOTDIR)/.git" ] && git -C "$(ROOTDIR)" describe --tags 2>/dev/null || echo $(PKG_VERSION))
CP = cp -a
LN = ln -sf
MV = mv -f
MKDIR = mkdir -p
RM = rm -rfv
ZIP = zip
PREFIX = $(DESTDIR)/usr/local
SHAREDIR = $(PREFIX)/share/$(PKG_NAME)
BINDIR = $(PREFIX)/bin
PYTHON = python3
TSHT = ./test/tsht
TSHT_URL = https://cdn.rawgit.com/kba/tsht/master/tsht
# BEGIN-EVAL makefile-parser --make-help Makefile
help:
@echo ""
@echo " Targets"
@echo ""
@echo " all Download vendor assets, link XSD schemas and XSLT stylesheets"
@echo " vendor Download all vendor assets"
@echo " xsd Link all XSD schemas"
@echo " xslt Link all XSLT stylesheets"
@echo " install Install ocr-fileformat"
@echo " uninstall Uninstall ocr-fileformat"
@echo " clean Remove linked assets"
@echo " realclean Remove linked assets and vendor files"
@echo " docker Create the docker image"
@echo " release Make release tarball / zipball"
@echo
@echo
@echo " Variables"
@echo
@echo " PREFIX Top-level directory for installation [$(PREFIX)]"
@echo " PYTHON Python version to use for tools [$(PYTHON)]"
# END-EVAL
# Download vendor assets, link XSD schemas and XSLT stylesheets
all: vendor xsd xslt
check:
$(MAKE) -C vendor check
.PHONY: vendor
# Download all vendor assets
vendor: check
# download the dependencies
git submodule update --init
# create+activate a Python venv if not already active
if [ -z "$(VIRTUAL_ENV)" ]; then \
$(PYTHON) -m venv $(SHAREDIR)/venv && \
. $(SHAREDIR)/venv/bin/activate && \
pip install -U pip; \
fi && $(MAKE) -C vendor all
.PHONY: xsd
# Link all XSD schemas
xsd: vendor
	$(MKDIR) xsd
# copy Alto XSD
# The linked ALTO schemas are expected to be named alto-<major>-<minor>.xsd.
# Each link is renamed to alto<major>.<minor>.xsd by dropping the first dash
# and turning the second one into a dot. The previous expression 's/.//g'
# removed every character, leaving the target name empty, so nothing was
# ever renamed. The loop is limited to alto-*.xsd so that schemas from the
# other formats already present in xsd/ are never touched on a re-run.
	cd xsd && $(LN) ../vendor/alto-schema/*/*.xsd . && \
		for xsd in alto-*.xsd;do \
			[ -e "$$xsd" ] || continue; \
			target_xsd=`echo $$xsd|sed 's/-//'|sed 's/-/./'`; \
			if [ ! -e $$target_xsd ];then \
				$(MV) $$xsd $$target_xsd; \
			fi; done
# copy PAGE XSD
	@cd xsd && $(LN) ../vendor/page-schema/*.xsd .
# copy ABBYY XSD
	cd xsd && $(LN) ../vendor/abbyy-schema/*.xsd .
.PHONY: xslt
# Link all XSLT stylesheets
xslt: vendor
$(MKDIR) xslt
# symlink hocr<->alto as well as the language codes lookup xml
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto.xsl hocr__alto.xsl
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto2.0.xsl hocr__alto2.0.xsl
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto2.1.xsl hocr__alto2.1.xsl
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto3.xsl hocr__alto3.0.xsl
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto4.xsl hocr__alto4.0.xsl
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/alto__hocr.xsl alto__hocr.xsl
cd xslt && $(LN) alto__hocr.xsl alto2.0__hocr.xsl
cd xslt && $(LN) alto__hocr.xsl alto2.1__hocr.xsl
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__text.xsl hocr__text.xsl
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/alto__text.xsl alto__text.xsl
cd xslt && $(LN) ../vendor/hOCR-to-ALTO/codes_lookup.xml codes_lookup.xml
cd xslt && $(LN) ../vendor/format-converters/page2hocr.xsl page__hocr.xsl
cd xslt && $(LN) ../vendor/format-converters/abbyy2hocr.xsl abbyy__hocr.xsl
cd xslt && $(LN) ../vendor/format-converters/hocr2tei.xsl hocr__tei.xsl
cd xslt && $(LN) alto2.0__alto3.0.xsl alto2.0__alto3.1.xsl
cd xslt && $(LN) alto2.0__alto3.0.xsl alto2.1__alto3.0.xsl
cd xslt && $(LN) alto2.0__alto3.0.xsl alto2.1__alto3.1.xsl
cd xslt && $(LN) ../vendor/im2alto/iw2alto.xsl mybib__alto3.0.xsl
# Install ocr-fileformat
define SEDSCRIPT
echo '/^SHAREDIR=/c\'
echo 'SHAREDIR="$(SHAREDIR)"'
echo 's/VERSION/$(VERSION)/'
endef
export SEDSCRIPT
install: all
$(MKDIR) $(SHAREDIR)
$(CP) script xsd xslt vendor lib.sh $(SHAREDIR)
$(RM) $(SHAREDIR)/vendor/*/.git
$(MKDIR) $(BINDIR)
eval "$$SEDSCRIPT" | sed -f - bin/ocr-transform.sh > $(BINDIR)/ocr-transform
eval "$$SEDSCRIPT" | sed -f - bin/ocr-validate.sh > $(BINDIR)/ocr-validate
chmod a+x $(BINDIR)/ocr-transform $(BINDIR)/ocr-validate
find $(SHAREDIR) -not -type l -exec chmod u+w {} \;
# Uninstall ocr-fileformat
uninstall:
$(RM) $(BINDIR)/ocr-transform
$(RM) $(BINDIR)/ocr-validate
$(RM) $(SHAREDIR)
# Remove linked assets
clean:
$(RM) xsd/*
find xslt -type l -delete
# Remove linked assets and vendor files
realclean: clean
$(MAKE) -C vendor clean
# Create the docker image
docker:
docker build -t "$(DOCKER_IMAGE)" .
.PHONY: release
# Make release tarball / zipball
release:
# Start from an empty staging directory named <package>_<version>.
	$(RM) $(PKG_NAME)_$(PKG_VERSION)
	$(MKDIR) $(PKG_NAME)_$(PKG_VERSION)
# Copy the tree into the staging directory, leaving out everything matched
# by the patterns in .zipignore.
	tar -X .zipignore -cf - . | tar -xf - -C $(PKG_NAME)_$(PKG_VERSION)
# $(CP) LICENSE Makefile README.md bin/ lib.sh vendor/
	tar czf $(PKG_NAME)_$(PKG_VERSION).tar.gz $(PKG_NAME)_$(PKG_VERSION)
# Use the configurable ZIP tool variable instead of a hard-coded command.
	$(ZIP) --symlinks -r $(PKG_NAME)_$(PKG_VERSION).zip $(PKG_NAME)_$(PKG_VERSION)
================================================
FILE: README.md
================================================
# ocr-fileformat
[Codacy code quality](https://app.codacy.com/gh/UB-Mannheim/ocr-fileformat/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
[CI status](https://github.com/UB-Mannheim/ocr-fileformat/actions/workflows/ci.yml)
[Releases](https://github.com/UB-Mannheim/ocr-fileformat/releases)
[Docker Hub](https://hub.docker.com/r/ubma/ocr-fileformat)
Validate and transform between OCR file formats (hOCR, ALTO, PAGE, FineReader)

<!-- BEGIN-MARKDOWN-TOC -->
* [Installation](#installation)
* [Docker](#docker)
* [System-wide](#system-wide)
* [Usage](#usage)
* [CLI](#cli)
* [GUI](#gui)
* [API](#api)
* [Transformation](#transformation)
* [Transformation CLI](#transformation-cli)
* [Transformation GUI](#transformation-gui)
* [Transformation API](#transformation-api)
* [Supported Transformations](#supported-transformations)
* [Validation](#validation)
* [Validation CLI](#validation-cli)
* [Validation GUI](#validation-gui)
* [Validation API](#validation-api)
* [Supported Validation Formats](#supported-validation-formats)
* [License](#license)
<!-- END-MARKDOWN-TOC -->
## Installation
### Docker
You can run the [command line scripts](#cli) and [web interface](#gui) as a
[Docker container](https://hub.docker.com/r/ubma/ocr-fileformat), you only need
Docker installed.
To start the web interface on [http://localhost:8080](http://localhost:8080):
```sh
docker run --rm -it -p 8080:8080 ubma/ocr-fileformat
```
To run the command line scripts, mount the directory containing your input
files into the container's `/data` directory:
```sh
docker run --rm -it -v "$PWD":/data ubma/ocr-fileformat ocr-transform alto2.0 hocr somefile.alto
```
### System-wide
To install system-wide to `/usr/local`:
```sh
sudo make install
```
To install without `sudo` to your home directory:
```sh
make install PREFIX=$HOME/.local
```
If `$HOME/.local/bin` is not in your `PATH`, add this to your shell startup file (e.g. `~/.bashrc` or `~/.zshrc`):
```
export PATH="$HOME/.local/bin:$PATH"
```
The web application has a PHP backend. You can deploy it on any PHP-capable
server by copying the [`web`](./web) folder somewhere below the document root
of your server, e.g. `/var/www/html` for Apache on Debian/Ubuntu:
```
sudo -u www-data cp -r web /var/www/html/ocr-fileformat
```
In this example the GUI would be available under [http://localhost/ocr-fileformat/](http://localhost/ocr-fileformat/).
## Usage
The project offers two functionalities, which can be accessed via a command line
script (CLI), using a web interface (GUI) or in your own tools (API).
### CLI
* [`ocr-transform`](./bin/ocr-transform.sh): Transformation of OCR output between OCR formats
* [`ocr-validate`](./bin/ocr-validate.sh): Validation of OCR output against OCR format schemas
### GUI
The web interface is for testing validation and transformations. You can upload
a file or select an input file by URL.
### API
* [`$PREFIX/share/ocr-fileformat/xslt`](./xslt) - XSLT stylesheets
* [`$PREFIX/share/ocr-fileformat/xsd`](./xsd) - XSD schemas
* [`$PREFIX/share/ocr-fileformat/script/transform`](./script/transform) - Transformation scripts
* [`$PREFIX/share/ocr-fileformat/script/validate`](./script/validate) - Validation scripts
## Transformation
### Transformation CLI
```
Usage: ocr-transform [-dl] <input-fmt> <output-fmt> [<input> [<output>]] [-- <saxon_opts>]
```
For example, you can transform an ALTO XML to a hOCR file with:
```sh
ocr-transform alto hocr sample.xml sample.hocr
```
Or convert from ALTO XML (version 2.1) to hOCR with:
```sh
ocr-transform alto2.1 hocr sample.alto sample.hocr
```
You can also pass arguments directly to the Saxon CLI by passing them after a double dash (`--`). For example, to set the `foo` parameter to `bar`:
```sh
ocr-transform alto hocr sample.xml sample.hocr -- foo=bar
```
Try `ocr-transform -h` to get an overview:
<!-- BEGIN-EVAL echo '```';./bin/ocr-transform.sh -h 2>&1;echo '```' -->
```
Usage:
ocr-transform [OPTIONS] <from> <to> [<infile> [<outfile>]] [-- <script-args>]
ocr-transform [OPTIONS] <from> <to> --help-args Show script-args, and exit
ocr-transform [OPTIONS] -h|--help Show this help, and exit
ocr-transform [OPTIONS] -v|--version Show version, and exit
ocr-transform [OPTIONS] -L|--list List available from/to, and exit
Options:
--debug -d Increase debug level by 1, can be repeated
Transformations:
abbyy hocr
abbyy page
alto hocr
alto page
alto text
alto2.0 alto3.0
alto2.0 alto3.1
alto2.0 hocr
alto2.1 alto3.0
alto2.1 alto3.1
alto2.1 hocr
alto4.2 alto2.1
gcv alto
gcv hocr
gcv page
hocr alto
hocr alto2.0
hocr alto2.1
hocr alto3.0
hocr alto4.0
hocr page
hocr tei
hocr text
mybib alto3.0
page alto
page alto_legacy
page hocr
page page2019
page text
tei hocr
textract page
```
<!-- END-EVAL -->
### Transformation GUI
Select the `Transform` menu option. Choose a URL, an input and an output
format. Click `Transform`.
### Transformation API
The stylesheets are installed in `$PREFIX/share/ocr-fileformat/xslt` and can be
used directly in your scripts and software. You will need to use an XSLT 2.0
capable stylesheet transformer.
### Supported Transformations
| From ╲ To | hOCR | ALTO | PAGEXML | TEI | Text |
| ---: | --- | --- | --- | --- | --- |
| hOCR | - | ✓ | ✓ | ✓ | ✓ |
| ALTO | ✓ | ✓ | ✓ | - | ✓ |
| PAGEXML | ✓ | ✓ | ✓ | - | ✓ |
| ABBYY FineReader | ✓ | - | ✓ | - | - |
| Google Cloud Vision | ✓ | ✓ | ✓ | - | - |
| Amazon AWS Textract | - | - | ✓ | - | - |
| TEI | ✓ | - | - | - | - |
## Validation
<!-- BEGIN-EVAL echo '```';./bin/ocr-validate.sh -h 2>&1;echo '```' -->
```
Usage:
ocr-validate [OPTIONS] <schema> <file> [<resultsFile>]
ocr-validate [OPTIONS] -h|--help Show this help, and exit
ocr-validate [OPTIONS] -v|--version Show version, and exit
ocr-validate [OPTIONS] -L|--list List available schemas, and exit
Options:
--debug -d Increase debug level by 1, can be repeated
Schemas:
hocr
alto-1-0 alto-1-1 alto-1-2 alto-1-3 alto-1-4 alto-2-0 alto-2-1 alto-2-2-draft alto-3-0 alto-3-1 alto-3-2-draft alto-4-0 alto-4-1 alto-4-2 alto-4-3
abbyy-6-schema-v1 abbyy-8-schema-v2 abbyy-9-schema-v1 abbyy-10-schema-v1
page-2009-03-16 page-2010-01-12 page-2010-03-19 page-2013-07-15 page-2016-07-15 page-2017-07-15 page-2018-07-15 page-2019-07-15
```
<!-- END-EVAL -->
### Validation CLI
For example, to validate an XML file against the ALTO 3.1 schema:
```
ocr-validate alto-3-1 myFile.alto
```
### Validation GUI
Select the `Validate` menu option. Choose a URL and a schema. Click `Validate`.
### Validation API
The XSD files are installed under `$PREFIX/share/ocr-fileformat/xsd`
### Supported Validation Formats
| | hOCR | ALTO | PAGEXML | FineReader | Google Cloud Vision | Amazon AWS Textract |
| ---: | --- | --- | --- | --- | --- | --- |
| Validation | ✓ | ✓ | ✓ | ✓ | - | - |
## License
This is free software. You may use it under the terms of the [MIT License](LICENSE).
During the installation process several projects are included (in [`./vendor`](./vendor)). These projects have different licenses:
* [Saxon HE 9.7](http://saxon.sourceforge.net/#F9.7HE), [`MPL`](https://www.mozilla.org/MPL/).
* [ALTOXML schema](https://github.com/altoxml/schema), ["Open Source"](https://github.com/altoxml/schema/issues/37#issuecomment-218730230) for ALTO <= 3.1, [`CC BY SA 4.0`](https://creativecommons.org/licenses/by-sa/4.0/legalcode) since ALTO 4.0
* [PAGE schemas](http://www.primaresearch.org/schema/PAGE/gts/pagecontent/), `?`
* [xsd-validator](https://github.com/kba/xsd-validator) by Adrian Mouat [@amouat](https://github.com/amouat), `Apache 2.0`
* ABBYY FineReader XSD, `?`
* [hOCR-to-ALTO](https://github.com/filak/hOCR-to-ALTO) by Filip Kriz [@filak](https://github.com/filak), [`MIT`](https://github.com/filak/hOCR-to-ALTO/blob/master/LICENSE.txt)
* [hocr-spec](https://github.com/kba/hocr-spec-python) by Konstantin Baierer [@kba](https://github.com/kba), [`MIT`](https://github.com/kba/hocr-spec-python/blob/master/LICENSE)
* [gcv2hocr](https://github.com/dinosauria123/gcv2hocr) by Endo Michiaki, [`CC BY 4.0`](https://creativecommons.org/licenses/by/4.0/legalcode)
* [format-converters](https://github.com/OCR-D/format-converters) by OCR-D, [`Apache 2.0`](https://github.com/OCR-D/format-converters/blob/master/LICENSE)
* [prima-page-converter](https://github.com/PRImA-Research-Lab/prima-page-converter/) by PRImA Research Lab , [`Apache 2.0`](https://github.com/PRImA-Research-Lab/prima-page-converter/blob/master/LICENSE)
* [page-to-alto](https://github.com/kba/page-to-alto/) by Konstantin Baierer @kba, [`Apache 2.0`](https://github.com/kba/page-to-alto/blob/master/LICENSE)
* [textract2page](https://github.com/slub/textract2page/) by Arne Rümmler @rue-a, [`Apache 2.0`](https://github.com/slub/textract2page/blob/master/LICENSE)
================================================
FILE: bin/ocr-transform.sh
================================================
#!/usr/bin/env bash
# Root directory of the installation; lib.sh is loaded from here and resolves
# its xslt/, xsd/, script/ and vendor/ paths relative to it.
# Default to the parent dir of this script. Overwritten by `make install`
SHAREDIR="$(readlink -f "$(dirname "$(readlink -f "$0")")/..")"
# Shared helpers: logging, format discovery, exec_saxon / exec_xsdv
source "$SHAREDIR/lib.sh"
#{{{ show_usage ()
# show_usage [<error-message>...]
#
# Print the usage text (STDERR) followed by the list of available
# transformations (STDOUT).
#
# When called with arguments they are reported via logerr first and the script
# exits with status 1 after printing the usage. When called without arguments
# the function returns 0, so callers can safely chain it (`show_usage && ...`).
show_usage () {
    if [[ "$#" -gt 0 ]];then
        logerr "$@"
    fi
    echo >&2 "Usage:
${0##*/} [OPTIONS] <from> <to> [<infile> [<outfile>]] [-- <script-args>]
${0##*/} [OPTIONS] <from> <to> --help-args Show script-args, and exit
${0##*/} [OPTIONS] -h|--help Show this help, and exit
${0##*/} [OPTIONS] -v|--version Show version, and exit
${0##*/} [OPTIONS] -L|--list List available from/to, and exit
Options:
--debug -d Increase debug level by 1, can be repeated
"
    echo >&2 -e "\n${INDENT}Transformations:"
    show_transformations|sed "s/^/${INDENT}${INDENT}/"
    if [[ "$#" -gt 0 ]];then
        exit 1
    fi
    # Explicit success: a trailing `[[ ... ]] && exit 1` would make the
    # function itself return 1 on the no-error path.
    return 0
}
#}}}
#{{{ show_version ()
# Print "<program name> VERSION" to STDOUT, where the program name is the
# basename of $0.
show_version () {
    local progname="${0##*/}"
    echo "$progname VERSION"
}
#}}}
#{{{ main ()
# main <from> <to> [<infile> [<outfile>]] [-- <script-args>]
# main <from> <to> --help-args
#
# Validate the requested transformation, parse the remaining positional
# arguments and run either Saxon (for *.xsl transformers) or the transformer
# script. <infile>/<outfile> default to '-' (STDIN/STDOUT).
main () {
# debug option -d -d to print all commands to the terminal
if (( DEBUG > 1 ));then
set -x
fi
local from="$1" to="$2" infile='-' outfile='-' transformer
# Drop <from> and <to>; the remaining parameters are parsed below
shift 2
# Validate parameters
# (show_usage with a message logs the error and exits with status 1)
if [[ -z "$from" ]];then
show_usage "Must set 'from' parameter"
elif [[ -z "$to" ]];then
show_usage "Must set 'to' parameter"
elif [[ -z "${OCR_TRANSFORMATIONS[$from]}" ]];then
show_usage "No mapping from '$from'"
else
# Space-separated list of output formats available for <from>
declare -a possible=(${OCR_TRANSFORMATIONS[$from]})
if ! in_array "$to" "${possible[@]}";then
show_usage "No mapping from '$from' to '$to'"
fi
fi
# Path of the stylesheet or script registered under '<from>__<to>'
transformer=${OCR_TRANSFORMERS[${from}__${to}]}
# --help-args: describe the extra arguments accepted by this transformer
# on STDERR and exit successfully
if [[ "$1" == '--help-args' ]];then
if [[ "$transformer" = */gcv__hocr ]];then
echo >&2 -e "${INDENT}Extra arguments: <width> <height>"
elif [[ "$transformer" = */page__alto ]];then
echo >&2 -e "${INDENT}page-to-alto options:"
page-to-alto --help|sed '1,/^Options:/d;/--output-file/,$d' >&2
elif [[ "$transformer" = */textract__page ]];then
echo >&2 -e "${INDENT}textract2page arguments: <image-file>"
echo >&2 -e "${INDENT}textract2page options:"
else
# xsl and other transformers both take arbitrary Saxon options
show_saxon_options|sed "s/^/${INDENT}${INDENT}/"
fi
exit 0
fi
declare -a script_args
# <infile>
# A '--' at this position means everything after it is a script argument;
# `set --` then clears the positional parameters so the later checks see nothing.
if [[ "$1" == '--' ]];then
script_args+=("${@:2}")
set --
elif [[ -n "$1" ]];then
infile="$1"
fi
shift
# <outfile>
if [[ "$1" == '--' ]];then
script_args+=("${@:2}")
set --
elif [[ -n "$1" ]];then
outfile="$1"
fi
shift;
# <script-args>
if [[ "$1" == '--' ]];then
script_args+=("${@:2}")
fi
if (( DEBUG > 0 ));then
[[ "$infile" = '-' ]] && logdebug "Reading from STDIN"
[[ "$outfile" = '-' ]] && logdebug "Writing to STDOUT"
fi
# Run it
# Remember the current shell options so they can be restored afterwards,
# then abort on the first failing command while the transformer runs
optstate=$(set +o)
set -o errexit
if [[ "$transformer" = *.xsl ]];then
# Stylesheet: append -xsl:, -s: and (unless writing to STDOUT) -o: to the
# user-supplied Saxon arguments
script_args=("${script_args[@]}" "-xsl:$transformer")
script_args=("${script_args[@]}" "-s:$infile")
[[ "$outfile" != '-' ]] && script_args=("${script_args[@]}" "-o:$outfile")
exec_saxon "${script_args[@]}"
else
# Script: sourced into this shell with <infile> <outfile> <script-args...>
script_args=("$infile" "$outfile" "${script_args[@]}")
source "$transformer" "${script_args[@]}"
fi
# Restore the shell options saved above
eval "$optstate"
}
#}}}
# Consume leading options; the first non-option argument ends option parsing.
while [[ "$1" = -* ]]; do
    case "$1" in
        -d|--debug) let DEBUG+=1 ;;
        -L|--list) show_transformations ; exit 0 ;;
        -h|--help) show_usage ; exit 0 ;;
        -v|--version) show_version ; exit 0 ;;
        # show_usage with a message logs the error, prints the usage and exits
        # with status 1. Chaining `show_usage && exit 1` is not reliable because
        # the exit depends on show_usage's return status.
        *) show_usage "Unknown option '$1'" ; exit 1 ;;
    esac
    shift
done
# Use the bundled Python virtualenv (if any) so Python-based transformers are found
if [[ -d "$SHAREDIR/venv" ]];then
    . "$SHAREDIR/venv/bin/activate"
fi
main "$@"
================================================
FILE: bin/ocr-validate.sh
================================================
#!/usr/bin/env bash
# Default to the parent dir of this script. Overwritten by `make install`
# `readlink -f` canonicalizes the path; plain `readlink` only resolves symlinks
# and prints nothing for a regular file, which would leave SHAREDIR empty.
SHAREDIR="$(readlink -f "$(dirname "$(readlink -f "$0")")/..")"
# Shared helpers: logging, format discovery, exec_saxon / exec_xsdv
source "$SHAREDIR/lib.sh"
#{{{ show_usage ()
# show_usage [<error-message>...]
#
# Print the usage text (STDERR) followed by the list of available schemas
# (STDOUT).
#
# When called with arguments they are reported via logerr first and the script
# exits with status 1 after printing the usage. When called without arguments
# the function returns 0, so callers can safely chain it (`show_usage && ...`).
show_usage () {
    if [[ "$#" -gt 0 ]];then
        logerr "$@"
    fi
    echo >&2 "Usage:
${0##*/} [OPTIONS] <schema> <file> [<resultsFile>]
${0##*/} [OPTIONS] -h|--help Show this help, and exit
${0##*/} [OPTIONS] -v|--version Show version, and exit
${0##*/} [OPTIONS] -L|--list List available schemas, and exit
Options:
--debug -d Increase debug level by 1, can be repeated
"
    echo >&2 -e "\n${INDENT}Schemas:"
    show_schemas|sed "s/^/${INDENT}${INDENT}/"
    # show_schemas does not terminate its last line
    echo
    if [[ "$#" -gt 0 ]];then
        exit 1
    fi
    # Explicit success: a trailing `[[ ... ]] && exit 1` would make the
    # function itself return 1 on the no-error path.
    return 0
}
#}}}
#{{{ show_version ()
# Print "<program name> VERSION" to STDOUT, where the program name is the
# basename of $0.
show_version () {
    local progname="${0##*/}"
    echo "$progname VERSION"
}
#}}}
#{{{ main ()
# main <schema> <file>
#
# Validate <file> (or STDIN when <file> is '-') against <schema>.
# XSD-based schemas are checked with exec_xsdv, all others by sourcing the
# validation script registered in OCR_VALIDATORS.
# Invalid parameters are reported through show_usage, which exits with status 1.
main () {
    # debug option -d -d to print all commands to the terminal
    if (( DEBUG > 1 ));then
        set -x
    fi
    local schema="$1" file="$2" resolved
    shift 2
    if [[ -z "$schema" ]];then
        show_usage "Must set 'schema'"
    elif [[ -z "${OCR_VALIDATORS[$schema]}" ]];then
        show_usage "No such schema '$schema'"
    fi
    if [[ -z "$file" ]];then
        show_usage "Must set 'file'"
    fi
    if [[ "$file" == "-" ]];then
        ((DEBUG > 1)) && loginfo "Reading from STDIN"
    else
        # Canonicalize to an absolute path because exec_xsdv changes the
        # working directory. `-f` is required: plain `readlink` prints nothing
        # for files that are not symlinks.
        resolved=$(readlink -f "$file")
        if [[ -z "$resolved" || ! -e "$resolved" ]];then
            show_usage "No such file: '$file'"
        fi
        file="$resolved"
    fi
    if [[ "${OCR_VALIDATORS[$schema]}" = *.xsd ]];then
        exec_xsdv "$schema" "$file"
    else
        source "${OCR_VALIDATORS[$schema]}" "$file"
    fi
}
#}}}
# Consume leading options; the first non-option argument ends option parsing.
while [[ "$1" = -* ]]; do
    case "$1" in
        --debug|-d) let DEBUG+=1 ;;
        # One schema per line, without trailing whitespace
        --list|-L) show_schemas|sed -e 's/\s*$//' -e 's/ \+/\n/g' ; exit 0 ;;
        --help|-h) show_usage ; exit 0 ;;
        --version|-v) show_version ; exit 0 ;;
        # show_usage with a message logs the error, prints the usage and exits
        # with status 1. Chaining `show_usage && exit 1` is not reliable because
        # the exit depends on show_usage's return status.
        *) show_usage "Unknown option '$1'" ; exit 1 ;;
    esac
    shift
done
# Use the bundled Python virtualenv (if any) so Python-based validators are found
if [[ -d "$SHAREDIR/venv" ]];then
    . "$SHAREDIR/venv/bin/activate"
fi
main "$@"
================================================
FILE: docker.config.php
================================================
<?php
// Overrides for the web application's $config: absolute paths of the
// command line tools.
// NOTE(review): presumably installed as web/config.local.php inside the Docker image - confirm against the Dockerfile.
$config['ocr-validate'] = '/usr/local/bin/ocr-validate';
$config['ocr-transform'] = '/usr/local/bin/ocr-transform';
================================================
FILE: example/.gitignore
================================================
wetzel_reisebegleiter_1901_0021*.alto
wetzel_reisebegleiter_1901_0021*.hocr
wetzel_reisebegleiter_1901_0021*.page
/out
================================================
FILE: example/Makefile
================================================
# Roundtrip example: OCR a sample page and convert the result between formats.
# Sample image:
# https://media.dwds.de/dta/images/wetzel_reisebegleiter_1901/wetzel_reisebegleiter_1901_0021_800px.jpg
BOOK=wetzel_reisebegleiter_1901
PAGE=0021
BASENAME=$(BOOK)_$(PAGE)
DEBIAN_PACKAGES = libxml2-utils tesseract-ocr tesseract-ocr-script-frak wget dwdiff
XMLLINT = xmllint --format
OCR_TRANSFORM = ../bin/ocr-transform.sh
# Use the Homebrew tesseract when building in a Homebrew environment
ifdef HOMEBREW_PREFIX
TESSERACT = $(HOMEBREW_PREFIX)/bin/tesseract -l Fraktur
else
TESSERACT = /usr/bin/tesseract -l Fraktur
endif
WGET = wget
RM = rm -f
DWDIFF = dwdiff -p -l -c
PAGER = less -R
APT_GET = sudo apt-get -y

# hOCR -> ALTO -> hOCR and ALTO -> PAGE
.PHONY: roundtrip
roundtrip: $(BASENAME).roundtrip.hocr $(BASENAME).alto.page

# Install the required Debian/Ubuntu packages
.PHONY: deps
deps:
	$(APT_GET) install $(DEBIAN_PACKAGES)

# Word diff between roundtrip and original hOCR; dwdiff's exit status is
# ignored so that differences do not fail the target
.PHONY: diff
diff: $(BASENAME).roundtrip.hocr $(BASENAME).hocr
	$(DWDIFF) $^ || exit 0

# Same as `diff`, but shown in a pager
.PHONY: idiff
idiff: $(BASENAME).roundtrip.hocr $(BASENAME).hocr
	$(DWDIFF) $^ | $(PAGER)

# Download the sample image (same URL as documented at the top, via HTTPS)
$(BASENAME)_800px.jpg:
	$(WGET) https://media.dwds.de/dta/images/$(BOOK)/$(BASENAME)_800px.jpg

$(BASENAME).hocr : $(BASENAME)_800px.jpg
	$(TESSERACT) $< stdout hocr | $(XMLLINT) - > $@

$(BASENAME).alto : $(BASENAME).hocr
	$(OCR_TRANSFORM) hocr alto2.0 $< | $(XMLLINT) - > $@

$(BASENAME).alto.page : $(BASENAME).alto
	$(OCR_TRANSFORM) alto page $< | $(XMLLINT) - > $@

$(BASENAME).alto.page.alto : $(BASENAME).alto.page
	$(OCR_TRANSFORM) page alto $< | $(XMLLINT) - > $@

$(BASENAME).roundtrip.hocr : $(BASENAME).alto
	$(OCR_TRANSFORM) alto hocr $< | $(XMLLINT) - > $@

# Remove all generated OCR files, including the *.page files built by `roundtrip`
.PHONY: clean
clean:
	$(RM) $(BASENAME)*.hocr $(BASENAME)*.alto $(BASENAME)*.page
================================================
FILE: example/README.md
================================================
# Testing transformations
Install dependencies. For Debian/Ubuntu:
make deps
Run a roundtrip example:
make roundtrip
This will:
* download image (`-> x.jpg`)
* OCR the image (`-> x.hocr`)
* hOCR -> ALTO 2.0 (`-> x.alto`)
* ALTO 2.0 -> hOCR (`-> x.roundtrip.hocr`)
To see the information lost/added:
make diff
This will compare `x.hocr` to `x.roundtrip.hocr` using `dwdiff`. Use `make idiff` to open the result in a pager.
## License
The example data is from the [Deutsches Textarchiv](https://www.deutschestextarchiv.de/book/show/wetzel_reisebegleiter_1901) project, data is licensed CC BY-NC 3.0.
================================================
FILE: lib.sh
================================================
#!/usr/bin/env bash
#{{{ Logging
# ANSI escape sequences used by the log functions below. They are only set
# when the environment indicates a color-capable terminal; otherwise the
# variables stay unset and expand to nothing in the log messages.
if [[ -n "$COLORTERM" || "$TERM" = *color* || "$TERM" = xterm* ]];then
COLOR_ERROR="\033[1;31m"
COLOR_INFO="\033[1;32m"
COLOR_DEBUG="\033[1;34m"
COLOR_DEFAULT="\033[0m"
fi
# logerr <message>...
#
# Print every non-empty line of every argument to STDERR, each prefixed with
# an [ERROR] tag. Backslash escapes in the message are interpreted (echo -e).
logerr () {
    # With newline as the first IFS character "$*" joins all arguments with
    # newlines; reading that back line by line avoids the pathname expansion
    # an unquoted $* would apply to messages containing *, ? or [.
    local IFS=$'\n' line
    while read -r line; do
        if [[ -n "$line" ]];then
            echo -e "${COLOR_DEFAULT}[${COLOR_ERROR}ERROR${COLOR_DEFAULT}] $line" >&2
        fi
    done <<< "$*"
}
# loginfo <message>...
# Print the arguments as a single [INFO] line to STDERR.
loginfo () {
    local message="$*"
    echo -e "${COLOR_DEFAULT}[${COLOR_INFO}INFO${COLOR_DEFAULT}] $message" >&2
}
# logdebug <message>...
# Print the arguments as a single [DEBUG] line to STDERR.
logdebug () {
    local message="$*"
    echo -e "${COLOR_DEFAULT}[${COLOR_DEBUG}DEBUG${COLOR_DEFAULT}] $message" >&2
}
#}}}
# Everything below locates its resources relative to $SHAREDIR, so refuse to
# continue when the sourcing script has not set it to an existing directory.
if [[ -z "$SHAREDIR" || ! -d "$SHAREDIR" ]];then
logerr "Set \$SHAREDIR before sourcing $0"
exit 1
fi
#{{{ utils (in_array)
# in_array <needle> [<element>...]
# Return 0 if <needle> is string-equal to one of the elements, 1 otherwise.
in_array () {
    local needle="$1" candidate
    for candidate in "${@:2}"; do
        if [[ "$candidate" == "$needle" ]]; then
            return 0
        fi
    done
    return 1
}
#}}}
#{{{ Global vars
# Debug level, increased by the -d/--debug option of the CLI scripts
export DEBUG=0
# Indentation unit used when printing usage messages
export INDENT=" "
# Mapping 'fmt' -> 'fmt2 fmt3 fmt4'
# (input format -> space-separated list of possible output formats)
declare -Ax OCR_TRANSFORMATIONS=()
# Mapping 'fmt' -> '/path-to-xslt-or-transform-script'
# (the key is the file's base name without extension, i.e. '<from>__<to>')
declare -Ax OCR_TRANSFORMERS=()
# Mapping 'fmt' -> '/path-to-xsd-or-validate-script'
# (the key is the file's base name without extension)
declare -Ax OCR_VALIDATORS=()
#}}}
#{{{ Set up validation and transformation formats
# setup_transformations ()
#
# Discover the available transformers and fill OCR_TRANSFORMERS
# ('<from>__<to>' -> path) and OCR_TRANSFORMATIONS ('<from>' -> '<to> <to>...').
# A transformer is any non-directory below $SHAREDIR/xslt or
# $SHAREDIR/script/transform that is named '*.xsl' or matches -perm -005.
setup_transformations () {
    # Sorted like in setup_validations, so the result does not depend on the
    # directory traversal order of find.
    declare -a transformers=($(
        find -L "$SHAREDIR/xslt" "$SHAREDIR/script/transform" \
            ! -type d \( -name '*.xsl' -or -perm -005 \) \
            |sort))
    # Keep the loop variables local: this file is sourced, so they would
    # otherwise leak into the calling script.
    local path fmt in_fmt out_fmt
    for path in "${transformers[@]}";do
        # '/some/dir/<from>__<to>.xsl' -> '<from>__<to>'
        fmt=${path##*/}
        fmt=${fmt%.*}
        OCR_TRANSFORMERS[$fmt]="$path"
        in_fmt=${fmt%%__*}
        out_fmt=${fmt##*__}
        if [[ -z "${OCR_TRANSFORMATIONS[$in_fmt]}" ]];then
            OCR_TRANSFORMATIONS[$in_fmt]="$out_fmt"
        else
            OCR_TRANSFORMATIONS[$in_fmt]+=" $out_fmt"
        fi
    done
}
# setup_validations ()
#
# Discover the available validators and fill OCR_VALIDATORS (schema -> path).
# A validator is any non-directory below $SHAREDIR/xsd or
# $SHAREDIR/script/validate that is named '*.xsd' or matches -perm -005.
setup_validations () {
declare -a validators=($(
find -L "$SHAREDIR/xsd" "$SHAREDIR/script/validate" \
! -type d \( -name '*.xsd' -or -perm -005 \) \
|sort))
local path fmt
for path in "${validators[@]}";do
# '/some/dir/<schema>.xsd' -> '<schema>'
fmt=${path##*/}
fmt=${fmt%.*}
OCR_VALIDATORS[$fmt]="$path"
done
}
# setup ()
# Populate the transformation and validation tables.
setup () {
setup_transformations
setup_validations
}
# Runs as soon as this file is sourced
setup
#}}}
#{{{ List transformations, validations, saxon options
# show_schemas ()
#
# Print the names of all known schemas to STDOUT. Each name is followed by a
# space; a newline is emitted whenever the part of the name before the first
# '-' changes. The output does not end with a newline.
show_schemas() {
local schema schemagroup
# One schema name per line, sorted with '-' as field separator:
# numerically on the key starting at field 2, then on the key starting at field 1
declare -a sorted=($(IFS=$'\n'; echo "${!OCR_VALIDATORS[*]}"|sort -t- -nk2 -k1))
for schema in "${sorted[@]}";do
# Start a new line when the group (text before the first '-') changes
[[ -n "$schemagroup" && "$schemagroup" != ${schema%%-*} ]] && echo
echo -n "$schema "
schemagroup=${schema%%-*}
done
}
# show_transformations ()
#
# Print every available transformation as a '<from> <to>' line to STDOUT,
# sorted.
show_transformations() {
    local source_fmt target_fmt
    for source_fmt in "${!OCR_TRANSFORMATIONS[@]}";do
        # The value is a space-separated list, so it is split into words on purpose
        for target_fmt in ${OCR_TRANSFORMATIONS[$source_fmt]};do
            echo "${source_fmt} ${target_fmt}"
        done
    done|sort
}
# show_saxon_options ()
#
# Print Saxon's command line help. Saxon is invoked without a source file;
# sed then drops everything up to and including the first line matching
# 'No source file' as well as lines containing 'Format:'.
show_saxon_options () {
exec_saxon -t 2>&1|sed -e '0,/No source file/ d' -e '/Format:/ d'
}
#}}}
#{{{ run saxon / xsd-validator (xsdv.sh)
# exec_saxon [<saxon-arg>...]
#
# Run the bundled Saxon ($SHAREDIR/vendor/saxon.jar) with the given arguments.
# With DEBUG > 0 the command line is logged; with DEBUG > 1 Saxon's -t option
# is added (unless the caller already passed it). The return status is that
# of the java process.
exec_saxon() {
    local -a saxon_args=("$@")
    local arg has_t=0
    if (( DEBUG > 1 ));then
        for arg in "${saxon_args[@]}";do
            if [[ "$arg" == '-t' ]];then
                has_t=1
            fi
        done
        # Append -t to the arguments that are actually passed to Saxon
        if (( has_t == 0 ));then
            saxon_args+=('-t')
        fi
    fi
    if (( DEBUG > 0 ));then
        loginfo Executing "java -jar $SHAREDIR/vendor/saxon.jar" "${saxon_args[@]}"
    fi
    java -jar "$SHAREDIR/vendor/saxon.jar" "${saxon_args[@]}"
}
# exec_xsdv <schema> <file>
#
# Validate <file> against $SHAREDIR/xsd/<schema>.xsd with xsd-validator.
# xsdv.sh is started from within its own directory, so <file> must not be
# relative to the caller's working directory. The working directory of the
# calling shell is changed. Returns 1 if the xsd-validator directory cannot
# be entered, otherwise the status of xsdv.sh.
exec_xsdv() {
    local schema="$1" file="$2"
    if ! cd "$SHAREDIR/vendor/xsd-validator";then
        logerr "Cannot change to directory '$SHAREDIR/vendor/xsd-validator'"
        return 1
    fi
    if ((DEBUG > 0));then
        loginfo "PWD: '$PWD'"
        loginfo "./xsdv.sh '$SHAREDIR/xsd/${schema}.xsd' '$file'"
    fi
    ./xsdv.sh "$SHAREDIR/xsd/${schema}.xsd" "$file"
}
#}}}
================================================
FILE: script/transform/README.md
================================================
Scripts should be named `<from>__<to>`, e.g. `hocr-1.0__abbyy-10`.
Will be called as
```
/script/transform/<from>__<to> <infile> <outfile> <additional-args>
```
Both `<infile>` and `<outfile>` can be `-`, in which case input should be read
from STDIN or written to STDOUT.
================================================
FILE: script/transform/alto__page
================================================
#!/bin/bash
# Transform ALTO to PAGE XML with the JPageConverter from the vendor directory.
#
# Usage: alto__page <infile> <outfile> [<argument>]
#   <infile>   input file, '-' to read STDIN (buffered in a temporary file)
#   <outfile>  output file, '-' to write to STDOUT (via a temporary file)
#   <argument> optional; only used when writing to STDOUT: the result is then
#              passed through Saxon's query processor with this extra argument
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that installations below a path containing spaces work
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/" && pwd)"
JAR="$VENDORDIR/JPageConverter/PageConverter.jar"
INFILE="$1"
OUTFILE="$2"
ARGUMENT="$3"
if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi
# The converter's STDERR is merged into STDOUT
java -jar "$JAR" -neg-coords toZero -source-xml "$INFILE" -target-xml "$OUTFILE" -convert-to LATEST 2>&1
if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    if [[ -z "$ARGUMENT" ]]; then
        cat "$OUTFILE"
    else
        java -cp "$VENDORDIR/saxon.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT"
    fi
    rm "$OUTFILE"
fi
================================================
FILE: script/transform/gcv__alto
================================================
#!/bin/bash
# Transform Google Cloud Vision JSON to ALTO with the JPageConverter from the
# vendor directory.
#
# Usage: gcv__alto <infile> <outfile> [<argument>]
#   <infile>   input file, '-' to read STDIN (buffered in a temporary file)
#   <outfile>  output file, '-' to write to STDOUT (via a temporary file)
#   <argument> optional; only used when writing to STDOUT: the result is then
#              passed through Saxon's query processor with this extra argument
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that installations below a path containing spaces work
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/" && pwd)"
JAR="$VENDORDIR/JPageConverter/PageConverter.jar"
INFILE="$1"
OUTFILE="$2"
ARGUMENT="$3"
if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi
# The converter's STDERR is merged into STDOUT
java -jar "$JAR" -neg-coords toZero -source-json "$INFILE" -target-xml "$OUTFILE" -convert-to ALTO 2>&1
if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    if [[ -z "$ARGUMENT" ]]; then
        cat "$OUTFILE"
    else
        java -cp "$VENDORDIR/saxon.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT"
    fi
    rm "$OUTFILE"
fi
================================================
FILE: script/transform/gcv__hocr
================================================
#!/bin/bash
# Transform Google Cloud Vision JSON to hOCR with gcv2hocr from the vendor
# directory.
#
# Usage: gcv__hocr <infile> <outfile> [<width> [<height>]]
#   <infile>   input file, '-' to read STDIN (buffered in a temporary file)
#   <outfile>  output file, '-' to write to STDOUT (via a temporary file)
#   <width>    passed on to gcv2hocr, defaults to 2000
#   <height>   passed on to gcv2hocr, defaults to 2000
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that installations below a path containing spaces work
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/" && pwd)"
VENDORSCRIPT="$VENDORDIR/gcv2hocr/gcv2hocr"
INFILE="$1"
OUTFILE="$2"
#TODO
WIDTH=${3:-2000}
HEIGHT=${4:-2000}
if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi
"$VENDORSCRIPT" "$INFILE" "$OUTFILE" "$WIDTH" "$HEIGHT"
if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    cat "$OUTFILE"
    rm "$OUTFILE"
fi
# Remove scratch files from the working directory (presumably left behind by
# gcv2hocr). -f: a missing file must not turn a successful run into a failure,
# since ocr-transform sources this script with errexit enabled.
rm -f preout1.txt preout2.txt
================================================
FILE: script/transform/gcv__page
================================================
#!/bin/bash
# Transform Google Cloud Vision JSON to PAGE XML with the JPageConverter from
# the vendor directory.
#
# Usage: gcv__page <infile> <outfile> [<argument>]
#   <infile>   input file, '-' to read STDIN (buffered in a temporary file)
#   <outfile>  output file, '-' to write to STDOUT (via a temporary file)
#   <argument> optional; only used when writing to STDOUT: the result is then
#              passed through Saxon's query processor with this extra argument
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that installations below a path containing spaces work
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/" && pwd)"
JAR="$VENDORDIR/JPageConverter/PageConverter.jar"
INFILE="$1"
OUTFILE="$2"
ARGUMENT="$3"
if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi
# The converter's STDERR is merged into STDOUT
java -jar "$JAR" -neg-coords toZero -source-json "$INFILE" -target-xml "$OUTFILE" -convert-to LATEST 2>&1
if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    if [[ -z "$ARGUMENT" ]]; then
        cat "$OUTFILE"
    else
        java -cp "$VENDORDIR/saxon.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT"
    fi
    rm "$OUTFILE"
fi
================================================
FILE: script/transform/page__alto
================================================
#!/bin/bash
# Transform PAGE XML to ALTO with the page-to-alto command line tool.
#
# Usage: page__alto <infile> <outfile> [<page-to-alto-option>...]
#   <infile>   input file, '-' to read STDIN (buffered in a temporary file)
#   <outfile>  output file, '-' to write to STDOUT (via a temporary file)
#   All further arguments are passed on to page-to-alto.
#
# If page-to-alto fails, the output file is removed and the shell exits with
# page-to-alto's status.
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that installations below a path containing spaces work
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/" && pwd)"
INFILE="$1"
OUTFILE="$2"
ARGUMENTS=("${@:3}")
if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi
# ocr-transform sources this script with errexit enabled. Capturing the status
# in an || list keeps the shell alive on failure so the cleanup below runs.
retval=0
page-to-alto "${ARGUMENTS[@]}" -O "$OUTFILE" "$INFILE" || retval="$?"
if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if (( retval > 0 )); then
    # -f: the output file may not have been created at all
    rm -f "$OUTFILE"
    exit $retval
fi
if [[ "$2" = "-" ]]; then
    cat "$OUTFILE"
    rm "$OUTFILE"
fi
================================================
FILE: script/transform/page__alto_legacy
================================================
#!/bin/bash
# Transform PAGE XML to ALTO with the JPageConverter from the vendor directory.
#
# Usage: page__alto_legacy <infile> <outfile> [<argument>]
#   <infile>   input file, '-' to read STDIN (buffered in a temporary file)
#   <outfile>  output file, '-' to write to STDOUT (via a temporary file)
#   <argument> optional; only used when writing to STDOUT: the result is then
#              passed through Saxon's query processor with this extra argument
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that installations below a path containing spaces work
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/" && pwd)"
JAR="$VENDORDIR/JPageConverter/PageConverter.jar"
INFILE="$1"
OUTFILE="$2"
ARGUMENT="$3"
if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi
# The converter's STDERR is merged into STDOUT
java -jar "$JAR" -neg-coords toZero -source-xml "$INFILE" -target-xml "$OUTFILE" -convert-to ALTO 2>&1
if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    if [[ -z "$ARGUMENT" ]]; then
        cat "$OUTFILE"
    else
        java -cp "$VENDORDIR/saxon.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT"
    fi
    rm "$OUTFILE"
fi
================================================
FILE: script/transform/textract__page
================================================
#!/bin/bash
# Transform Amazon AWS Textract JSON to PAGE XML with the textract2page command
# line tool.
#
# Usage: textract__page <infile> <outfile> <image-file> [<textract2page-option>...]
#   <infile>     input file, '-' to read STDIN (buffered in a temporary file)
#   <outfile>    output file, '-' to write to STDOUT (via a temporary file)
#   <image-file> first extra argument, passed to textract2page after <infile>
#   All further arguments are passed on to textract2page as options.
#
# If textract2page fails, the output file is removed and the shell exits with
# textract2page's status.
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that installations below a path containing spaces work
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/" && pwd)"
INFILE="$1"
OUTFILE="$2"
ARGUMENTS=("${@:3}")
if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi
# ocr-transform sources this script with errexit enabled. Capturing the status
# in an || list keeps the shell alive on failure so the cleanup below runs.
retval=0
textract2page "${ARGUMENTS[@]:1}" -O "$OUTFILE" "$INFILE" "${ARGUMENTS[0]}" || retval="$?"
if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if (( retval > 0 ));then
    # -f: the output file may not have been created at all
    rm -f "$OUTFILE"
    exit $retval
fi
if [[ "$2" = "-" ]]; then
    cat "$OUTFILE"
    rm "$OUTFILE"
fi
================================================
FILE: script/validate/README.md
================================================
Scripts here will be called by `ocr-validate`.
Name should be the format and version, lowercase letters, numbers and dash only.
================================================
FILE: script/validate/hocr
================================================
#!/bin/bash
# Validate an hOCR file with hocr-spec from the vendor directory.
#
# Usage: hocr <file>
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HOCR_SPEC="$SCRIPTDIR/../../vendor/hocr-spec-python/hocr-spec"
# Report format: "xml" by default, "ansi" when $TERM contains "color"
format="xml"
if [[ "$TERM" = *"color"* ]];then
format="ansi"
fi
python3 "$HOCR_SPEC" -f "$format" -p relaxed --filename "STDIN" "$1"
================================================
FILE: vendor/Makefile
================================================
# Tools
MKDIR = mkdir -p
RM = rm -rfv
UNZIP = unzip -o
WGET = wget --progress=bar:force --no-verbose
PIP = pip3
# Saxon HE: release to download; saxon.jar is a symlink to the versioned jar
SAXON_HE_VERSION_MAJOR = 11
SAXON_HE_VERSION_MINOR = 2
SAXON_HE_ZIP = SaxonHE$(SAXON_HE_VERSION_MAJOR)-$(SAXON_HE_VERSION_MINOR)J.zip
SAXON_HE_URL = https://netcologne.dl.sourceforge.net/project/saxon/Saxon-HE/$(SAXON_HE_VERSION_MAJOR)/Java/$(SAXON_HE_ZIP)
SAXON_HE_JAR = saxon-he-$(SAXON_HE_VERSION_MAJOR).$(SAXON_HE_VERSION_MINOR).jar
# PAGE schemas: one XSD per version, stored as page-<version>.xsd
PAGE_SCHEMA_REPO = page-schema
PAGE_SCHEMA_VERSIONS = 2009-03-16 2010-01-12 2010-03-19 2013-07-15 2016-07-15 2017-07-15 2018-07-15 2019-07-15
PAGE_SCHEMA_BASE_URL = https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/PAGE-release/gts/pagecontent
# ABBYY FineReader schemas: one XSD per version, stored as abbyy-<version>.xsd
ABBYY_SCHEMA_REPO = abbyy-schema
ABBYY_SCHEMA_BASE_URL = https://fr7.abbyy.com/FineReader_xml/FineReader
ABBYY_SCHEMA_VERSIONS = 6-schema-v1 8-schema-v2 9-schema-v1 10-schema-v1
# JPageConverter release, unpacked into $(ALTO2PAGE_DIR)
ALTO2PAGE_VERSION_MAJOR_MINOR = 1.5
ALTO2PAGE_VERSION = $(ALTO2PAGE_VERSION_MAJOR_MINOR).06
ALTO2PAGE_ZIP = JPageConverter.zip
ALTO2PAGE_URL = https://github.com/UB-Mannheim/prima-page-converter/releases/download/$(ALTO2PAGE_VERSION)/JPageConverter_$(ALTO2PAGE_VERSION).zip
ALTO2PAGE_DIR = JPageConverter
# {{{
# SAXON_BROWSER_VERSION = 1.1
# SAXON_BROWSER_ZIP = Saxon-CE_$(SAXON_BROWSER_VERSION).zip
# SAXON_BROWSER_JS = TODO
# SAXON_BROWSER_URL = http://www.saxonica.com/ce/download/$(SAXON_BROWSER_ZIP)
# $(SAXON_BROWSER_JS): $(SAXON_BROWSER_ZIP)
# $(SAXON_BROWSER_ZIP):
# wget -O '$@' '$(SAXON_BROWSER_URL)'
#}}}
# Targets that do not correspond to files produced by their recipe. `clean` is
# included so that a stray file named "clean" cannot disable the target.
.PHONY: all check clean $(PAGE_SCHEMA_REPO) $(ABBYY_SCHEMA_REPO) gcv2hocr page-to-alto textract2page format-converters

# Download/build all vendored dependencies
all:\
	$(PAGE_SCHEMA_REPO)\
	$(ABBYY_SCHEMA_REPO)\
	gcv2hocr \
	saxon.jar \
	$(ALTO2PAGE_DIR) \
	page-to-alto \
	textract2page \
	format-converters

# Remove downloaded and unpacked artifacts
clean:
	$(RM) $(SAXON_HE_JAR) saxon.jar
	$(RM) $(SAXON_HE_ZIP)
	$(RM) $(PAGE_SCHEMA_REPO)
	$(RM) $(ALTO2PAGE_DIR)
	$(RM) $(ALTO2PAGE_ZIP)

# Verify that the tools needed for downloading and unpacking are installed
check:
	@which wget >/dev/null || (echo "Missing wget. Please install package wget." && exit 1)
	@which unzip >/dev/null || (echo "Missing unzip. Please install package unzip." && exit 1)
# Download every ABBYY schema version that is not present yet
# (remote name: <base-url><version>.xml, local name: abbyy-<version>.xsd)
$(ABBYY_SCHEMA_REPO):
@$(MKDIR) "$@" && cd "$@" && \
for version in $(ABBYY_SCHEMA_VERSIONS);do \
xsd=abbyy-$$version.xsd; if [ ! -e $$xsd ];then \
$(WGET) -O $$xsd $(ABBYY_SCHEMA_BASE_URL)$$version.xml; \
fi; \
done;
# Download every PAGE schema version that is not present yet
# (remote name: <base-url>/<version>/pagecontent.xsd, local name: page-<version>.xsd)
$(PAGE_SCHEMA_REPO):
@$(MKDIR) "$@" && cd "$@" && \
for version in $(PAGE_SCHEMA_VERSIONS);do \
xsd=page-$$version.xsd; if [ ! -e $$xsd ];then \
$(WGET) -O $$xsd $(PAGE_SCHEMA_BASE_URL)/$$version/pagecontent.xsd; \
fi; \
done;
# Version-independent name for the Saxon jar
saxon.jar: $(SAXON_HE_JAR)
ln -sf "$<" "$@"
$(SAXON_HE_JAR): $(SAXON_HE_ZIP)
$(UNZIP) "$<"
$(SAXON_HE_ZIP):
$(WGET) -O "$@" "$(SAXON_HE_URL)"
# Built with the Makefile inside the gcv2hocr directory
gcv2hocr:
$(MAKE) -C $@
$(ALTO2PAGE_ZIP):
$(WGET) -O "$@" "$(ALTO2PAGE_URL)"
# The archive unpacks to "JPageConverter <version>"; rename it to a
# version-independent directory, replacing any previous copy
$(ALTO2PAGE_DIR): $(ALTO2PAGE_ZIP)
$(UNZIP) "$<"
rm -rf "$@"
mv "JPageConverter $(ALTO2PAGE_VERSION)" "$@"
# Python tools: installed with pip from the directory of the same name
page-to-alto:
cd "$@"; $(PIP) install .
textract2page:
cd "$@"; $(PIP) install .
format-converters:
cd "$@"; $(PIP) install .
================================================
FILE: web/config.php
================================================
<?php
// This file must be included by a script that defines 'IncludingScript';
// requesting it directly terminates with an error message.
if (!defined('IncludingScript')) {
die('Direct access not permitted');
}
// We don't want ANSI coloring.
putenv('TERM=dumb');
// Defaults: command line tools relative to this directory. The 'formats'
// lists start empty and are filled by buildFormatList().
$config = [
'ocr-validate' => dirname(__FILE__) . '/../bin/ocr-validate.sh',
'ocr-transform' => dirname(__FILE__) . '/../bin/ocr-transform.sh',
'formats' => [
'transform' => [],
'validate' => [],
],
];
// Optional local overrides of $config, loaded when config.local.php exists
// next to this file
$local_settings = dirname(__FILE__) . '/config.local.php';
if (file_exists($local_settings) === TRUE) {
include $local_settings;
}
/**
 * Fill $config['formats'] from the output of the command line tools.
 *
 * 'transform' maps every input format to the list of its output formats, as
 * reported by `ocr-transform -L` (one whitespace-separated from/to pair per
 * line). 'validate' receives the output lines of `ocr-validate -L`.
 */
function buildFormatList()
{
    global $config;
    $output = [];
    exec($config['ocr-transform'] . ' -L', $output);
    foreach ($output as $entry) {
        $parts = preg_split("/\s+/", $entry);
        $source = $parts[0];
        $target = $parts[1];
        if (array_key_exists($source, $config['formats']['transform'])) {
            $config['formats']['transform'][$source][] = $target;
        } else {
            $config['formats']['transform'][$source] = [$target];
        }
    }
    exec($config['ocr-validate'] . ' -L', $config['formats']['validate']);
}
buildFormatList();
return $config;
================================================
FILE: web/index.html
================================================
<!doctype HTML>
<html lang="en">
<head>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" rel="stylesheet"/>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/notie/3.2.0/notie.css"/>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.9.0/themes/prism.min.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.9.0/themes/prism-coy.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/github-fork-ribbon-css/0.2.0/gh-fork-ribbon.min.css" />
<link rel="stylesheet" href="ocr-fileformat.css"/>
<link rel="icon" type="image/png" sizes="32x32" href="favicon.png">
<title>OCR Fileformat</title>
</head>
<body>
<!-- Static navbar -->
<nav class="navbar navbar-inverse">
<div class="container-fluid">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a class="navbar-brand" href="#">OCR Fileformat</a>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li class="active"><a data-toggle="tab" href="#transform">Transform</a></li>
<li class=""><a data-toggle="tab" href="#validate">Validate</a></li>
<li><a data-toggle="tab" href="#help">Help</a></li>
</ul>
</div><!--/.nav-collapse -->
</div><!--/.container-fluid -->
</nav>
<div class="container">
<section class="tab-content">
<div class="tab-pane active" id="transform">
<div class="row">
<div class="col-xs-3">
<!-- Nav tabs -->
<ul class="nav nav-tabs nav-justified" role="tablist">
<li role="presentation" class="active">
<a role="tab" data-toggle="tab" href="#transform-url-tab" tabindex=-1>URL</a>
</li>
<li role="presentation">
<a role="tab" data-toggle="tab" href="#transform-file-tab" tabindex=-1>Upload</a>
</li>
</ul>
</div>
</div>
<div class="row">
<div class="col-xs-6">
<!-- Tab panes -->
<div class="tab-content input">
<div role="tabpanel" class="tab-pane active" id="transform-url-tab">
<input id="transform-url"
class="form-control"
type="url"
placeholder="http://example.org/xml"
style="width:100%" />
</div>
<div role="tabpanel" class="tab-pane" id="transform-file-tab">
<input id="transform-file"
class="form-control"
type="file"
style="width:100%" />
</div>
</div>
</div>
<div class="col-xs-6">
<div class="form-inline formats" role="form">
<select id="transform-from" class="form-control" disabled>
<option disabled selected value> -- input -- </option>
</select>
<select id="transform-to" class="form-control" disabled>
<option disabled selected value> -- output -- </option>
</select>
<button id="transform-submit" class="btn btn-success" disabled>
Transform
<span class="hidden glyphicon glyphicon-refresh spinning"></span>
</button>
</div>
</div>
</div>
<div class="row">
<div class="col-xs-12">
<div id="transform-result" class="result hidden">
<div class="btn-group btn-group-sm" role="group">
<a class="btn btn-default btn-primary download">
<span class="glyphicon glyphicon-download"> </span>Download
</a>
</div>
<pre>
<code class="language-markup">
<code></code>
</code>
</pre>
</div>
</div>
</div>
</div>
<div class="tab-pane" id="validate">
<div class="row">
<div class="col-xs-6">
<!-- Tab panes -->
<div class="tab-content input">
<div role="tabpanel" class="tab-pane active" id="validate-url-tab">
<input id="validate-url"
class="form-control"
type="url"
placeholder="http://example.org/xml"
style="width:100%" />
</div>
<div role="tabpanel" class="tab-pane" id="validate-file-tab">
<input id="validate-file"
class="form-control"
type="file"
style="width:100%" />
</div>
</div>
<!-- Nav tabs -->
<ul class="nav nav-tabs nav-justified" role="tablist">
<li role="presentation" class="active">
<a role="tab" data-toggle="tab" href="#validate-url-tab" tabindex=-1>URL</a>
</li>
<li role="presentation">
<a role="tab" data-toggle="tab" href="#validate-file-tab" tabindex=-1>Upload</a>
</li>
</ul>
</div>
<div class="col-xs-6">
<div class="form-inline formats" role="form">
<select id="validate-format" class="form-control">
<option disabled selected value> -- format -- </option>
</select>
<button id="validate-submit" class="btn btn-success" disabled>
Validate
<span class="hidden glyphicon glyphicon-refresh spinning"></span>
</button>
</div>
</div>
</div>
<div class="row">
<div class="col-xs-12">
<div class="result hidden" id="validate-result">
<pre>
<code class="language-markup">
<code></code>
</code>
</pre>
</div>
</div>
</div>
</div>
<div class="tab-pane" id="help">
<h3>Examples</h3>
<h4>ABBYY FineReader</h4>
<ul>
<li><a href="https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/Testseiten/abbyy/417576986_0031.xml">https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/Testseiten/abbyy/417576986_0031.xml</a></li>
</ul>
<h4>hOCR</h4>
<ul>
<li><a href="https://cdn.rawgit.com/kba/ocr-fileformat-samples/master/samples/hocr/1.1/wetzel_reisebegleiter_1901_0021.hocr">https://cdn.rawgit.com/kba/ocr-fileformat-samples/master/samples/hocr/1.1/wetzel_reisebegleiter_1901_0021.hocr</a></li>
<li><a href="https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/0001-tesseract.hocr">https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/0001-tesseract.hocr</a></li>
</ul>
<h4>ALTO</h4>
<ul>
<li><a href="http://chroniclingamerica.loc.gov/lccn/sn86069133/1910-10-31/ed-1/seq-1/ocr.xml">http://chroniclingamerica.loc.gov/lccn/sn86069133/1910-10-31/ed-1/seq-1/ocr.xml</a></li>
<li><a href="https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/Testseiten/alto/417576986_0031.xml">https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/Testseiten/alto/417576986_0031.xml</a></li>
<li><a href="https://rawgit.com/kba/ocr-fileformat-samples/master/samples/alto/2.0/wetzel_reisebegleiter_1901_0021.alto">https://rawgit.com/kba/ocr-fileformat-samples/master/samples/alto/2.0/wetzel_reisebegleiter_1901_0021.alto</a></li>
</ul>
<h4>PAGE XML</h4>
<ul>
<li><a href="https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/documentation/example/SimplePage.xml">https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/documentation/example/SimplePage.xml</a></li>
<li><a href="https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/pagecontent/examples/aletheiaexamplepage.xml">https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/pagecontent/examples/aletheiaexamplepage.xml</a></li>
</ul>
<h3>Source Code</h3>
<a href="https://github.com/UB-Mannheim/ocr-fileformat">https://github.com/UB-Mannheim/ocr-fileformat</a>
</div>
</section>
</div>
<a class="github-fork-ribbon left-bottom"
href="https://github.com/UB-Mannheim/ocr-fileformat"
title="Fork me on GitHub">Fork me on GitHub</a>
<script src="https://code.jquery.com/jquery-2.2.4.js" integrity="sha256-iT6Q9iMJYuQiMWNd9lDyBUStIq/8PuOW33aOqmvFpqI=" crossorigin="anonymous"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
<!-- <script src="https://cdn.rawgit.com/rndme/download/master/download.min.js"></script> -->
<script src="https://rawgit.com/notifyjs/notifyjs/master/dist/notify.js"></script>
<script src="ocr-fileformat.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.9.0/prism.min.js" integrity="sha512-KnX1xdVSdEHliREuSgUX9kgmit/Wk63n4X3cjoWfISEVsi2Qi2NW88dYyKXCCS8YcFMgzHywK3BIafTfhK2Tig==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
</body>
</html>
================================================
FILE: web/ocr-fileformat.css
================================================
/* Endless one-second linear rotation for Glyphicons carrying the "spinning" class. */
.glyphicon.spinning {
animation: spin 1s infinite linear;
-webkit-animation: spin2 1s infinite linear;
}
/* One full turn per cycle; scale(1) leaves the icon size unchanged. */
@keyframes spin {
from { transform: scale(1) rotate(0deg); }
to { transform: scale(1) rotate(360deg); }
}
/* WebKit-prefixed rotation referenced by the -webkit-animation declaration above. */
@-webkit-keyframes spin2 {
from { -webkit-transform: rotate(0deg); }
to { -webkit-transform: rotate(360deg); }
}
/* Cap result containers at 75% of the viewport height. */
.result {
max-height: 75vh;
}
/* Position the ribbon relative to the viewport. */
.github-fork-ribbon {
position: fixed;
}
/* Green background for the :before pseudo-element of the bottom-left ribbon. */
.github-fork-ribbon.left-bottom:before {
background-color: #080;
}
================================================
FILE: web/ocr-fileformat.js
================================================
/* globals $ */
/* globals Blob */
/* global Prism */
/**
 * Client object for the ocr-fileformat web API.
 *
 * @constructor
 * @param {string} endpoint Location of the server-side script; stored as
 *     `this.endpoint` and used as the base of every request URL.
 */
let OcrFileformatAPI = function OcrFileformatAPI(endpoint) {
this.endpoint = endpoint;
};
/**
 * Build the request URL for an API action.
 *
 * The action is sent as the `do` query parameter, followed by one query
 * parameter per own enumerable key of `params`. Names and values are
 * percent-encoded so that values which themselves contain `?`, `&`, `=` or
 * `#` (typically the `url` parameter) arrive at the server intact.
 *
 * @param {string} action Value of the `do` parameter.
 * @param {?Object} [params] Additional query parameters; null/undefined
 *     means "none".
 * @return {string} The endpoint with the query string appended.
 */
OcrFileformatAPI.prototype.urlFor = function urlFor(action, params) {
    const query = params || {};
    let url = this.endpoint + '?do=' + encodeURIComponent(action);
    for (const paramName of Object.keys(query)) {
        url += '&' + encodeURIComponent(paramName) +
            '=' + encodeURIComponent(query[paramName]);
    }
    return url;
};
/**
 * Ask the server for its format list (action 'list') and keep the answer on
 * `this.formats`.
 *
 * @param {function(*)} cb Invoked with the request's error value once the
 *     formats have been stored.
 */
OcrFileformatAPI.prototype.updateFormats = function updateFormats(cb) {
    this.request('list', null, null, (err, formats) => {
        this.formats = formats;
        cb(err);
    });
};
/**
 * Send a request to the API and report the outcome through a Node-style
 * callback.
 *
 * Without `formData` the request is a GET; with `formData` it is a POST whose
 * body is handed to jQuery untouched (`processData`/`contentType` disabled).
 *
 * @param {string} endpoint Action name, passed to `urlFor`.
 * @param {?Object} query Query parameters, passed to `urlFor`.
 * @param {?FormData} formData Optional request body.
 * @param {function(?string, *=)} cb Called as `cb(null, data)` on success and
 *     as `cb(message)` on failure. `message` is always non-empty so callers
 *     testing `if (err)` cannot mistake a failure without a response body
 *     for a success.
 */
OcrFileformatAPI.prototype.request = function request(endpoint, query, formData, cb) {
    const ajaxCall = {
        type: 'GET',
        // Build the URL from this instance rather than the window.api global.
        url: this.urlFor(endpoint, query),
        success: function(data) {
            cb(null, data);
        },
        error: function(xhr) {
            cb(xhr.responseText || xhr.statusText || 'Request failed');
        },
    };
    if (formData) {
        ajaxCall.type = 'POST';
        ajaxCall.data = formData;
        ajaxCall.processData = false;
        ajaxCall.contentType = false;
    }
    $.ajax(ajaxCall);
};
/**
 * Escape the characters that are significant in HTML markup so the string
 * can be inserted as text.
 *
 * @param {string} str Raw text.
 * @return {string} `str` with `&`, `<`, `>`, `"`, `'` and `/` replaced by
 *     character references.
 */
function escapeHTML(str) {
    const entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
        '/': '&#x2F;',
    };
    // Single pass, so the '&' introduced by one replacement is never
    // escaped a second time.
    return str.replace(/[&<>"'\/]/g, function(ch) {
        return entities[ch];
    });
}
/**
 * Synchronise the format drop-downs with `window.api.formats`.
 *
 * - Fills and enables `#transform-from` the first time it is called (while
 *   the select still holds only its placeholder option).
 * - Disables `#transform-to` while no input format is selected; otherwise
 *   replaces its options (keeping the placeholder) with the output formats
 *   available for the selected input format.
 * - Fills `#validate-format` the first time it is called.
 *
 * Format names are inserted with `.text()` so they are always treated as
 * text, never parsed as markup.
 */
function onChangeFormat() {
    const fromSelect = $("#transform-from");
    const toSelect = $("#transform-to");
    const validateSelect = $("#validate-format");

    if ($("#transform-from option").length === 1) {
        Object.keys(window.api.formats.transform).forEach(function(from) {
            fromSelect.append($("<option>").text(from));
        });
        fromSelect.removeAttr('disabled');
    }

    const selectedFrom = fromSelect.val();
    toSelect.attr('disabled', selectedFrom === null);
    if (selectedFrom) {
        // Keep the placeholder (first option), drop the previous choices.
        $("#transform-to option").slice(1).remove();
        window.api.formats.transform[selectedFrom].forEach(function(to) {
            toSelect.append($("<option>").text(to));
        });
    }

    if ($("#validate-format option").length === 1) {
        window.api.formats.validate.forEach(function(format) {
            validateSelect.append($("<option>").text(format));
        });
    }
}
/**
 * Send the active input of a page tab to the API and display the response.
 *
 * The active input is either the URL field (its value is sent as the `url`
 * query parameter) or the file field (its first file is POSTed as form field
 * `file`). While the request runs the spinner in the tab's button is shown.
 * On failure the error is shown as a notification; on success the response
 * is written into the tab's result box, highlighted, and the tab's download
 * button (if it has one) is wired to save the response as a file.
 *
 * The input value and output format used for the download file name are
 * captured when the request is sent, so changing the form afterwards cannot
 * mislabel an already displayed result.
 *
 * @param {string} tabName Id of the page tab and name of the API action
 *     ('transform' or 'validate').
 * @param {Object} params Query parameters for the action; not modified.
 */
function submit(tabName, params) {
    const pane = $("#" + tabName);
    const input = pane.find(".input .active input");
    const query = Object.assign({}, params);
    let formData;
    if (input.attr('type') === 'file') {
        formData = new FormData();
        formData.append('file', input.prop('files')[0]);
    } else {
        query.url = input.val();
    }
    const source = String(input.val());
    const outputFormat = query.to;
    const spinner = pane.find("button .spinning");

    spinner.removeClass('hidden');
    window.api.request(tabName, query, formData, function(err, data) {
        spinner.addClass('hidden');
        if (err) {
            return $.notify(err, 'error');
        }
        pane.find(".result a.download").off('click').on('click', () => {
            const basename = source
                .replace(/^.*\\/, '') // C:\fakepath\foo.hocr -> foo.hocr
                .replace(/^.*\//, '') // http://bla/foo.hocr?raw=true -> foo.hocr?raw=true
                .replace(/\?.*$/, ''); // foo.hocr?raw=true -> foo.hocr
            let extension = outputFormat + '.xml';
            let type = 'text/xml';
            if (outputFormat === 'text') {
                extension = 'text';
                type = 'text/plain';
            } else if (outputFormat === 'hocr') {
                extension = 'html';
                type = 'text/html';
            }
            const downloadUrl = window.URL.createObjectURL(new Blob([data], {type}));
            // A temporary, invisible link is the portable way to trigger a
            // download with a chosen file name.
            const dummyLink = document.createElement('a');
            dummyLink.setAttribute('download', `${basename}.${extension}`);
            dummyLink.href = downloadUrl;
            dummyLink.style.display = 'none';
            document.body.appendChild(dummyLink);
            dummyLink.click();
            document.body.removeChild(dummyLink);
            window.URL.revokeObjectURL(downloadUrl);
        });
        // .text() inserts the response as text, so no manual escaping is needed.
        pane.find('.result pre code').text(data);
        pane.find(".result").removeClass('hidden');
        Prism.highlightAll();
    });
}
/**
 * Enable the buttons of the active tab pane only when its active input has a
 * value and every format select in it has a selection; disable them
 * otherwise.
 */
function maybeEnableSubmit() {
    const activePane = $(".tab-pane.active");
    const hasInput = Boolean($(".input .active input", activePane).val());
    const formatSelects = $(".formats select", activePane);
    // The mapped collection is compared by size against the selects, so the
    // check only passes when each select contributed a value.
    const chosenFormats = formatSelects.map(function() {
        return $(this).val();
    });
    const ready = hasInput && formatSelects.length == chosenFormats.length;
    $("button", activePane).attr('disabled', !ready);
}
/**
 * Show the tab(s) addressed by the URL fragment.
 *
 * A fragment such as `#transform-file-tab` activates the page tab whose link
 * targets `#transform` (the part before the first dash) and then the tab
 * whose link targets the full fragment.
 *
 * Fragments that are empty or contain anything but word characters and
 * dashes are ignored: they cannot name a tab, and splicing them into the
 * attribute selector could produce an invalid selector.
 */
function hashRoute() {
    const hash = window.location.hash;
    if (!/^#[\w-]+$/.test(hash)) {
        return;
    }
    const pageTab = hash.replace(/-.*/, '');
    $("a[data-toggle='tab'][href='" + pageTab + "']").tab('show');
    $("a[data-toggle='tab'][href='" + hash + "']").tab('show');
}
// Page start-up: create the API client, load the format list and, once it is
// available, wire up all form and navigation handlers.
$(() => {
    $.notify.defaults({position: 'bottom right'});
    window.api = new OcrFileformatAPI('ocr-fileformat.php');
    const api = window.api;

    $.notify("Loading formats", 'info');
    api.updateFormats((err) => {
        if (err) {
            $.notify("Error loading formats", "error");
            return;
        }
        $.notify("Loaded formats", 'success');

        // Keep drop-downs and submit buttons in sync with the form state.
        $("#transform-from").on('change', onChangeFormat);
        $("a").on('shown.bs.tab', maybeEnableSubmit);
        $(":input").on('input change', maybeEnableSubmit);
        $(".tab-pane").on('shown.bs.tab', maybeEnableSubmit);

        // Submit buttons.
        $("#validate-submit").on('click', () => {
            const format = $("#validate-format").val();
            submit('validate', {format: format});
        });
        $("#transform-submit").on('click', () => {
            const from = $("#transform-from").val();
            const to = $("#transform-to").val();
            submit('transform', {from: from, to: to});
        });

        // Mirror the selected tab in the URL fragment and follow fragment changes.
        $("a[data-toggle='tab']").on('click tap', function() {
            window.location.hash = $(this).attr('href');
        });
        $(window).on('hashchange', hashRoute);

        onChangeFormat();
        hashRoute();
    });
});
/* vim: set sw=4 : */
================================================
FILE: web/ocr-fileformat.php
================================================
<?php
// To hide the config
define('IncludingScript', TRUE);
$config = include('config.php');
/**
 * Reply with HTTP status 400 and a plain-text message.
 *
 * @param string $msg Text written as the response body.
 */
function send400($msg)
{
    // The third argument of header() sets the response status code.
    header("Content-Type: text/plain", true, 400);
    print $msg;
}
/**
 * Emit a value as the JSON response body.
 *
 * @param mixed $data Value to serialise with json_encode().
 */
function sendJSON($data)
{
    header("Content-Type: application/json");
    $payload = json_encode($data);
    print $payload;
}
/**
 * Run a shell command, feed it data on stdin and collect its output.
 *
 * The command is started with TERM=dumb. Its stderr is redirected into a
 * temporary file instead of a pipe, so a command that writes a lot of
 * diagnostics cannot block on a full stderr pipe while this function is
 * still draining stdout.
 *
 * NOTE(review): the input is written completely before stdout is read. A
 * command that emits large amounts of output before it has consumed its
 * input could still stall; confirm this cannot happen for the configured
 * commands.
 *
 * @param string $cmd Shell command line to execute.
 * @param string $xml Data written to the command's stdin.
 *
 * @return array Always an array with the keys 'stdout' (string), 'stderr'
 *               (string) and 'exitcode' (int, -1 if the command could not be
 *               started). On start-up failure 'stdout' is empty and 'stderr'
 *               carries a short explanation.
 */
function pipeToCommand($cmd, $xml)
{
    $ret = array('stdout' => '', 'stderr' => '', 'exitcode' => -1);

    $errStream = tmpfile();
    if ($errStream === false) {
        $ret['stderr'] = "Could not create temporary file for command output";
        return $ret;
    }
    $descriptorspec = array(
        0 => array("pipe", "r"),
        1 => array("pipe", "w"),
        2 => $errStream,
    );
    $process = proc_open("TERM=dumb " . $cmd, $descriptorspec, $pipes);
    if ($process === false) {
        fclose($errStream);
        $ret['stderr'] = "Could not start command";
        return $ret;
    }

    fwrite($pipes[0], $xml);
    fclose($pipes[0]);
    $ret['stdout'] = (string) stream_get_contents($pipes[1]);
    fclose($pipes[1]);
    $ret['exitcode'] = proc_close($process);

    // The child has exited; read back what it wrote to stderr.
    rewind($errStream);
    $ret['stderr'] = (string) stream_get_contents($errStream);
    fclose($errStream);

    return $ret;
}
/**
 * Transform from one format to another, fetching the data by URL
 *
 * Answers with send400() unless $config['formats']['transform'] has an entry
 * for $from that contains $to, or when file_get_contents($url) yields a
 * falsy result. Otherwise the data is piped through the configured
 * 'ocr-transform' command and the command's stdout is echoed. The command's
 * stderr is collected by pipeToCommand() but not sent to the client.
 *
 * @param string $url  Location handed to file_get_contents().
 * @param string $from Input format name.
 * @param string $to   Output format name.
 */
function transform($url, $from, $to)
{
global $config;
if (!array_key_exists($from, $config['formats']['transform'])
|| !in_array($to, $config['formats']['transform'][$from])) {
send400("No such transformation '$from -> $to'");
return;
}
$xml = file_get_contents($url);
// NOTE(review): a falsy check also rejects content that is empty or "0".
if (!$xml) {
send400("Could not retrieve URL '$url'");
return;
}
// NOTE(review): "." binds tighter than "===" and "?:", so this evaluates as
// (("Content-Type: " . $to) === "html") ? "text/html" : "application/xml".
// The comparison can never be true, so header() always receives the bare
// string "application/xml" without the "Content-Type: " prefix. Presumably
// the intent was "Content-Type: " . ($to === "html" ? ... : ...). Before
// correcting it, confirm the web client still receives the body as a string
// once an XML content type is really sent.
header("Content-Type: " . $to === "html" ? "text/html" : "application/xml");
// $from and $to were checked against the configured format list above
// before being placed on the command line.
$res = pipeToCommand($config['ocr-transform'] . " -d '$from' '$to' - -- '!indent=yes'", $xml);
echo $res['stdout'];
}
/**
 * Validate a document against the schema of a format.
 *
 * The document is read with file_get_contents() from $url and piped through
 * the configured 'ocr-validate' command; the command's stdout and stderr are
 * returned to the client as plain text.
 *
 * @param string $url    Location handed to file_get_contents().
 * @param string $format Format name; must be listed in
 *                       $config['formats']['validate'].
 */
function validate($url, $format)
{
    global $config;
    if (!in_array($format, $config['formats']['validate'])) {
        return send400("No validator for '$format'");
    }
    $xml = file_get_contents($url);
    if (!$xml) {
        return send400("Could not retrieve URL '$url'");
    }
    // The format is already restricted to the configured list; quoting it
    // keeps the command line well-formed whatever characters a name contains.
    $res = pipeToCommand($config['ocr-validate'] . " " . escapeshellarg($format) . " -", $xml);
    header("Content-Type: text/plain");
    if (!is_array($res)) {
        // pipeToCommand() produced no result, i.e. the validator did not run.
        http_response_code(500);
        echo "Could not run validator for '$format'";
        return;
    }
    echo $res['stdout'];
    echo $res['stderr'];
}
/**
 * Handle request
 *
 * The action is selected by the 'do' query parameter ('list', 'transform' or
 * 'validate'). For 'transform' and 'validate' the document comes either from
 * an uploaded file (POST field 'file', which takes precedence) or from the
 * 'url' query parameter.
 *
 * Only http:// and https:// URLs are accepted for 'url': the value ends up in
 * file_get_contents(), which would otherwise also read local paths and PHP
 * stream wrappers chosen by the client.
 */
$stringParam = function ($name) {
    // Missing or non-string (e.g. array) parameters count as empty.
    return isset($_GET[$name]) && is_string($_GET[$name]) ? $_GET[$name] : '';
};

$source = null;
$sourceError = "Must be either POST with file field 'file' or GET with param 'url'.";
if (array_key_exists('file', $_FILES)) {
    $upload = $_FILES['file'];
    if ($upload['error'] === UPLOAD_ERR_OK && is_uploaded_file($upload['tmp_name'])) {
        $source = $upload['tmp_name'];
    } else {
        $sourceError = "File upload failed.";
    }
} elseif ($stringParam('url') !== '') {
    if (preg_match('#^https?://#i', $stringParam('url'))) {
        $source = $stringParam('url');
    } else {
        $sourceError = "Parameter 'url' must be an http:// or https:// URL.";
    }
}

switch ($stringParam('do')) {
    case 'list':
        sendJSON($config['formats']);
        break;
    case 'transform':
        if ($source === null) {
            send400($sourceError);
            break;
        }
        transform($source, $stringParam('from'), $stringParam('to'));
        break;
    case 'validate':
        if ($source === null) {
            send400($sourceError);
            break;
        }
        validate($source, $stringParam('format'));
        break;
    default:
        send400("Unknown/missing action, set 'do' parameter to either 'validate' or 'transform'");
        break;
}
================================================
FILE: xsd/.gitignore
================================================
*.xsd
================================================
FILE: xslt/.gitignore
================================================
*.xml
*.xsl
!alto2.0__alto3.0.xsl
!page__text.xsl
!tei__hocr.xsl
================================================
FILE: xslt/alto2.0__alto3.0.xsl
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!-- https://github.com/altoxml/documentation/issues/1#issuecomment-219671094 -->
<!--
Moves a document from the ALTO v2 namespace to the ALTO v3 namespace.
Elements in the v2 namespace are re-created under the same local name in the
v3 namespace, the xsi:schemaLocation attribute is pointed at the ALTO 3.0
schema, and all other attributes and nodes are copied unchanged.
-->
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="1.0"
xmlns:v2="http://www.loc.gov/standards/alto/ns-v2#"
xmlns:v3="http://www.loc.gov/standards/alto/ns-v3#">
<!-- identity rule: copy every attribute and node that no other template matches -->
<xsl:template match="@* | node()">
<xsl:copy>
<xsl:apply-templates select="@* | node()"/>
</xsl:copy>
</xsl:template>
<!-- replace xsi:schemaLocation attribute -->
<xsl:template match="@xsi:schemaLocation">
<xsl:attribute name="xsi:schemaLocation">http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/standards/alto/v3/alto-3-0.xsd</xsl:attribute>
</xsl:template>
<!-- replace namespace: rebuild each v2 element in the v3 namespace, then process its attributes and children -->
<xsl:template match="v2:*">
<xsl:element name="{local-name()}" namespace="http://www.loc.gov/standards/alto/ns-v3#">
<xsl:apply-templates select="@* | node()"/>
</xsl:element>
</xsl:template>
</xsl:stylesheet>
================================================
FILE: xslt/page__text.xsl
================================================
<xsl:stylesheet
version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
<!-- rid of xml syntax: -->
<xsl:output
method="text"
standalone="yes"
omit-xml-declaration="yes"/>
<!-- copy text element verbatim: -->
<xsl:variable name="newline"><xsl:text>
</xsl:text>
</xsl:variable>
<!-- paragraph break -->
<xsl:param name="pb" select="concat($newline,$newline)"/>
<!-- line break -->
<xsl:param name="lb" select="$newline"/>
<!-- text order: by element or by explicit ReadingOrder (reading-order|document) -->
<xsl:param name="order" select="'reading-order'"/>
<!-- hierarchy level to extract text annotation from (region|line|word|glyph|highest) -->
<xsl:param name="level" select="'highest'"/>
<!-- use key mechanism for IDREFs, because XSD does not support id mechanism -->
<xsl:key name="textRegion" match="pc:TextRegion" use="@id"/>
<!--
Entry point for a page: writes the text of its regions.
If $order starts with 'reading-order' and the page's ReadingOrder contains at
least one regionRef/regionRefIndexed reference, the regions are emitted in
the referenced order (template "getrefs"). Otherwise every TextRegion of the
document is emitted in document order, each followed by the paragraph
break $pb.
-->
<xsl:template match="pc:PcGts/pc:Page">
<xsl:variable name="regions" select="//pc:TextRegion"/>
<xsl:choose>
<xsl:when test="starts-with($order, 'reading-order') and pc:ReadingOrder//*[@regionRef|@regionRefIndexed]">
<xsl:call-template name="getrefs">
<xsl:with-param name="group" select="pc:ReadingOrder/*"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:for-each select="$regions">
<xsl:call-template name="getlines">
<xsl:with-param name="region" select="."/>
</xsl:call-template>
<xsl:value-of select="$pb"/>
</xsl:for-each>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<!--
Writes the text of one region ($region).
At each hierarchy level (region, line, word) the first TextEquiv/Unicode of
that element is written if $level names that level, or if $level is 'highest'
and the element has a TextEquiv/Unicode; otherwise the template descends to
the next lower level. Glyphs are the lowest level and are always written.
Lines are separated by $lb, words by a single space, glyphs by nothing.
-->
<xsl:template name="getlines">
<xsl:param name="region"/>
<xsl:choose>
<xsl:when test="$level='region' or $level='highest' and $region/pc:TextEquiv/pc:Unicode">
<xsl:value-of select="$region/pc:TextEquiv[1]/pc:Unicode" disable-output-escaping="yes"/>
</xsl:when>
<xsl:otherwise>
<xsl:for-each select="$region/pc:TextLine">
<!-- separator goes before every line except the first -->
<xsl:if test="position()>1">
<xsl:value-of select="$lb"/>
</xsl:if>
<xsl:choose>
<xsl:when test="$level='line' or $level='highest' and pc:TextEquiv/pc:Unicode">
<xsl:value-of select="pc:TextEquiv[1]/pc:Unicode" disable-output-escaping="yes"/>
</xsl:when>
<xsl:otherwise>
<xsl:for-each select="pc:Word">
<!-- separator goes before every word except the first -->
<xsl:if test="position()>1">
<xsl:text> </xsl:text>
</xsl:if>
<xsl:choose>
<xsl:when test="$level='word' or $level='highest' and pc:TextEquiv/pc:Unicode">
<xsl:value-of select="pc:TextEquiv[1]/pc:Unicode" disable-output-escaping="yes"/>
</xsl:when>
<xsl:otherwise>
<xsl:for-each select="pc:Glyph">
<xsl:value-of select="pc:TextEquiv[1]/pc:Unicode" disable-output-escaping="yes"/>
</xsl:for-each>
</xsl:otherwise>
</xsl:choose> <!-- word level? -->
</xsl:for-each>
</xsl:otherwise>
</xsl:choose> <!-- line level? -->
</xsl:for-each>
</xsl:otherwise>
</xsl:choose> <!-- region level? -->
</xsl:template>
<!--
Walks the children of a ReadingOrder group ($group), sorted numerically by
their index attribute. A child whose regionRef/regionRefIndexed resolves to a
TextRegion (via the "textRegion" key) has that region written with
"getlines", followed by the paragraph break $pb. A child whose element name
contains 'Group' is processed recursively.
-->
<xsl:template name="getrefs">
<xsl:param name="group"/>
<xsl:for-each select="$group/*">
<xsl:sort select="@index" data-type="number"/>
<!--<xsl:variable name="region" select="id(@regionRef|@regionRefIndexed)"/>-->
<xsl:variable name="region" select="key('textRegion', @regionRef|@regionRefIndexed)"/>
<xsl:if test="$region">
<xsl:call-template name="getlines">
<xsl:with-param name="region" select="$region"/>
</xsl:call-template>
<xsl:value-of select="$pb"/>
</xsl:if>
<!-- UnorderedGroup(Indexed) and OrderedGroup(Indexed): recurse -->
<xsl:if test="contains(local-name(.), 'Group')">
<xsl:call-template name="getrefs">
<xsl:with-param name="group" select="."/>
</xsl:call-template>
</xsl:if>
</xsl:for-each>
</xsl:template>
<!-- override the built-in rule that copies text nodes to the output: text reached by default processing is dropped, so only the explicit value-of instructions above produce output -->
<xsl:template match="text()"/>
</xsl:stylesheet>
================================================
FILE: xslt/tei__hocr.xsl
================================================
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:util="http://example/com/util/namespace"
version="2.0"
exclude-result-prefixes="xsl util"
xmlns="http://www.w3.org/1999/xhtml">
<xsl:output method="html" encoding="UTF-8" indent="yes"
omit-xml-declaration="yes" />
<xsl:param name="docTitle" select="'document_name'"/>
<xsl:param name="langs" select="'de'"/>
<xsl:param name="npages" select="1"/>
<xsl:param name="scripts" select="'Latg'"/>
<xsl:param name="system" select="'unknown'"/>
<xsl:param name="left" select="-1"/>
<xsl:param name="top" select="-1"/>
<xsl:param name="width" select="-1"/>
<xsl:param name="height" select="-1"/>
<!-- converts comma-separated to space-separated coordinates: every ',' in $coords is replaced by a single space -->
<xsl:function name="util:coords">
<xsl:param name="coords" />
<xsl:value-of select="replace($coords, ',', ' ')" />
</xsl:function>
<!--
calculates bounding box of all nodes with attribute 'function'

$nodes: the nodes whose comma-separated 'function' attribute is evaluated.
Returns four space-separated values. For each position the value is either
the corresponding stylesheet parameter ($left, $top, $width, $height) when it
is not -1, or is derived from the nodes: the smallest 1st component, the
smallest 2nd component, the largest 3rd component and the largest 4th
component. The extreme value is found by sorting numerically and taking the
first node. The text nodes emitted inside the loops only act as separators;
surplus whitespace is removed by normalize-space() at the end.
-->
<xsl:function name="util:get-pagebox">
<xsl:param name="nodes" />
<xsl:variable name="bbox">
<!-- 1st value: $left, or the minimum of the 1st components -->
<xsl:choose>
<xsl:when test="$left = -1">
<xsl:for-each select="$nodes">
<xsl:sort select="tokenize(./@function, ',')[1]" data-type="number" order="ascending" />
<xsl:if test="position() = 1">
<xsl:value-of select="tokenize(./@function, ',')[1]" />
</xsl:if>
<xsl:text> </xsl:text>
</xsl:for-each>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$left" />
<xsl:text> </xsl:text>
</xsl:otherwise>
</xsl:choose>
<!-- 2nd value: $top, or the minimum of the 2nd components -->
<xsl:choose>
<xsl:when test="$top = -1">
<xsl:for-each select="$nodes">
<xsl:sort select="tokenize(./@function, ',')[2]" data-type="number" order="ascending" />
<xsl:if test="position() = 1">
<xsl:value-of select="tokenize(./@function, ',')[2]" />
</xsl:if>
<xsl:text> </xsl:text>
</xsl:for-each>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$top" />
<xsl:text> </xsl:text>
</xsl:otherwise>
</xsl:choose>
<!-- 3rd value: $width, or the maximum of the 3rd components -->
<xsl:choose>
<xsl:when test="$width = -1">
<xsl:for-each select="$nodes">
<xsl:sort select="tokenize(./@function, ',')[3]" data-type="number" order="descending" />
<xsl:if test="position() = 1">
<xsl:value-of select="tokenize(./@function, ',')[3]" />
</xsl:if>
<xsl:text> </xsl:text>
</xsl:for-each>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$width" />
<xsl:text> </xsl:text>
</xsl:otherwise>
</xsl:choose>
<!-- 4th value: $height, or the maximum of the 4th components -->
<xsl:choose>
<xsl:when test="$height = -1">
<xsl:for-each select="$nodes">
<xsl:sort select="tokenize(./@function, ',')[4]" data-type="number" order="descending" />
<xsl:if test="position() = 1">
<xsl:value-of select="tokenize(./@function, ',')[4]" />
</xsl:if>
<xsl:text> </xsl:text>
</xsl:for-each>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$height" />
<xsl:text> </xsl:text>
</xsl:otherwise>
</xsl:choose>
</xsl:variable>
<xsl:value-of select="normalize-space($bbox)" />
</xsl:function>
<!-- Start of transformation: writes the html element with the hOCR metadata head (title and ocr-* meta tags filled from the stylesheet parameters), then processes every 'text' element of the input -->
<xsl:template match="/">
<html>
<head>
<title>
<xsl:value-of select="$docTitle" />
</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="ocr-system" content="{$system}" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
<meta name="ocr-langs" content="{$langs}" />
<meta name="ocr-number-of-pages" content="{$npages}" />
<meta name="ocr-scripts" content="{$scripts}" />
</head>
<xsl:apply-templates select=".//text" />
</html>
</xsl:template>
<!-- Text: becomes the body element; only the 'milestone' descendants are processed from here -->
<xsl:template match="text">
<body>
<xsl:apply-templates select=".//milestone" />
</body>
</xsl:template>
<!--
Page: a milestone of type 'page' becomes an ocr_page div that wraps a single
ocr_carea div. Both carry the bounding box computed by util:get-pagebox over
all elements that have a 'function' attribute. The page number is taken from
the milestone's 'n' attribute; ppageno is that number minus one.

The title attribute is delimited with single quotes because its value
contains literal double quotes around the image name.

NOTE(review): the bounding box and the p/figure selection below are taken
from the whole document, not only from the content that belongs to this
milestone. Confirm that inputs are expected to hold a single page.
-->
<xsl:template match="milestone[@type='page']">
  <xsl:variable name="pageno" select="@n" />
  <xsl:variable name="pagenodes" select="//*[@function]" />
  <xsl:variable name="pagebox" select="util:get-pagebox($pagenodes)" />
  <div class="ocr_page" id="page_{$pageno}" title='image "{$docTitle}"; bbox {$pagebox}; ppageno {$pageno - 1}'>
    <div class="ocr_carea" id="block_{$pageno}" title="bbox {$pagebox}">
      <xsl:apply-templates select="//p|//figure" />
    </div>
  </div>
</xsl:template>
<!-- Paragraph: becomes an ocr_par paragraph that keeps the source id; only its direct 'w' children are processed -->
<xsl:template match="p">
<xsl:variable name="pid" select="@id" />
<p class="ocr_par" id="{$pid}">
<xsl:apply-templates select="./w" />
</p>
</xsl:template>
<!-- Word: becomes an ocrx_word span; the bbox in its title is the 'function' attribute with commas turned into spaces, the content is the word's text -->
<xsl:template match="w">
<xsl:variable name="bbox" select="util:coords(@function)" />
<span class="ocrx_word" title="bbox {$bbox}">
<xsl:value-of select="text()" />
</span>
</xsl:template>
<!-- Figure: becomes an empty ocr_float div whose title carries the bbox taken from the 'function' attribute -->
<xsl:template match="figure">
<xsl:variable name="bbox" select="util:coords(@function)" />
<div class="ocr_float" title="bbox {$bbox}" />
</xsl:template>
<!-- Unmatched Elements: any element without a more specific template triggers a non-terminating warning message naming the element, then its children are processed -->
<xsl:template match="*">
<xsl:message terminate="no">
WARNING: Unmatched element: <xsl:value-of select="name()"/>
</xsl:message>
<xsl:apply-templates/>
</xsl:template>
</xsl:stylesheet>
gitextract_kh4rpdh8/
├── .dockerignore
├── .eslintrc.google.js
├── .eslintrc.js
├── .github/
│ └── workflows/
│ ├── ci.yml
│ └── codeql.yml
├── .gitignore
├── .gitmodules
├── .zipignore
├── CITATION.cff
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── bin/
│ ├── ocr-transform.sh
│ └── ocr-validate.sh
├── docker.config.php
├── example/
│ ├── .gitignore
│ ├── Makefile
│ └── README.md
├── lib.sh
├── script/
│ ├── transform/
│ │ ├── README.md
│ │ ├── alto__page
│ │ ├── gcv__alto
│ │ ├── gcv__hocr
│ │ ├── gcv__page
│ │ ├── page__alto
│ │ ├── page__alto_legacy
│ │ └── textract__page
│ └── validate/
│ ├── README.md
│ └── hocr
├── vendor/
│ └── Makefile
├── web/
│ ├── config.php
│ ├── index.html
│ ├── ocr-fileformat.css
│ ├── ocr-fileformat.js
│ └── ocr-fileformat.php
├── xsd/
│ └── .gitignore
└── xslt/
├── .gitignore
├── alto2.0__alto3.0.xsl
├── page__text.xsl
└── tei__hocr.xsl
SYMBOL INDEX (11 symbols across 3 files)
FILE: web/config.php
function buildFormatList (line 27) | function buildFormatList()
FILE: web/ocr-fileformat.js
function escapeHTML (line 46) | function escapeHTML(str) {
function onChangeFormat (line 56) | function onChangeFormat() {
function submit (line 78) | function submit(tabName, params) {
function maybeEnableSubmit (line 125) | function maybeEnableSubmit() {
function hashRoute (line 133) | function hashRoute() {
FILE: web/ocr-fileformat.php
function send400 (line 12) | function send400($msg)
function sendJSON (line 22) | function sendJSON($data)
function pipeToCommand (line 31) | function pipeToCommand($cmd, $xml)
function transform (line 55) | function transform($url, $from, $to)
function validate (line 76) | function validate($url, $format)
Condensed preview — 41 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (92K chars).
[
{
"path": ".dockerignore",
"chars": 158,
"preview": "Dockerfile\nexample\ntest\nREADME.md\nxsd\n\nxslt\n!xslt/alto2.0__alto3.0.xsl\n!xslt/page__text.xsl\n!xslt/tei__hocr.xsl\n\nvendor/"
},
{
"path": ".eslintrc.google.js",
"chars": 11489,
"preview": "/**\n * Copyright 2016 Google Inc. All rights reserved.\n *\n * Licensed under the Apache License, Version 2.0 (the \"Licens"
},
{
"path": ".eslintrc.js",
"chars": 1503,
"preview": "module.exports = {\n extends: './.eslintrc.google.js',\n parserOptions: {\n \"ecmaVersion\": 2017,\n \"sour"
},
{
"path": ".github/workflows/ci.yml",
"chars": 586,
"preview": "name: Continuous Integration\n\n# Continuous integration test for ocr-fileformat.\n\non:\n # pull_request:\n # push:\n # sch"
},
{
"path": ".github/workflows/codeql.yml",
"chars": 842,
"preview": "name: \"CodeQL\"\n\non:\n push:\n branches: [ \"master\" ]\n pull_request:\n branches: [ \"master\" ]\n schedule:\n - cron"
},
{
"path": ".gitignore",
"chars": 68,
"preview": "/Saxon*\n*.jar\n/*.alto\nvendor/*\n!vendor/Makefile\nocr-fileformat_*\n*~\n"
},
{
"path": ".gitmodules",
"chars": 1003,
"preview": "[submodule \"vendor/alto-schema\"]\n\tpath = vendor/alto-schema\n\turl = https://github.com/altoxml/schema.git\n[submodule \"ven"
},
{
"path": ".zipignore",
"chars": 64,
"preview": ".git\n.zipignore\n.gitignore\nexample\nocr-fileformat_*\n*.pdf\n*.zip\n"
},
{
"path": "CITATION.cff",
"chars": 1230,
"preview": "# This CITATION.cff file was generated with cffinit.\n# Visit https://bit.ly/cffinit to generate yours today!\n\ncff-versio"
},
{
"path": "Dockerfile",
"chars": 779,
"preview": "FROM alpine:edge\n\nEXPOSE 8080\nCOPY . /ocr-fileformat\nWORKDIR /ocr-fileformat\nRUN apk add --no-cache openjdk8-jre php7 ph"
},
{
"path": "LICENSE",
"chars": 1098,
"preview": "The MIT License (MIT)\n\nCopyright (c) 2016 Universitätsbibliothek Mannheim\n\nPermission is hereby granted, free of charge,"
},
{
"path": "Makefile",
"chars": 4968,
"preview": "# Makefile for ocr-fileformat\n\nPKG_NAME = ocr-fileformat\nPKG_VERSION = 0.5.0\nDOCKER_IMAGE = ubma/ocr-fileformat\n\n# Eithe"
},
{
"path": "README.md",
"chars": 9998,
"preview": "# ocr-fileformat\n\n[](https:/"
},
{
"path": "bin/ocr-transform.sh",
"chars": 3881,
"preview": "#!/usr/bin/env bash\n\n# Default to the parent dir of this script. Overwritten by `make install`\nSHAREDIR=\"$(readlink -f \""
},
{
"path": "bin/ocr-validate.sh",
"chars": 2058,
"preview": "#!/usr/bin/env bash\n\n# Default to the parent dir of this script. Overwritten by `make install`\nSHAREDIR=\"$(readlink \"$(d"
},
{
"path": "docker.config.php",
"chars": 122,
"preview": "<?php\n$config['ocr-validate'] = '/usr/local/bin/ocr-validate';\n$config['ocr-transform'] = '/usr/local/bin/ocr-transform'"
},
{
"path": "example/.gitignore",
"chars": 119,
"preview": "wetzel_reisebegleiter_1901_0021*.alto\nwetzel_reisebegleiter_1901_0021*.hocr\nwetzel_reisebegleiter_1901_0021*.page\n/out\n"
},
{
"path": "example/Makefile",
"chars": 1484,
"preview": "# https://media.dwds.de/dta/images/wetzel_reisebegleiter_1901/wetzel_reisebegleiter_1901_0021_800px.jpg\nBOOK=wetzel_reis"
},
{
"path": "example/README.md",
"chars": 604,
"preview": "# Testing transformations\n\nInstall dependencies. For Debian/Ubuntu:\n\n make deps\n\nRun a roundtrip example:\n\n make r"
},
{
"path": "lib.sh",
"chars": 3662,
"preview": "#!/usr/bin/env bash\n\n#{{{ Logging\nif [[ -n \"$COLORTERM\" || \"$TERM\" = *color* || \"$TERM\" = xterm* ]];then\n COLOR_ERROR"
},
{
"path": "script/transform/README.md",
"chars": 276,
"preview": "Scripts should be named `<from>__<to>`, e.g. `hocr-1.0__abbby-10`.\n\nWill be called as\n\n```\n/script/transform/<from>__<to"
},
{
"path": "script/transform/alto__page",
"chars": 699,
"preview": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nJ"
},
{
"path": "script/transform/gcv__alto",
"chars": 698,
"preview": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nJ"
},
{
"path": "script/transform/gcv__hocr",
"chars": 554,
"preview": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nV"
},
{
"path": "script/transform/gcv__page",
"chars": 700,
"preview": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nJ"
},
{
"path": "script/transform/page__alto",
"chars": 540,
"preview": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nI"
},
{
"path": "script/transform/page__alto_legacy",
"chars": 697,
"preview": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nJ"
},
{
"path": "script/transform/textract__page",
"chars": 559,
"preview": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nI"
},
{
"path": "script/validate/README.md",
"chars": 129,
"preview": "Scripts here will be called by `ocr-validate`.\n\nName should be the format and version, lowercase letters, numbers and da"
},
{
"path": "script/validate/hocr",
"chars": 272,
"preview": "#!/bin/bash\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nHOCR_SPEC=\"$SCRIPTDIR/../../vendor/hocr-spec-pytho"
},
{
"path": "vendor/Makefile",
"chars": 3051,
"preview": "MKDIR = mkdir -p\nRM = rm -rfv\nUNZIP = unzip -o\nWGET = wget --progress=bar:force --no-verbose\nPIP = pip3\n\nSAXON_HE_VERSIO"
},
{
"path": "web/config.php",
"chars": 1234,
"preview": "<?php\nif (!defined('IncludingScript')) {\n die('Direct access not permitted');\n}\n\n// We don't want ANSI coloring.\npute"
},
{
"path": "web/index.html",
"chars": 9889,
"preview": "<!doctype HTML>\n<html lang=\"en\">\n <head>\n <link href=\"https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap."
},
{
"path": "web/ocr-fileformat.css",
"chars": 500,
"preview": ".glyphicon.spinning {\n animation: spin 1s infinite linear;\n -webkit-animation: spin2 1s infinite linear;\n}\n\n@keyfr"
},
{
"path": "web/ocr-fileformat.js",
"chars": 6026,
"preview": "/* globals $ */\n/* globals Blob */\n/* global Prism */\n\nlet OcrFileformatAPI = function OcrFileformatAPI(endpoint) {\n "
},
{
"path": "web/ocr-fileformat.php",
"chars": 2949,
"preview": "<?php\n\n// To hide the config\ndefine('IncludingScript', TRUE);\n\n$config = include('config.php');\n\n\n/**\n * Send a Malforme"
},
{
"path": "xsd/.gitignore",
"chars": 6,
"preview": "*.xsd\n"
},
{
"path": "xslt/.gitignore",
"chars": 65,
"preview": "*.xml\n*.xsl\n!alto2.0__alto3.0.xsl\n!page__text.xsl\n!tei__hocr.xsl\n"
},
{
"path": "xslt/alto2.0__alto3.0.xsl",
"chars": 1056,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!-- https://github.com/altoxml/documentation/issues/1#issuecomment-219671094 -->"
},
{
"path": "xslt/page__text.xsl",
"chars": 4326,
"preview": "<xsl:stylesheet\n version=\"1.0\"\n xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n xmlns:pc=\"http://schema.prima"
},
{
"path": "xslt/tei__hocr.xsl",
"chars": 5675,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<xsl:stylesheet\n xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n xmlns:util=\""
}
]
About this extraction
This page contains the full source code of the UB-Mannheim/ocr-fileformat GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 41 files (83.6 KB), approximately 26.5k tokens, and a symbol index with 11 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.