Repository: UB-Mannheim/ocr-fileformat Branch: master Commit: 69a917e4db9a Files: 41 Total size: 83.6 KB Directory structure: gitextract_kh4rpdh8/ ├── .dockerignore ├── .eslintrc.google.js ├── .eslintrc.js ├── .github/ │ └── workflows/ │ ├── ci.yml │ └── codeql.yml ├── .gitignore ├── .gitmodules ├── .zipignore ├── CITATION.cff ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── bin/ │ ├── ocr-transform.sh │ └── ocr-validate.sh ├── docker.config.php ├── example/ │ ├── .gitignore │ ├── Makefile │ └── README.md ├── lib.sh ├── script/ │ ├── transform/ │ │ ├── README.md │ │ ├── alto__page │ │ ├── gcv__alto │ │ ├── gcv__hocr │ │ ├── gcv__page │ │ ├── page__alto │ │ ├── page__alto_legacy │ │ └── textract__page │ └── validate/ │ ├── README.md │ └── hocr ├── vendor/ │ └── Makefile ├── web/ │ ├── config.php │ ├── index.html │ ├── ocr-fileformat.css │ ├── ocr-fileformat.js │ └── ocr-fileformat.php ├── xsd/ │ └── .gitignore └── xslt/ ├── .gitignore ├── alto2.0__alto3.0.xsl ├── page__text.xsl └── tei__hocr.xsl ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ Dockerfile example test README.md xsd xslt !xslt/alto2.0__alto3.0.xsl !xslt/page__text.xsl !xslt/tei__hocr.xsl vendor/* !vendor/Makefile !vendor/saxon*.jar ================================================ FILE: .eslintrc.google.js ================================================ /** * Copyright 2016 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ 'use strict'; module.exports = { rules: { // The rules below are listed in the order they appear on the eslint // rules page. All rules are listed to make it easier to keep in sync // as new ESLint rules are added. // http://eslint.org/docs/rules/ // - Rules in the `eslint:recommended` ruleset that aren't specifically // mentioned by the google styleguide are listed but commented out (so // they don't override a base ruleset). // - Rules that are recommended but contradict the Google styleguide // are explicitely set to the Google styleguide value. // Possible Errors // http://eslint.org/docs/rules/#possible-errors // --------------------------------------------- // 'for-direction': 0, // 'no-await-in-loop': 0, // 'no-compare-neg-zero': 2, // eslint:recommended 'no-cond-assign': 0, // eslint:recommended // 'no-console': 2, // eslint:recommended // 'no-constant-condition': 2, // eslint:recommended // 'no-control-regex': 2, // eslint:recommended // 'no-debugger': 2, // eslint:recommended // 'no-dupe-args': 2, // eslint:recommended // 'no-dupe-keys': 2, // eslint:recommended // 'no-duplicate-case': 2, // eslint:recommended // 'no-empty': 2, // eslint:recommended // 'no-empty-character-class': 2, // eslint:recommended // 'no-ex-assign': 2, // eslint:recommended // 'no-extra-boolean-cast': 2, // eslint:recommended // 'no-extra-parens': 0, // 'no-extra-semi': 2, // eslint:recommended // 'no-func-assign': 2, // eslint:recommended // 'no-inner-declarations': 2, // eslint:recommended // 'no-invalid-regexp': 2, // eslint:recommended 'no-irregular-whitespace': 2, // 
eslint:recommended // 'no-obj-calls': 2, // eslint:recommended // 'no-prototype-builtins': 0, // 'no-regex-spaces': 2, // eslint:recommended // 'no-sparse-arrays': 2, // eslint:recommended // 'no-template-curly-in-string': 0, 'no-unexpected-multiline': 2, // eslint:recommended // 'no-unreachable': 2, // eslint:recommended // 'no-unsafe-finally': 2, // eslint:recommended // 'no-unsafe-negation': 0, // 'use-isnan': 2 // eslint:recommended 'valid-jsdoc': [2, { requireParamDescription: false, requireReturnDescription: false, requireReturn: false, prefer: {returns: 'return'}, }], // 'valid-typeof': 2 // eslint:recommended // Best Practices // http://eslint.org/docs/rules/#best-practices // -------------------------------------------- // 'accessor-pairs': 0, // 'array-callback-return': 0, // 'block-scoped-var': 0, // 'class-methods-use-this': 0, // 'complexity': 0, // 'consistent-return': 0 'curly': [2, 'multi-line'], // TODO(philipwalton): add an option to enforce // braces with the exception of simple, // single-line if statements. 
// 'default-case': 0, // 'dot-location': 0, // 'dot-notation': 0, // 'eqeqeq': 0, 'guard-for-in': 2, // 'no-alert': 0, 'no-caller': 2, // 'no-case-declarations': 2, // eslint:recommended // 'no-div-regex': 0, // 'no-else-return': 0, // 'no-empty-function': 0, // 'no-empty-pattern': 2, // eslint:recommended // 'no-eq-null': 0, // 'no-eval': 0, 'no-extend-native': 2, 'no-extra-bind': 2, // 'no-extra-label': 0, // 'no-fallthrough': 2, // eslint:recommended // 'no-floating-decimal': 0, // 'no-global-assign': 0, // 'no-implicit-coercion': 0, // 'no-implicit-globals': 0, // 'no-implied-eval': 0, 'no-invalid-this': 2, // 'no-iterator': 0, // 'no-labels': 0, // 'no-lone-blocks': 0, // 'no-loop-func': 0, // 'no-magic-numbers': 0, 'no-multi-spaces': 2, 'no-multi-str': 2, // 'no-new': 0, // 'no-new-func': 0, 'no-new-wrappers': 2, // 'no-octal': 2, // eslint:recommended // 'no-octal-escape': 0, // 'no-param-reassign': 0, // 'no-proto': 0, // 'no-redeclare': 2, // eslint:recommended // 'no-restricted-properties': 0, // 'no-return-assign': 0, // 'no-script-url': 0, // 'no-self-assign': 2, // eslint:recommended // 'no-self-compare': 0, // 'no-sequences': 0, 'no-throw-literal': 2, // eslint:recommended // 'no-unmodified-loop-condition': 0, // 'no-unused-expressions': 0, // 'no-unused-labels': 2, // eslint:recommended // 'no-useless-call': 0, // 'no-useless-concat': 0, // 'no-useless-escape': 0, // 'no-void': 0, // 'no-warning-comments': 0, 'no-with': 2, // 'prefer-promise-reject-errors': 0, // 'radix': 0, // 'require-await': 0, // 'vars-on-top': 0, // 'wrap-iife': 0, // 'yoda': 0, // Strict Mode // http://eslint.org/docs/rules/#strict-mode // ----------------------------------------- // 'strict': 0, // Variables // http://eslint.org/docs/rules/#variables // --------------------------------------- // 'init-declarations': 0, // 'no-catch-shadow': 0, // 'no-delete-var': 2, // eslint:recommended // 'no-label-var': 0, // 'no-restricted-globals': 0, // 'no-shadow': 0, // 
'no-shadow-restricted-names': 0, // 'no-undef': 2, // eslint:recommended // 'no-undef-init': 0, // 'no-undefined': 0, 'no-unused-vars': [2, {args: 'none'}], // eslint:recommended // 'no-use-before-define': 0, // Node.js and CommonJS // http://eslint.org/docs/rules/#nodejs-and-commonjs // ------------------------------------------------- // 'callback-return': 0, // 'global-require': 0, // 'handle-callback-err': 0, // 'no-buffer-constructor': 0, // 'no-mixed-requires': 0, // 'no-new-require': 0, // 'no-path-concat': 0, // 'no-process-env': 0, // 'no-process-exit': 0, // 'no-restricted-modules': 0, // 'no-sync': 0, // Stylistic Issues // http://eslint.org/docs/rules/#stylistic-issues // ---------------------------------------------- 'array-bracket-newline': 0, // eslint:recommended 'array-bracket-spacing': [2, 'never'], 'array-element-newline': 0, // eslint:recommended 'block-spacing': [2, 'never'], 'brace-style': 2, 'camelcase': [2, {properties: 'never'}], // 'capitalized-comments': 0, 'comma-dangle': [2, 'always-multiline'], 'comma-spacing': 2, 'comma-style': 2, 'computed-property-spacing': 2, // 'consistent-this': 0, 'eol-last': 2, 'func-call-spacing': 2, // 'func-name-matching': 0, // 'func-names': 0, // 'func-style': 0, // 'id-blacklist': 0, // 'id-length': 0, // 'id-match': 0, // 'indent': 0, // TODO(philipwalton): this rule isn't compatible with // Google's 4-space indent for line continuations. // 'jsx-quotes': 0, 'key-spacing': 2, 'keyword-spacing': 2, // 'line-comment-position': 0, 'linebreak-style': 2, // 'lines-around-comment': 0, // 'max-depth': 0, 'max-len': [2, { code: 80, tabWidth: 2, ignoreUrls: true, ignorePattern: '^goog\.(module|require)', }], // 'max-lines': 0, // 'max-nested-callbacks': 0, // 'max-params': 0, // 'max-statements': 0, // 'max-statements-per-line': 0, // 'multiline-ternary': 0, // TODO(philipwalton): add a rule to enforce the // operator appearing at the end of the line. 
'new-cap': 2, // 'new-parens': 0, // 'newline-per-chained-call': 0, 'no-array-constructor': 2, // 'no-bitwise': 0, // 'no-continue': 0, // 'no-inline-comments': 0, // 'no-lonely-if': 0, // 'no-mixed-operators': 0, 'no-mixed-spaces-and-tabs': 2, // eslint:recommended // 'no-multi-assign': 0, 'no-multiple-empty-lines': [2, {max: 2}], // 'no-negated-condition': 0, // 'no-nested-ternary': 0, 'no-new-object': 2, // 'no-plusplus': 0, // 'no-restricted-syntax': 0, 'no-tabs': 2, // 'no-ternary': 0, 'no-trailing-spaces': 2, // 'no-underscore-dangle': 0, // 'no-unneeded-ternary': 0, // 'no-whitespace-before-property': 0, // 'nonblock-statement-body-position': 0, // 'object-curly-newline': 0, 'object-curly-spacing': 2, // 'object-property-newline': 0, 'one-var': [2, { var: 'never', let: 'never', const: 'never', }], // 'one-var-declaration-per-line': 0, // 'operator-assignment': 0, // 'operator-linebreak': 0, 'padded-blocks': [2, 'never'], // 'padding-line-between-statements': 0, 'quote-props': [2, 'consistent'], 'quotes': [2, 'single', {allowTemplateLiterals: true}], 'require-jsdoc': [2, { require: { FunctionDeclaration: true, MethodDefinition: true, ClassDeclaration: true, }, }], 'semi': 2, 'semi-spacing': 2, // 'semi-style': 0, // 'sort-keys': 0, // 'sort-vars': 0, 'space-before-blocks': 2, 'space-before-function-paren': [2, { asyncArrow: 'always', anonymous: 'never', named: 'never', }], // 'space-in-parens': 0, // 'space-infix-ops': 0, // 'space-unary-ops': 0, 'spaced-comment': [2, 'always'], // 'switch-colon-spacing': 2, // 'template-tag-spacing': 0, // 'unicode-bom': 0, // 'wrap-regex': 0, // ECMAScript 6 // http://eslint.org/docs/rules/#ecmascript-6 // ------------------------------------------ // 'arrow-body-style': 0, 'arrow-parens': [2, 'always'], // TODO(philipwalton): technically arrow // parens are optional but recommended. // ESLint doesn't support a *consistent* // setting so "always" is used. 
// 'arrow-spacing': 0, 'constructor-super': 2, // eslint:recommended 'generator-star-spacing': [2, 'after'], // 'no-class-assign': 0, // 'no-confusing-arrow': 0, // 'no-const-assign': 0, // eslint:recommended // 'no-dupe-class-members': 0, // eslint:recommended // 'no-duplicate-imports': 0, 'no-new-symbol': 2, // eslint:recommended // 'no-restricted-imports': 0, 'no-this-before-super': 2, // eslint:recommended // 'no-useless-computed-key': 0, // 'no-useless-constructor': 0, // 'no-useless-rename': 0, 'no-var': 2, // 'object-shorthand': 0, // 'prefer-arrow-callback': 0, // 'prefer-const': 0, // 'prefer-destructuring': 0, // 'prefer-numeric-literals': 0, 'prefer-rest-params': 2, 'prefer-spread': 2, // 'prefer-template': 0, // 'require-yield': 2, // eslint:recommended 'rest-spread-spacing': 2, // 'sort-imports': 0, // 'symbol-description': 0, // 'template-curly-spacing': 0, 'yield-star-spacing': [2, 'after'], }, }; ================================================ FILE: .eslintrc.js ================================================ module.exports = { extends: './.eslintrc.google.js', parserOptions: { "ecmaVersion": 2017, "sourceType": "module", }, env: { es6: true, }, rules: { 'arrow-parens': 0, 'block-spacing': 0, 'brace-style': 0, 'camelcase': 0, 'comma-dangle': 0, 'comma-style': [2, 'last'], 'curly': 0, 'indent': [0, 4], 'key-spacing': 0, 'linebreak-style': 2, 'max-len': 0, 'new-cap': 0, 'no-invalid-this': 0, 'no-multi-spaces': 0, 'no-undef': 2, 'no-unused-vars': 1, 'object-curly-spacing': 0, 'padded-blocks': [0, 'never'], 'quote-props': 0, 'quotes': 0, 'require-jsdoc': 0, 'semi': [1, 'always'], 'space-before-function-paren': [0, {"anonymous": "never"}], 'valid-jsdoc': 0, }, globals: { // $: true, _: true, rdfstore: true, FormData: true, Backbone: true, document: true, require: true, define: true, console: true, window: true, process: true, module: true, Image: true, exports: true, parent: true, setTimeout: true, setInterval: true, clearTimeout: true, clearInterval: 
true, __dirname: true, GM_registerMenuCommand: true, __filename: true, Buffer: true, fetch: true, }, } ================================================ FILE: .github/workflows/ci.yml ================================================ name: Continuous Integration # Continuous integration test for ocr-fileformat. on: # pull_request: # push: # schedule: # - cron: 0 20 * * * workflow_dispatch: jobs: linux: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 with: submodules: recursive - name: Install tesseract and other dependencies run: | sudo apt-get update sudo make -C example deps - name: Run make all run: | make all PREFIX=$HOME - name: Run tests run: | make -C example roundtrip diff ================================================ FILE: .github/workflows/codeql.yml ================================================ name: "CodeQL" on: push: branches: [ "master" ] pull_request: branches: [ "master" ] schedule: - cron: "46 17 * * 3" jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ javascript ] steps: - name: Checkout uses: actions/checkout@v3 - name: Initialize CodeQL uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} queries: +security-and-quality - name: Autobuild uses: github/codeql-action/autobuild@v2 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 with: category: "/language:${{ matrix.language }}" ================================================ FILE: .gitignore ================================================ /Saxon* *.jar /*.alto vendor/* !vendor/Makefile ocr-fileformat_* *~ ================================================ FILE: .gitmodules ================================================ [submodule "vendor/alto-schema"] path = vendor/alto-schema url = https://github.com/altoxml/schema.git [submodule "vendor/format-converters"] path = vendor/format-converters url = 
https://github.com/OCR-D/format-converters.git [submodule "vendor/gcv2hocr"] path = vendor/gcv2hocr url = https://github.com/dinosauria123/gcv2hocr.git [submodule "vendor/hocr-spec-python"] path = vendor/hocr-spec-python url = https://github.com/kba/hocr-spec-python.git [submodule "vendor/hOCR-to-ALTO"] path = vendor/hOCR-to-ALTO url = https://github.com/filak/hOCR-to-ALTO.git [submodule "vendor/im2alto"] path = vendor/im2alto url = https://github.com/karkraeg/im2alto.git [submodule "vendor/page-to-alto"] path = vendor/page-to-alto url = https://github.com/kba/page-to-alto.git [submodule "vendor/xsd-validator"] path = vendor/xsd-validator url = https://github.com/kba/xsd-validator.git [submodule "vendor/textract2page"] path = vendor/textract2page url = https://github.com/slub/textract2page.git ================================================ FILE: .zipignore ================================================ .git .zipignore .gitignore example ocr-fileformat_* *.pdf *.zip ================================================ FILE: CITATION.cff ================================================ # This CITATION.cff file was generated with cffinit. # Visit https://bit.ly/cffinit to generate yours today! cff-version: 1.2.0 title: ocr-fileformat message: >- You may cite this software using the metadata from this file. 
type: software authors: - name: Universitätsbibliothek Mannheim country: DE city: Mannheim website: 'https://www.bib.uni-mannheim.de/' - given-names: Konstantin family-names: Baierer orcid: 'https://orcid.org/0000-0003-2397-242X' - given-names: Stefan family-names: Weil affiliation: Universitätsbibliothek Mannheim orcid: 'https://orcid.org/0000-0002-0524-9898' - family-names: Zumstein given-names: Philipp affiliation: Universitätsbibliothek Mannheim orcid: 'https://orcid.org/0000-0002-6485-9434' - given-names: Robert family-names: Sachunsky - given-names: Jörg orcid: 'https://orcid.org/0000-0002-6406-4906' family-names: Mechnich affiliation: Universitätsbibliothek Mannheim - given-names: Uwe family-names: Hartwig orcid: 'https://orcid.org/0000-0001-7164-6376' - given-names: Mike family-names: Gerber - given-names: Clemens orcid: 'https://orcid.org/0000-0001-5293-8322' family-names: Neudecker ================================================ FILE: Dockerfile ================================================ FROM alpine:edge EXPOSE 8080 COPY . 
/ocr-fileformat WORKDIR /ocr-fileformat RUN apk add --no-cache openjdk8-jre php7 php7-json php7-openssl python3 py-lxml py-future git make ca-certificates wget bash gcc libc-dev \ && update-ca-certificates \ && make install \ && cp docker.config.php web/config.local.php \ && sed -i '/^upload_max_filesize/ s/=.*$/= 100M/' /etc/php7/php.ini \ && sed -i 's/;extension=php_openssl.dll/extension=php_openssl.dll/' /etc/php7/php.ini \ && mv web /ocr-fileformat-web \ && rm -rf /ocr-fileformat \ && apk del git make wget gcc libc-dev # Disable POST upload limit RUN sed -i 's,post_max_size = 8M,post_max_size = 0,' /etc/php7/php.ini VOLUME /data WORKDIR /data CMD php7 -S $(hostname -i):8080 -t /ocr-fileformat-web ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2016 Universitätsbibliothek Mannheim Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================ # Makefile for ocr-fileformat PKG_NAME = ocr-fileformat PKG_VERSION = 0.5.0 DOCKER_IMAGE = ubma/ocr-fileformat # Either get the version from Git (if available) or use PKG_VERSION. ROOTDIR = $(abspath $(dir $(MAKEFILE_LIST))) VERSION = $(shell [ -d "$(ROOTDIR)/.git" ] && git -C "$(ROOTDIR)" describe --tags 2>/dev/null || echo $(PKG_VERSION)) CP = cp -a LN = ln -sf MV = mv -f MKDIR = mkdir -p RM = rm -rfv ZIP = zip PREFIX = $(DESTDIR)/usr/local SHAREDIR = $(PREFIX)/share/$(PKG_NAME) BINDIR = $(PREFIX)/bin PYTHON = python3 TSHT = ./test/tsht TSHT_URL = https://cdn.rawgit.com/kba/tsht/master/tsht # BEGIN-EVAL makefile-parser --make-help Makefile help: @echo "" @echo " Targets" @echo "" @echo " all Download vendor assets, link XSD schemas and XSLT stylesheets" @echo " vendor Download all vendor assets" @echo " xsd Link all XSD schemas" @echo " xslt Link all XSLT stylesheets" @echo " install Install ocr-fileformat" @echo " uninstall Uninstall ocr-fileformat" @echo " clean Remove linked assets" @echo " realclean Remove linked assets and vendor files" @echo " docker Create the docker image" @echo " release Make release tarball / zipball" @echo @echo @echo " Variables" @echo @echo " PREFIX Top-level directory for installation [$(PREFIX)]" @echo " PYTHON Python version to use for tools [$(PYTHON)]" # END-EVAL # Download vendor assets, link XSD schemas and XSLT stylesheets all: vendor xsd xslt check: $(MAKE) -C vendor check .PHONY: vendor # Download all vendor assets vendor: check # download the dependencies git submodule update --init # create+activate a Python venv if not already active if [ -z "$(VIRTUAL_ENV)" ]; then \ $(PYTHON) -m venv $(SHAREDIR)/venv && \ . 
$(SHAREDIR)/venv/bin/activate && \ pip install -U pip; \ fi && $(MAKE) -C vendor all .PHONY: xsd # Link all XSD schemas xsd: vendor $(MKDIR) xsd # copy Alto XSD cd xsd && $(LN) ../vendor/alto-schema/*/*.xsd . && \ for xsd in *.xsd;do \ target_xsd=`echo $$xsd|sed 's/.//g'|sed 's/-/./'`; \ if [ ! -e $$target_xsd ];then \ $(MV) $$xsd $$target_xsd; \ fi; done # copy PAGE XSD @cd xsd && $(LN) ../vendor/page-schema/*.xsd . # copy ABBYY XSD cd xsd && $(LN) ../vendor/abbyy-schema/*.xsd . .PHONY: xslt # Link all XSLT stylesheets xslt: vendor $(MKDIR) xslt # symlink hocr<->alto as well as the language codes lookup xml cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto.xsl hocr__alto.xsl cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto2.0.xsl hocr__alto2.0.xsl cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto2.1.xsl hocr__alto2.1.xsl cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto3.xsl hocr__alto3.0.xsl cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto4.xsl hocr__alto4.0.xsl cd xslt && $(LN) ../vendor/hOCR-to-ALTO/alto__hocr.xsl alto__hocr.xsl cd xslt && $(LN) alto__hocr.xsl alto2.0__hocr.xsl cd xslt && $(LN) alto__hocr.xsl alto2.1__hocr.xsl cd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__text.xsl hocr__text.xsl cd xslt && $(LN) ../vendor/hOCR-to-ALTO/alto__text.xsl alto__text.xsl cd xslt && $(LN) ../vendor/hOCR-to-ALTO/codes_lookup.xml codes_lookup.xml cd xslt && $(LN) ../vendor/format-converters/page2hocr.xsl page__hocr.xsl cd xslt && $(LN) ../vendor/format-converters/abbyy2hocr.xsl abbyy__hocr.xsl cd xslt && $(LN) ../vendor/format-converters/hocr2tei.xsl hocr__tei.xsl cd xslt && $(LN) alto2.0__alto3.0.xsl alto2.0__alto3.1.xsl cd xslt && $(LN) alto2.0__alto3.0.xsl alto2.1__alto3.0.xsl cd xslt && $(LN) alto2.0__alto3.0.xsl alto2.1__alto3.1.xsl cd xslt && $(LN) ../vendor/im2alto/iw2alto.xsl mybib__alto3.0.xsl # Install ocr-fileformat define SEDSCRIPT echo '/^SHAREDIR=/c\' echo 'SHAREDIR="$(SHAREDIR)"' echo 's/VERSION/$(VERSION)/' endef export SEDSCRIPT install: all 
$(MKDIR) $(SHAREDIR) $(CP) script xsd xslt vendor lib.sh $(SHAREDIR) $(RM) $(SHAREDIR)/vendor/*/.git $(MKDIR) $(BINDIR) eval "$$SEDSCRIPT" | sed -f - bin/ocr-transform.sh > $(BINDIR)/ocr-transform eval "$$SEDSCRIPT" | sed -f - bin/ocr-validate.sh > $(BINDIR)/ocr-validate chmod a+x $(BINDIR)/ocr-transform $(BINDIR)/ocr-validate find $(SHAREDIR) -not -type l -exec chmod u+w {} \; # Uninstall ocr-fileformat uninstall: $(RM) $(BINDIR)/ocr-transform $(RM) $(BINDIR)/ocr-validate $(RM) $(SHAREDIR) # Remove linked assets clean: $(RM) xsd/* find xslt -type l -delete # Remove linked assets and vendor files realclean: clean $(MAKE) -C vendor clean # Create the docker image docker: docker build -t "$(DOCKER_IMAGE)" . # Make release tarball / zipball release: $(RM) $(PKG_NAME)_$(PKG_VERSION) $(MKDIR) $(PKG_NAME)_$(PKG_VERSION) tar -X .zipignore -cf - . | tar -xf - -C $(PKG_NAME)_$(PKG_VERSION) # $(CP) LICENSE Makefile README.md bin/ lib.sh vendor/ tar czf $(PKG_NAME)_$(PKG_VERSION).tar.gz $(PKG_NAME)_$(PKG_VERSION) zip --symlinks -r $(PKG_NAME)_$(PKG_VERSION).zip $(PKG_NAME)_$(PKG_VERSION) ================================================ FILE: README.md ================================================ # ocr-fileformat [![Codacy Badge](https://app.codacy.com/project/badge/Grade/1cd1dc54634249aebbe3e157569ed26f)](https://app.codacy.com/gh/UB-Mannheim/ocr-fileformat/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![Build Status](https://github.com/UB-Mannheim/ocr-fileformat/actions/workflows/ci.yml/badge.svg)](https://github.com/UB-Mannheim/ocr-fileformat/actions/workflows/ci.yml) [![GitHub release](https://img.shields.io/github/release/UB-Mannheim/ocr-fileformat.svg?maxAge=3600)](https://github.com/UB-Mannheim/ocr-fileformat/releases) [![ocr-fileformat Docker build](https://img.shields.io/docker/automated/ubma/ocr-fileformat.svg?maxAge=2592000?style=plastic)](https://hub.docker.com/r/ubma/ocr-fileformat) Validate and transform between OCR file 
formats (hOCR, ALTO, PAGE, FineReader) ![Screenshot GUI](https://raw.githubusercontent.com/UB-Mannheim/ocr-fileformat/master/screenshot.png) * [Installation](#installation) * [Docker](#docker) * [System-wide](#system-wide) * [Usage](#usage) * [CLI](#cli) * [GUI](#gui) * [API](#api) * [Transformation](#transformation) * [Transformation CLI](#transformation-cli) * [Transformation GUI](#transformation-gui) * [Transformation API](#transformation-api) * [Supported Transformations](#supported-transformations) * [Validation](#validation) * [Validation CLI](#validation-cli) * [Validation GUI](#validation-gui) * [Validation API](#validation-api) * [Supported Validation Formats](#supported-validation-formats) * [License](#license) ## Installation ### Docker You can run the [command line scripts](#cli) and [web interface](#gui) as a [Docker container](https://hub.docker.com/r/ubma/ocr-fileformat); you only need Docker installed. To start the web interface on [http://localhost:8080](http://localhost:8080): ```sh docker run --rm -it -p 8080:8080 ubma/ocr-fileformat ``` To run the command line scripts, mount the directory containing your input files into the container's `/data` directory: ```sh docker run --rm -it -v "$PWD":/data ubma/ocr-fileformat ocr-transform alto2.0 hocr somefile.alto ``` ### System-wide To install system-wide to `/usr/local`: ```sh sudo make install ``` To install without `sudo` to your home directory: ```sh make install PREFIX=$HOME/.local ``` If `$HOME/.local/bin` is not in your `PATH`, add this to your shell startup file (e.g. `~/.bashrc` or `~/.zshrc`): ``` export PATH="$HOME/.local/bin:$PATH" ``` The web application has a PHP backend. You can deploy it on any PHP-capable server by copying the [`web`](./web) folder somewhere below the document root of your server, e.g.
`/var/www/html` for Apache on Debian/Ubuntu: ``` sudo -u www-data cp -r web /var/www/html/ocr-fileformat ``` In this example the GUI would be available under [http://localhost/ocr-fileformat/](http://localhost/ocr-fileformat/). ## Usage The project offers two functionalities, which can be accessed via a command line script (CLI), using a web interface (GUI) or in your own tools (API). ### CLI * [`ocr-transform`](./bin/ocr-transform.sh): Transformation of OCR output between OCR formats * [`ocr-validate`](./bin/ocr-validate.sh): Validation of OCR output against OCR format schemas ### GUI The web interface is for testing validation and transformations. You can upload a file or select an input file by URL. ### API * [`$PREFIX/share/ocr-fileformat/xslt`](./xslt) - XSLT stylesheets * [`$PREFIX/share/ocr-fileformat/xsd`](./xsd) - XSD schemas * [`$PREFIX/share/ocr-fileformat/script/transform`](./script/transform) - Transformation scripts * [`$PREFIX/share/ocr-fileformat/script/validate`](./script/validate) - Validation scripts ## Transformation ### Transformation CLI ``` Usage: ocr-transform [-dl] <from> <to> [<infile> [<outfile>]] [-- <script-args>] ``` For example, you can transform an ALTO XML to a hOCR file with: ```sh ocr-transform alto hocr sample.xml sample.hocr ``` Or convert from ALTO XML (version 2.1) to hOCR with: ```sh ocr-transform alto2.1 hocr sample.alto sample.hocr ``` You can also pass arguments directly to the Saxon CLI by passing them after a double dash (`--`).
For example, to set the `foo` parameter to `bar`: ```sh ocr-transform alto hocr sample.xml sample.hocr -- foo=bar ``` Try `ocr-transform -h` to get an overview: ``` Usage: ocr-transform [OPTIONS] <from> <to> [<infile> [<outfile>]] [-- <script-args>] ocr-transform [OPTIONS] <from> <to> --help-args Show script-args, and exit ocr-transform [OPTIONS] -h|--help Show this help, and exit ocr-transform [OPTIONS] -v|--version Show version, and exit ocr-transform [OPTIONS] -L|--list List available from/to, and exit Options: --debug -d Increase debug level by 1, can be repeated Transformations: abbyy hocr abbyy page alto hocr alto page alto text alto2.0 alto3.0 alto2.0 alto3.1 alto2.0 hocr alto2.1 alto3.0 alto2.1 alto3.1 alto2.1 hocr alto4.2 alto2.1 gcv alto gcv hocr gcv page hocr alto hocr alto2.0 hocr alto2.1 hocr alto3.0 hocr alto4.0 hocr page hocr tei hocr text mybib alto3.0 page alto page alto_legacy page hocr page page2019 page text tei hocr textract page ``` ### Transformation GUI Select the `Transform` menu option. Choose a URL, an input and an output format. Click `Transform`. ### Transformation API The stylesheets are installed in `$PREFIX/share/ocr-fileformat/xslt` and can be used directly in your scripts and software. You will need to use an XSLT 2.0 capable stylesheet transformer.
### Supported Transformations | From ╲ To | hOCR | ALTO | PAGEXML | TEI | Text | | ---: | --- | --- | --- | --- | --- | | hOCR | - | ✓ | ✓ | ✓ | ✓ | | ALTO | ✓ | ✓ | ✓ | - | ✓ | | PAGEXML | ✓ | ✓ | ✓ | - | ✓ | | ABBYY FineReader | ✓ | - | ✓ | - | - | | Google Cloud Vision | ✓ | ✓ | ✓ | - | - | | Amazon AWS Textract | - | - | ✓ | - | - | | TEI | ✓ | - | - | - | - | ## Validation ``` Usage: ocr-validate [OPTIONS] <schema> [<file>] ocr-validate [OPTIONS] -h|--help Show this help, and exit ocr-validate [OPTIONS] -v|--version Show version, and exit ocr-validate [OPTIONS] -L|--list List available schemas, and exit Options: --debug -d Increase debug level by 1, can be repeated Schemas: hocr alto-1-0 alto-1-1 alto-1-2 alto-1-3 alto-1-4 alto-2-0 alto-2-1 alto-2-2-draft alto-3-0 alto-3-1 alto-3-2-draft alto-4-0 alto-4-1 alto-4-2 alto-4-3 abbyy-6-schema-v1 abbyy-8-schema-v2 abbyy-9-schema-v1 abbyy-10-schema-v1 page-2009-03-16 page-2010-01-12 page-2010-03-19 page-2013-07-15 page-2016-07-15 page-2017-07-15 page-2018-07-15 page-2019-07-15 ``` ### Validation CLI For example, to validate an XML file against the ALTO 3.1 schema: ``` ocr-validate alto-3-1 myFile.alto ``` ### Validation GUI Select the `Validate` menu option. Choose a URL and a schema. Click `Validate`. ### Validation API The XSD files are installed under `$PREFIX/share/ocr-fileformat/xsd`. ### Supported Validation Formats | | hOCR | ALTO | PAGEXML | FineReader | Google Cloud Vision | Amazon AWS Textract | | ---: | --- | --- | --- | --- | --- | --- | | Validation | ✓ | ✓ | ✓ | ✓ | - | - | ## License This is free software. You may use it under the terms of the [MIT License](LICENSE). During the installation process several projects are included (in [`./vendor`](./vendor)). These projects have different licenses: * [Saxon HE 9.7](http://saxon.sourceforge.net/#F9.7HE), [`MPL`](https://www.mozilla.org/MPL/).
* [ALTOXML schema](https://github.com/altoxml/schema), ["Open Source"](https://github.com/altoxml/schema/issues/37#issuecomment-218730230) for ALTO <= 3.1, [`CC BY SA 4.0`](https://creativecommons.org/licenses/by-sa/4.0/legalcode) since ALTO 4.0 * [PAGE schemas](http://www.primaresearch.org/schema/PAGE/gts/pagecontent/), `?` * [xsd-validator](https://github.com/kba/xsd-validator) by Adrian Mouat [@amouat](https://github.com/amouat), `Apache 2.0` * ABBYY FineReader XSD, `?` * [hOCR-to-ALTO](https://github.com/filak/hOCR-to-ALTO) by Filip Kriz [@filak](https://github.com/filak), [`MIT`](https://github.com/filak/hOCR-to-ALTO/blob/master/LICENSE.txt) * [hocr-spec](https://github.com/kba/hocr-spec-python) by Konstantin Baierer [@kba](https://github.com/kba), [`MIT`](https://github.com/kba/hocr-spec-python/blob/master/LICENSE) * [gcv2hocr](https://github.com/dinosauria123/gcv2hocr) by Endo Michiaki, [`CC BY 4.0`](https://creativecommons.org/licenses/by/4.0/legalcode) * [format-converters](https://github.com/OCR-D/format-converters) by OCR-D, [`Apache 2.0`](https://github.com/OCR-D/format-converters/blob/master/LICENSE) * [prima-page-converter](https://github.com/PRImA-Research-Lab/prima-page-converter/) by PRImA Research Lab , [`Apache 2.0`](https://github.com/PRImA-Research-Lab/prima-page-converter/blob/master/LICENSE) * [page-to-alto](https://github.com/kba/page-to-alto/) by Konstantin Baierer @kba, [`Apache 2.0`](https://github.com/kba/page-to-alto/blob/master/LICENSE) * [textract2page](https://github.com/slub/textract2page/) by Arne Rümmler @rue-a, [`Apache 2.0`](https://github.com/slub/textract2page/blob/master/LICENSE) ================================================ FILE: bin/ocr-transform.sh ================================================ #!/usr/bin/env bash # Default to the parent dir of this script. 
Overwritten by `make install` SHAREDIR="$(readlink -f "$(dirname "$(readlink -f "$0")")/..")" source "$SHAREDIR/lib.sh" #{{{ show_usage () show_usage () { [[ "$#" -gt 0 ]] && logerr "$@" echo >&2 "Usage: ${0##*/} [OPTIONS] [ []] [-- ] ${0##*/} [OPTIONS] --help-args Show script-args, and exit ${0##*/} [OPTIONS] -h|--help Show this help, and exit ${0##*/} [OPTIONS] -v|--version Show version, and exit ${0##*/} [OPTIONS] -L|--list List available from/to, and exit Options: --debug -d Increase debug level by 1, can be repeated " echo >&2 -e "\n${INDENT}Transformations:" show_transformations|sed "s/^/${INDENT}${INDENT}/" [[ "$#" -gt 0 ]] && exit 1 } #}}} #{{{ show_version () show_version () { echo "${0##*/} VERSION" } #}}} #{{{ main () main () { # debug option -d -d to print all commands to the terminal if (( DEBUG > 1 ));then set -x fi local from="$1" to="$2" infile='-' outfile='-' transformer shift 2 # Validate parameters if [[ -z "$from" ]];then show_usage "Must set 'from' parameter" elif [[ -z "$to" ]];then show_usage "Must set 'to' parameter" elif [[ -z "${OCR_TRANSFORMATIONS[$from]}" ]];then show_usage "No mapping from '$from'" else declare -a possible=(${OCR_TRANSFORMATIONS[$from]}) if ! 
in_array "$to" "${possible[@]}";then show_usage "No mapping from '$from' to '$to'" fi fi transformer=${OCR_TRANSFORMERS[${from}__${to}]} if [[ "$1" == '--help-args' ]];then if [[ "$transformer" = */gcv__hocr ]];then echo >&2 -e "${INDENT}Extra arguments: " elif [[ "$transformer" = */page__alto ]];then echo >&2 -e "${INDENT}page-to-alto options:" page-to-alto --help|sed '1,/^Options:/d;/--output-file/,$d' >&2 elif [[ "$transformer" = */textract__page ]];then echo >&2 -e "${INDENT}textract2page arguments: " echo >&2 -e "${INDENT}textract2page options:" else # xsl and other transformers both take arbitrary Saxon options show_saxon_options|sed "s/^/${INDENT}${INDENT}/" fi exit 0 fi declare -a script_args # if [[ "$1" == '--' ]];then script_args+=("${@:2}") set -- elif [[ -n "$1" ]];then infile="$1" fi shift # if [[ "$1" == '--' ]];then script_args+=("${@:2}") set -- elif [[ -n "$1" ]];then outfile="$1" fi shift; # if [[ "$1" == '--' ]];then script_args+=("${@:2}") fi if (( DEBUG > 0 ));then [[ "$infile" = '-' ]] && logdebug "Reading from STDIN" [[ "$outfile" = '-' ]] && logdebug "Writing to STDOUT" fi # Run it optstate=$(set +o) set -o errexit if [[ "$transformer" = *.xsl ]];then script_args=("${script_args[@]}" "-xsl:$transformer") script_args=("${script_args[@]}" "-s:$infile") [[ "$outfile" != '-' ]] && script_args=("${script_args[@]}" "-o:$outfile") exec_saxon "${script_args[@]}" else script_args=("$infile" "$outfile" "${script_args[@]}") source "$transformer" "${script_args[@]}" fi eval "$optstate" } #}}} while [[ "$1" = -* ]]; do case "$1" in -d|--debug) let DEBUG+=1 ;; -L|--list) show_transformations ; exit 0 ;; -h|--help) show_usage ; exit 0 ;; -v|--version) show_version ; exit 0 ;; *) logerr "Unknown option '$1'" && show_usage && exit 1 ;; esac shift done if [[ -d "$SHAREDIR/venv" ]];then . 
"$SHAREDIR/venv/bin/activate" fi main "$@" ================================================ FILE: bin/ocr-validate.sh ================================================ #!/usr/bin/env bash # Default to the parent dir of this script. Overwritten by `make install` SHAREDIR="$(readlink "$(dirname "$(readlink "$0")")/..")" source "$SHAREDIR/lib.sh" #{{{ show_usage () show_usage () { [[ "$#" -gt 0 ]] && logerr "$@" echo >&2 "Usage: ${0##*/} [OPTIONS] [] ${0##*/} [OPTIONS] -h|--help Show this help, and exit ${0##*/} [OPTIONS] -v|--version Show version, and exit ${0##*/} [OPTIONS] -L|--list List available schemas, and exit Options: --debug -d Increase debug level by 1, can be repeated " echo >&2 -e "\n${INDENT}Schemas:" show_schemas|sed "s/^/${INDENT}${INDENT}/" echo [[ "$#" -gt 0 ]] && exit 1 } #}}} #{{{ show_version () show_version () { echo "${0##*/} VERSION" } #}}} #{{{ main () main () { # debug option -d -d to print all commands to the terminal if (( DEBUG > 1 ));then set -x fi local schema="$1" file="$2" shift 2 if [[ -z "$schema" ]];then show_usage "Must set 'schema'" elif [[ -z "${OCR_VALIDATORS[$schema]}" ]];then show_usage "No such schema '$schema'" fi if [[ -z "$file" ]];then show_usage "Must set 'file'" fi if [[ "$file" == "-" ]];then ((DEBUG > 1)) && loginfo "Reading from STDIN" else file=$(readlink "$file") if [[ ! -e "$file" ]];then show_usage "No such file: '$file'" fi fi if [[ "${OCR_VALIDATORS[$schema]}" = *.xsd ]];then "exec_xsdv" "$schema" "$file" else source "${OCR_VALIDATORS[$schema]}" "$file" fi } #}}} while [[ "$1" = -* ]]; do case "$1" in --debug|-d) let DEBUG+=1 ;; --list|-L) show_schemas|sed -e 's/\s*$//' -e 's/ \+/\n/g' ; exit 0 ;; --help|-h) show_usage ; exit 0 ;; --version|-v) show_version ; exit 0 ;; *) logerr "Unknown option '$1'" && show_usage && exit 1 ;; esac shift done if [[ -d "$SHAREDIR/venv" ]];then . 
"$SHAREDIR/venv/bin/activate" fi main "$@"
================================================
FILE: docker.config.php
================================================
$@ $(BASENAME).alto : $(BASENAME).hocr $(OCR_TRANSFORM) hocr alto2.0 $< | $(XMLLINT) - > $@ $(BASENAME).alto.page : $(BASENAME).alto $(OCR_TRANSFORM) alto page $< | $(XMLLINT) - > $@ $(BASENAME).alto.page.alto : $(BASENAME).alto.page $(OCR_TRANSFORM) page alto $< | $(XMLLINT) - > $@ $(BASENAME).roundtrip.hocr : $(BASENAME).alto $(OCR_TRANSFORM) alto hocr $< | $(XMLLINT) - > $@ clean: $(RM) $(BASENAME)*.hocr $(BASENAME)*.alto
================================================
FILE: example/README.md
================================================
# Testing transformations

Install dependencies. For Debian/Ubuntu:

    make deps

Run a roundtrip example:

    make roundtrip

This will:

* download image (`-> x.jpg`)
* OCR the image (`-> x.hocr`)
* hOCR -> ALTO 2.0 (`-> x.alto`)
* ALTO 2.0 -> hOCR (`-> x.roundtrip.hocr`)

To see the information lost/added:

    make diff

This will compare `x.hocr` to `x.roundtrip.hocr` using `dwdiff` and open the result in a pager.

## License

The example data is from the [Deutsches Textarchiv](https://www.deutschestextarchiv.de/book/show/wetzel_reisebegleiter_1901) project, data is licensed CC BY-NC 3.0.

================================================
FILE: lib.sh
================================================
#!/usr/bin/env bash

#{{{ Logging
# Only use ANSI colors when the terminal advertises color support; otherwise
# the COLOR_* variables stay unset and expand to nothing.
if [[ -n "$COLORTERM" || "$TERM" = *color* || "$TERM" = xterm* ]];then
    COLOR_ERROR="\033[1;31m"
    COLOR_INFO="\033[1;32m"
    COLOR_DEBUG="\033[1;34m"
    COLOR_DEFAULT="\033[0m"
fi

# Log every line of the arguments as a separate "[ERROR]" line on STDERR.
# `$*` is split on newlines only (local IFS), so multi-line messages get the
# prefix on each line. `line` is local so it does not leak into the scripts
# that source this file.
# shellcheck disable=SC2048
logerr () {
    local IFS=$'\n' line
    for line in $*;do
        echo -e "${COLOR_DEFAULT}[${COLOR_ERROR}ERROR${COLOR_DEFAULT}] $line" >&2
    done
}
# Log the arguments as one "[INFO]" line on STDERR.
loginfo () { echo -e "${COLOR_DEFAULT}[${COLOR_INFO}INFO${COLOR_DEFAULT}] $*" >&2; }
# Log the arguments as one "[DEBUG]" line on STDERR.
logdebug () { echo -e "${COLOR_DEFAULT}[${COLOR_DEBUG}DEBUG${COLOR_DEFAULT}] $*" >&2; }
#}}}

if [[ -z "$SHAREDIR" || ! 
-d "$SHAREDIR" ]];then
    logerr "Set \$SHAREDIR before sourcing $0"
    exit 1
fi

#{{{ utils (in_array)
# utility function to find the first pos param in the rest pos params
# Returns 0 if "$1" equals one of "${@:2}", 1 otherwise.
in_array () {
    local e
    for e in "${@:2}"; do [[ "$e" == "$1" ]] && return 0; done
    return 1
}
#}}}

#{{{ Global vars
export DEBUG=0
export INDENT=" "
# Mapping 'fmt' -> 'fmt2 fmt3 fmt4'
declare -Ax OCR_TRANSFORMATIONS=()
# Mapping 'fmt' -> '/path-to-xslt-or-transform-script'
declare -Ax OCR_TRANSFORMERS=()
# Mapping 'fmt' -> '/path-to-xsd-or-validate-script'
declare -Ax OCR_VALIDATORS=()
#}}}

#{{{ Set up validation and transformation formats
# setup_transformations ()
# Fill OCR_TRANSFORMERS and OCR_TRANSFORMATIONS from the files found below
# $SHAREDIR/xslt and $SHAREDIR/script/transform. A file counts as transformer
# if it is named *.xsl or matches `-perm -005`; its basename without extension
# is the '<from>__<to>' key.
setup_transformations () {
    declare -a transformers=($(
        find -L "$SHAREDIR/xslt" "$SHAREDIR/script/transform" \
            ! -type d \( -name '*.xsl' -or -perm -005 \) \
        ))
    # All loop variables are local so they do not leak into sourcing scripts
    local path fmt in_fmt out_fmt
    for path in "${transformers[@]}";do
        fmt=${path##*/}
        fmt=${fmt%.*}
        OCR_TRANSFORMERS[$fmt]="$path"
        in_fmt=${fmt%%__*}
        out_fmt=${fmt##*__}
        if [[ -z "${OCR_TRANSFORMATIONS[$in_fmt]}" ]];then
            OCR_TRANSFORMATIONS[$in_fmt]="$out_fmt"
        else
            OCR_TRANSFORMATIONS[$in_fmt]+=" $out_fmt"
        fi
    done
}

# setup_validations ()
# Fill OCR_VALIDATORS from the files found below $SHAREDIR/xsd and
# $SHAREDIR/script/validate (named *.xsd or matching `-perm -005`); the
# basename without extension is the schema name.
setup_validations () {
    declare -a validators=($(
        find -L "$SHAREDIR/xsd" "$SHAREDIR/script/validate" \
            ! -type d \( -name '*.xsd' -or -perm -005 \) \
            |sort))
    local path fmt
    for path in "${validators[@]}";do
        fmt=${path##*/}
        fmt=${fmt%.*}
        OCR_VALIDATORS[$fmt]="$path"
    done
}

# Populate all lookup tables; runs once when this file is sourced.
setup () {
    setup_transformations
    setup_validations
}
setup
#}}}

#{{{ List transformations, validations, saxon options
# show_schemas ()
# Print the schema names sorted, one output line per schema group (the part
# of the name before the first '-'), names separated by a space.
show_schemas() {
    local schema schemagroup
    declare -a sorted=($(IFS=$'\n'; echo "${!OCR_VALIDATORS[*]}"|sort -t- -nk2 -k1))
    for schema in "${sorted[@]}";do
        # Start a new line whenever the group prefix changes
        [[ -n "$schemagroup" && "$schemagroup" != ${schema%%-*} ]] && echo
        echo -n "$schema "
        schemagroup=${schema%%-*}
    done
}

# show_transformations ()
# Print one sorted "<from> <to>" pair per line.
show_transformations() {
    local in_fmt out_fmt
    for in_fmt in "${!OCR_TRANSFORMATIONS[@]}";do
        declare -a out_fmts=(${OCR_TRANSFORMATIONS[$in_fmt]})
        for out_fmt in "${out_fmts[@]}";do
            echo "${in_fmt} ${out_fmt}";
        done
    done|sort
}

# show_saxon_options ()
# Print Saxon's own option list: everything it reports after the
# "No source file" message, minus the "Format:" line.
show_saxon_options () {
    exec_saxon -t 2>&1|sed -e '0,/No source file/ d' -e '/Format:/ d'
}
#}}}

#{{{ run saxon / xsd-validator (xsdv.sh)
# exec_saxon ()
# Run the vendored Saxon jar with the given arguments.
# At debug level > 0 the command line is logged; at debug level > 1 Saxon's
# `-t` flag is added. The original code collected `-t` in a SAXON_ARGS array
# that was never passed on, so the flag had no effect.
exec_saxon() {
    local -a saxon_args=()
    # Do not add -t twice when the caller already passes it itself
    # (show_saxon_options does).
    if (( DEBUG > 1 )) && ! in_array '-t' "$@";then
        saxon_args+=('-t')
    fi
    (( DEBUG > 0 )) && loginfo Executing "java -jar $SHAREDIR/vendor/saxon.jar" "${saxon_args[@]}" "$@"
    java -jar "$SHAREDIR/vendor/saxon.jar" "${saxon_args[@]}" "$@"
}

# exec_xsdv ()
# Validate file $2 against $SHAREDIR/xsd/$1.xsd with the vendored xsdv.sh.
# Changes the working directory to the validator's directory, so $2 should be
# an absolute path (or '-').
exec_xsdv() {
    local schema="$1" file="$2"
    # Without the validator directory ./xsdv.sh below cannot work; fail early
    cd "$SHAREDIR/vendor/xsd-validator" || return 1
    if ((DEBUG > 0));then
        loginfo "PWD: '$PWD'"
        loginfo "./xsdv.sh '$SHAREDIR/xsd/${schema}.xsd' '$file'"
    fi
    ./xsdv.sh "$SHAREDIR/xsd/${schema}.xsd" "$file"
}
#}}}

================================================
FILE: script/transform/README.md
================================================
Scripts should be named `<from>__<to>`, e.g. `hocr-1.0__abbyy-10`.

Will be called as

```
$SHAREDIR/script/transform/<from>__<to> <infile> <outfile>
```

Both `<infile>` and `<outfile>` can be `-`, in which case input should be read from STDIN or written to STDOUT.
================================================
FILE: script/transform/alto__page
================================================
#!/bin/bash
# Convert ALTO to the latest PAGE version with the PRImA PageConverter jar.
#   $1  input file, '-' reads STDIN (buffered in a temp file)
#   $2  output file, '-' writes STDOUT (via a temp file)
#   $3  optional argument; when set and output is '-', the result is passed
#       through net.sf.saxon.Query with this argument instead of `cat`
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that an installation path containing spaces still resolves
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/"; pwd)"
JAR="$VENDORDIR/JPageConverter/PageConverter.jar"
INFILE="$1"
OUTFILE="$2"
ARGUMENT="$3"

if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi

java -jar "$JAR" -neg-coords toZero -source-xml "$INFILE" -target-xml "$OUTFILE" -convert-to LATEST 2>&1

if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    if [[ -z "$ARGUMENT" ]]; then
        cat "$OUTFILE"
    else
        java -cp "$VENDORDIR/saxon.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT"
    fi
    rm "$OUTFILE"
fi

================================================
FILE: script/transform/gcv__alto
================================================
#!/bin/bash
# Convert Google Cloud Vision JSON to ALTO with the PRImA PageConverter jar.
#   $1  input file, '-' reads STDIN (buffered in a temp file)
#   $2  output file, '-' writes STDOUT (via a temp file)
#   $3  optional argument; when set and output is '-', the result is passed
#       through net.sf.saxon.Query with this argument instead of `cat`
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that an installation path containing spaces still resolves
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/"; pwd)"
JAR="$VENDORDIR/JPageConverter/PageConverter.jar"
INFILE="$1"
OUTFILE="$2"
ARGUMENT="$3"

if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi

java -jar "$JAR" -neg-coords toZero -source-json "$INFILE" -target-xml "$OUTFILE" -convert-to ALTO 2>&1

if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    if [[ -z "$ARGUMENT" ]]; then
        cat "$OUTFILE"
    else
        java -cp "$VENDORDIR/saxon.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT"
    fi
    rm "$OUTFILE"
fi

================================================
FILE: script/transform/gcv__hocr
================================================
#!/bin/bash
# Convert Google Cloud Vision JSON to hOCR with the vendored gcv2hocr.
#   $1  input file, '-' reads STDIN (buffered in a temp file)
#   $2  output file, '-' writes STDOUT (via a temp file)
#   $3  page width passed to gcv2hocr (default 2000)
#   $4  page height passed to gcv2hocr (default 2000)
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that an installation path containing spaces still resolves
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/"; pwd)"
VENDORSCRIPT="$VENDORDIR/gcv2hocr/gcv2hocr"
INFILE="$1"
OUTFILE="$2"
#TODO
WIDTH=${3:-2000}
HEIGHT=${4:-2000}

if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi

"$VENDORSCRIPT" "$INFILE" "$OUTFILE" "$WIDTH" "$HEIGHT"

if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    cat "$OUTFILE"
    rm "$OUTFILE"
fi
# -f: this script is sourced while the caller has `errexit` enabled, so a
# missing scratch file must not turn a successful conversion into a failure.
rm -f preout1.txt preout2.txt

================================================
FILE: script/transform/gcv__page
================================================
#!/bin/bash
# Convert Google Cloud Vision JSON to the latest PAGE version with the PRImA
# PageConverter jar.
#   $1  input file, '-' reads STDIN (buffered in a temp file)
#   $2  output file, '-' writes STDOUT (via a temp file)
#   $3  optional argument; when set and output is '-', the result is passed
#       through net.sf.saxon.Query with this argument instead of `cat`
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that an installation path containing spaces still resolves
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/"; pwd)"
JAR="$VENDORDIR/JPageConverter/PageConverter.jar"
INFILE="$1"
OUTFILE="$2"
ARGUMENT="$3"

if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi

java -jar "$JAR" -neg-coords toZero -source-json "$INFILE" -target-xml "$OUTFILE" -convert-to LATEST 2>&1

if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    if [[ -z "$ARGUMENT" ]]; then
        cat "$OUTFILE"
    else
        java -cp "$VENDORDIR/saxon.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT"
    fi
    rm "$OUTFILE"
fi

================================================
FILE: script/transform/page__alto
================================================
#!/bin/bash
# Convert PAGE to ALTO with page-to-alto.
#   $1  input file, '-' reads STDIN (buffered in a temp file)
#   $2  output file, '-' writes STDOUT (via a temp file)
#   $3… extra options handed to page-to-alto unchanged
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that an installation path containing spaces still resolves
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/"; pwd)"
INFILE="$1"
OUTFILE="$2"
ARGUMENTS=("${@:3}")

if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi

# `cmd || retval=$?` instead of `cmd; retval=$?`: the caller enables
# `errexit`, which would abort right after a failing command and skip the
# cleanup below.
retval=0
page-to-alto "${ARGUMENTS[@]}" -O "$OUTFILE" "$INFILE" || retval="$?"

if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if (( retval > 0 )); then
    # -f: the output file may not exist after a failed conversion
    rm -f "$OUTFILE"
    exit $retval
fi
if [[ "$2" = "-" ]]; then
    cat "$OUTFILE"
    rm "$OUTFILE"
fi

================================================
FILE: script/transform/page__alto_legacy
================================================
#!/bin/bash
# Convert PAGE to ALTO with the PRImA PageConverter jar.
#   $1  input file, '-' reads STDIN (buffered in a temp file)
#   $2  output file, '-' writes STDOUT (via a temp file)
#   $3  optional argument; when set and output is '-', the result is passed
#       through net.sf.saxon.Query with this argument instead of `cat`
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that an installation path containing spaces still resolves
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/"; pwd)"
JAR="$VENDORDIR/JPageConverter/PageConverter.jar"
INFILE="$1"
OUTFILE="$2"
ARGUMENT="$3"

if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi

java -jar "$JAR" -neg-coords toZero -source-xml "$INFILE" -target-xml "$OUTFILE" -convert-to ALTO 2>&1

if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if [[ "$2" = "-" ]]; then
    if [[ -z "$ARGUMENT" ]]; then
        cat "$OUTFILE"
    else
        java -cp "$VENDORDIR/saxon.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT"
    fi
    rm "$OUTFILE"
fi

================================================
FILE: script/transform/textract__page
================================================
#!/bin/bash
# Convert Amazon AWS Textract output to PAGE with textract2page.
#   $1  input file, '-' reads STDIN (buffered in a temp file)
#   $2  output file, '-' writes STDOUT (via a temp file)
#   $3  passed to textract2page as positional argument after the input file
#   $4… extra options handed to textract2page unchanged
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Quoted so that an installation path containing spaces still resolves
VENDORDIR="$(cd "$SCRIPTDIR/../../vendor/"; pwd)"
INFILE="$1"
OUTFILE="$2"
ARGUMENTS=("${@:3}")

if [[ "$1" = "-" ]]; then
    INFILE="$(mktemp)"
    cat >"$INFILE"
fi
if [[ "$2" = "-" ]]; then
    OUTFILE="$(mktemp)"
fi

# `cmd || retval=$?` instead of `cmd; retval=$?`: the caller enables
# `errexit`, which would abort right after a failing command and skip the
# cleanup below.
retval=0
textract2page "${ARGUMENTS[@]:1}" -O "$OUTFILE" "$INFILE" "${ARGUMENTS[0]}" || retval="$?"

if [[ "$1" = "-" ]]; then
    rm "$INFILE"
fi
if (( retval > 0 ));then
    # -f: the output file may not exist after a failed conversion
    rm -f "$OUTFILE"
    exit $retval
fi
if [[ "$2" = "-" ]]; then
    cat "$OUTFILE"
    rm "$OUTFILE"
fi

================================================
FILE: script/validate/README.md
================================================
Scripts here will be called by `ocr-validate`. Name should be the format and version, lowercase letters, numbers and dash only.
================================================
FILE: script/validate/hocr
================================================
#!/bin/bash
# Validate the hOCR file "$1" with the vendored hocr-spec tool, using the
# "relaxed" profile. Output format is "ansi" on color terminals, else "xml".
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HOCR_SPEC="$SCRIPTDIR/../../vendor/hocr-spec-python/hocr-spec"
format="xml"
if [[ "$TERM" = *"color"* ]];then
    format="ansi"
fi
python3 "$HOCR_SPEC" -f "$format" -p relaxed --filename "STDIN" "$1"

================================================
FILE: vendor/Makefile
================================================
# Downloads and builds the third-party tools and schemas used by
# ocr-transform and ocr-validate.

MKDIR = mkdir -p
RM = rm -rfv
UNZIP = unzip -o
WGET = wget --progress=bar:force --no-verbose
PIP = pip3

SAXON_HE_VERSION_MAJOR = 11
SAXON_HE_VERSION_MINOR = 2
SAXON_HE_ZIP = SaxonHE$(SAXON_HE_VERSION_MAJOR)-$(SAXON_HE_VERSION_MINOR)J.zip
SAXON_HE_URL = https://netcologne.dl.sourceforge.net/project/saxon/Saxon-HE/$(SAXON_HE_VERSION_MAJOR)/Java/$(SAXON_HE_ZIP)
SAXON_HE_JAR = saxon-he-$(SAXON_HE_VERSION_MAJOR).$(SAXON_HE_VERSION_MINOR).jar

PAGE_SCHEMA_REPO = page-schema
PAGE_SCHEMA_VERSIONS = 2009-03-16 2010-01-12 2010-03-19 2013-07-15 2016-07-15 2017-07-15 2018-07-15 2019-07-15
PAGE_SCHEMA_BASE_URL = https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/PAGE-release/gts/pagecontent

ABBYY_SCHEMA_REPO = abbyy-schema
ABBYY_SCHEMA_BASE_URL = https://fr7.abbyy.com/FineReader_xml/FineReader
ABBYY_SCHEMA_VERSIONS = 6-schema-v1 8-schema-v2 9-schema-v1 10-schema-v1

ALTO2PAGE_VERSION_MAJOR_MINOR = 1.5
ALTO2PAGE_VERSION = $(ALTO2PAGE_VERSION_MAJOR_MINOR).06
ALTO2PAGE_ZIP = JPageConverter.zip
ALTO2PAGE_URL = https://github.com/UB-Mannheim/prima-page-converter/releases/download/$(ALTO2PAGE_VERSION)/JPageConverter_$(ALTO2PAGE_VERSION).zip
ALTO2PAGE_DIR = JPageConverter

# {{{
# SAXON_BROWSER_VERSION = 1.1
# SAXON_BROWSER_ZIP = Saxon-CE_$(SAXON_BROWSER_VERSION).zip
# SAXON_BROWSER_JS = TODO
# SAXON_BROWSER_URL = http://www.saxonica.com/ce/download/$(SAXON_BROWSER_ZIP)
# $(SAXON_BROWSER_JS): $(SAXON_BROWSER_ZIP)
# $(SAXON_BROWSER_ZIP):
# wget -O '$@' '$(SAXON_BROWSER_URL)'
#}}}

# `clean` is listed as well so that a file or directory named "clean" can
# never make the target look up to date.
.PHONY: all check clean $(PAGE_SCHEMA_REPO) $(ABBYY_SCHEMA_REPO) gcv2hocr page-to-alto textract2page format-converters

all:\
	$(PAGE_SCHEMA_REPO)\
	$(ABBYY_SCHEMA_REPO)\
	gcv2hocr \
	saxon.jar \
	$(ALTO2PAGE_DIR) \
	page-to-alto \
	textract2page \
	format-converters

clean:
	$(RM) $(SAXON_HE_JAR) saxon.jar
	$(RM) $(SAXON_HE_ZIP)
	$(RM) $(PAGE_SCHEMA_REPO)
	$(RM) $(ALTO2PAGE_DIR)
	$(RM) $(ALTO2PAGE_ZIP)

# Verify that the download/unpack tools are installed
check:
	@which wget >/dev/null || (echo "Missing wget. Please install package wget." && exit 1)
	@which unzip >/dev/null || (echo "Missing unzip. Please install package unzip." && exit 1)

# Download every ABBYY schema version that is not present yet
$(ABBYY_SCHEMA_REPO):
	@$(MKDIR) "$@" && cd "$@" && \
	for version in $(ABBYY_SCHEMA_VERSIONS);do \
		xsd=abbyy-$$version.xsd; if [ ! -e $$xsd ];then \
			$(WGET) -O $$xsd $(ABBYY_SCHEMA_BASE_URL)$$version.xml; \
		fi; \
	done;

# Download every PAGE schema version that is not present yet
$(PAGE_SCHEMA_REPO):
	@$(MKDIR) "$@" && cd "$@" && \
	for version in $(PAGE_SCHEMA_VERSIONS);do \
		xsd=page-$$version.xsd; if [ ! -e $$xsd ];then \
			$(WGET) -O $$xsd $(PAGE_SCHEMA_BASE_URL)/$$version/pagecontent.xsd; \
		fi; \
	done;

# Version-independent name used by lib.sh
saxon.jar: $(SAXON_HE_JAR)
	ln -sf "$<" "$@"

$(SAXON_HE_JAR): $(SAXON_HE_ZIP)
	$(UNZIP) "$<"

$(SAXON_HE_ZIP):
	$(WGET) -O "$@" "$(SAXON_HE_URL)"

gcv2hocr:
	$(MAKE) -C $@

$(ALTO2PAGE_ZIP):
	$(WGET) -O "$@" "$(ALTO2PAGE_URL)"

# The zip unpacks to a versioned directory name; move it to a stable one
$(ALTO2PAGE_DIR): $(ALTO2PAGE_ZIP)
	$(UNZIP) "$<"
	rm -rf "$@"
	mv "JPageConverter $(ALTO2PAGE_VERSION)" "$@"

page-to-alto:
	cd "$@"; $(PIP) install .

textract2page:
	cd "$@"; $(PIP) install .

format-converters:
	cd "$@"; $(PIP) install .

================================================
FILE: web/config.php
================================================
<?php

// Default settings; config.local.php (if present) is included afterwards
// and may change them.
$config = [
    'ocr-validate' => dirname(__FILE__) . '/../bin/ocr-validate.sh',
    'ocr-transform' => dirname(__FILE__) . '/../bin/ocr-transform.sh',
    'formats' => [
        'transform' => [],
        'validate' => [],
    ],
];

$local_settings = dirname(__FILE__) . '/config.local.php';
if (file_exists($local_settings) === TRUE) {
    include $local_settings;
}

/**
 * List of installed transform from-to-tuples.
 * List of installed schemas.
*/ function buildFormatList() { global $config; $lines = []; exec($config['ocr-transform'] . ' -L', $lines); foreach ($lines as $line) { $fromto = preg_split("/\s+/", $line); $from = $fromto[0]; $to = $fromto[1]; // echo $from, "\t", $to, "\n"; if (! array_key_exists($from, $config['formats']['transform'])) { $config['formats']['transform'][$from] = [$to]; } else { array_push($config['formats']['transform'][$from], $to); } } exec($config['ocr-validate'] . ' -L', $config['formats']['validate']); } buildFormatList(); return $config; ================================================ FILE: web/index.html ================================================ OCR Fileformat Fork me on GitHub ================================================ FILE: web/ocr-fileformat.css ================================================ .glyphicon.spinning { animation: spin 1s infinite linear; -webkit-animation: spin2 1s infinite linear; } @keyframes spin { from { transform: scale(1) rotate(0deg); } to { transform: scale(1) rotate(360deg); } } @-webkit-keyframes spin2 { from { -webkit-transform: rotate(0deg); } to { -webkit-transform: rotate(360deg); } } .result { max-height: 75vh; } .github-fork-ribbon { position: fixed; } .github-fork-ribbon.left-bottom:before { background-color: #080; } ================================================ FILE: web/ocr-fileformat.js ================================================ /* globals $ */ /* globals Blob */ /* global Prism */ let OcrFileformatAPI = function OcrFileformatAPI(endpoint) { this.endpoint = endpoint; }; OcrFileformatAPI.prototype.urlFor = function urlFor(action, params) { params || (params = {}); let url = this.endpoint + '?do=' + action; for (let paramName of Object.keys(params)) { url += '&' + paramName + '=' + params[paramName]; } return url; }; OcrFileformatAPI.prototype.updateFormats = function updateFormats(cb) { let self = this; this.request('list', null, null, function(err, formats) { self.formats = formats; cb(err); }); }; 
OcrFileformatAPI.prototype.request = function request(endpoint, query, formData, cb) { let ajaxCall = { type: 'GET', url: window.api.urlFor(endpoint, query), success: function(data) { cb(null, data); }, error: function(xhr) { cb(xhr.responseText); }, }; if (formData) { ajaxCall.type = 'POST'; ajaxCall.data = formData; ajaxCall.processData = false; ajaxCall.contentType = false; } $.ajax(ajaxCall); }; function escapeHTML(str) { return str. replace(/&/g, '&'). replace(//g, '>'); } function onChangeFormat() { if ($("#transform-from option").length == 1) { Object.keys(window.api.formats.transform).forEach(function(from) { $("#transform-from").append($("