[
  {
    "path": ".dockerignore",
    "content": "Dockerfile\nexample\ntest\nREADME.md\nxsd\n\nxslt\n!xslt/alto2.0__alto3.0.xsl\n!xslt/page__text.xsl\n!xslt/tei__hocr.xsl\n\nvendor/*\n!vendor/Makefile\n!vendor/saxon*.jar\n"
  },
  {
    "path": ".eslintrc.google.js",
    "content": "/**\n * Copyright 2016 Google Inc. All rights reserved.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n'use strict';\n\nmodule.exports = {\n  rules: {\n    // The rules below are listed in the order they appear on the eslint\n    // rules page. All rules are listed to make it easier to keep in sync\n    // as new ESLint rules are added.\n    // http://eslint.org/docs/rules/\n    // - Rules in the `eslint:recommended` ruleset that aren't specifically\n    //   mentioned by the google styleguide are listed but commented out (so\n    //   they don't override a base ruleset).\n    // - Rules that are recommended but contradict the Google styleguide\n    //   are explicitely set to the Google styleguide value.\n\n    // Possible Errors\n    // http://eslint.org/docs/rules/#possible-errors\n    // ---------------------------------------------\n    // 'for-direction': 0,\n    // 'no-await-in-loop': 0,\n    // 'no-compare-neg-zero': 2, // eslint:recommended\n    'no-cond-assign': 0, // eslint:recommended\n    // 'no-console': 2, // eslint:recommended\n    // 'no-constant-condition': 2, // eslint:recommended\n    // 'no-control-regex': 2, // eslint:recommended\n    // 'no-debugger': 2, // eslint:recommended\n    // 'no-dupe-args': 2, // eslint:recommended\n    // 'no-dupe-keys': 2, // eslint:recommended\n    // 'no-duplicate-case': 2, // eslint:recommended\n    // 'no-empty': 2, // eslint:recommended\n    // 
'no-empty-character-class': 2, // eslint:recommended\n    // 'no-ex-assign': 2, // eslint:recommended\n    // 'no-extra-boolean-cast': 2, // eslint:recommended\n    // 'no-extra-parens': 0,\n    // 'no-extra-semi': 2, // eslint:recommended\n    // 'no-func-assign': 2, // eslint:recommended\n    // 'no-inner-declarations': 2, // eslint:recommended\n    // 'no-invalid-regexp': 2, // eslint:recommended\n    'no-irregular-whitespace': 2, // eslint:recommended\n    // 'no-obj-calls': 2, // eslint:recommended\n    // 'no-prototype-builtins': 0,\n    // 'no-regex-spaces': 2, // eslint:recommended\n    // 'no-sparse-arrays': 2, // eslint:recommended\n    // 'no-template-curly-in-string': 0,\n    'no-unexpected-multiline': 2, // eslint:recommended\n    // 'no-unreachable': 2, // eslint:recommended\n    // 'no-unsafe-finally': 2, // eslint:recommended\n    // 'no-unsafe-negation': 0,\n    // 'use-isnan': 2 // eslint:recommended\n    'valid-jsdoc': [2, {\n      requireParamDescription: false,\n      requireReturnDescription: false,\n      requireReturn: false,\n      prefer: {returns: 'return'},\n    }],\n    // 'valid-typeof': 2 // eslint:recommended\n\n\n    // Best Practices\n    // http://eslint.org/docs/rules/#best-practices\n    // --------------------------------------------\n\n    // 'accessor-pairs': 0,\n    // 'array-callback-return': 0,\n    // 'block-scoped-var': 0,\n    // 'class-methods-use-this': 0,\n    // 'complexity': 0,\n    // 'consistent-return': 0\n    'curly': [2, 'multi-line'], // TODO(philipwalton): add an option to enforce\n                                // braces with the exception of simple,\n                                // single-line if statements.\n    // 'default-case': 0,\n    // 'dot-location': 0,\n    // 'dot-notation': 0,\n    // 'eqeqeq': 0,\n    'guard-for-in': 2,\n    // 'no-alert': 0,\n    'no-caller': 2,\n    // 'no-case-declarations': 2, // eslint:recommended\n    // 'no-div-regex': 0,\n    // 'no-else-return': 0,\n    // 
'no-empty-function': 0,\n    // 'no-empty-pattern': 2, // eslint:recommended\n    // 'no-eq-null': 0,\n    // 'no-eval': 0,\n    'no-extend-native': 2,\n    'no-extra-bind': 2,\n    // 'no-extra-label': 0,\n    // 'no-fallthrough': 2, // eslint:recommended\n    // 'no-floating-decimal': 0,\n    // 'no-global-assign': 0,\n    // 'no-implicit-coercion': 0,\n    // 'no-implicit-globals': 0,\n    // 'no-implied-eval': 0,\n    'no-invalid-this': 2,\n    // 'no-iterator': 0,\n    // 'no-labels': 0,\n    // 'no-lone-blocks': 0,\n    // 'no-loop-func': 0,\n    // 'no-magic-numbers': 0,\n    'no-multi-spaces': 2,\n    'no-multi-str': 2,\n    // 'no-new': 0,\n    // 'no-new-func': 0,\n    'no-new-wrappers': 2,\n    // 'no-octal': 2, // eslint:recommended\n    // 'no-octal-escape': 0,\n    // 'no-param-reassign': 0,\n    // 'no-proto': 0,\n    // 'no-redeclare': 2, // eslint:recommended\n    // 'no-restricted-properties': 0,\n    // 'no-return-assign': 0,\n    // 'no-script-url': 0,\n    // 'no-self-assign': 2, // eslint:recommended\n    // 'no-self-compare': 0,\n    // 'no-sequences': 0,\n    'no-throw-literal': 2, // eslint:recommended\n    // 'no-unmodified-loop-condition': 0,\n    // 'no-unused-expressions': 0,\n    // 'no-unused-labels': 2, // eslint:recommended\n    // 'no-useless-call': 0,\n    // 'no-useless-concat': 0,\n    // 'no-useless-escape': 0,\n    // 'no-void': 0,\n    // 'no-warning-comments': 0,\n    'no-with': 2,\n    // 'prefer-promise-reject-errors': 0,\n    // 'radix': 0,\n    // 'require-await': 0,\n    // 'vars-on-top': 0,\n    // 'wrap-iife': 0,\n    // 'yoda': 0,\n\n    // Strict Mode\n    // http://eslint.org/docs/rules/#strict-mode\n    // -----------------------------------------\n    // 'strict': 0,\n\n    // Variables\n    // http://eslint.org/docs/rules/#variables\n    // ---------------------------------------\n    // 'init-declarations': 0,\n    // 'no-catch-shadow': 0,\n    // 'no-delete-var': 2, // eslint:recommended\n    // 
'no-label-var': 0,\n    // 'no-restricted-globals': 0,\n    // 'no-shadow': 0,\n    // 'no-shadow-restricted-names': 0,\n    // 'no-undef': 2, // eslint:recommended\n    // 'no-undef-init': 0,\n    // 'no-undefined': 0,\n    'no-unused-vars': [2, {args: 'none'}], // eslint:recommended\n    // 'no-use-before-define': 0,\n\n    // Node.js and CommonJS\n    // http://eslint.org/docs/rules/#nodejs-and-commonjs\n    // -------------------------------------------------\n    // 'callback-return': 0,\n    // 'global-require': 0,\n    // 'handle-callback-err': 0,\n    // 'no-buffer-constructor': 0,\n    // 'no-mixed-requires': 0,\n    // 'no-new-require': 0,\n    // 'no-path-concat': 0,\n    // 'no-process-env': 0,\n    // 'no-process-exit': 0,\n    // 'no-restricted-modules': 0,\n    // 'no-sync': 0,\n\n    // Stylistic Issues\n    // http://eslint.org/docs/rules/#stylistic-issues\n    // ----------------------------------------------\n    'array-bracket-newline': 0, // eslint:recommended\n    'array-bracket-spacing': [2, 'never'],\n    'array-element-newline': 0, // eslint:recommended\n    'block-spacing': [2, 'never'],\n    'brace-style': 2,\n    'camelcase': [2, {properties: 'never'}],\n    // 'capitalized-comments': 0,\n    'comma-dangle': [2, 'always-multiline'],\n    'comma-spacing': 2,\n    'comma-style': 2,\n    'computed-property-spacing': 2,\n    // 'consistent-this': 0,\n    'eol-last': 2,\n    'func-call-spacing': 2,\n    // 'func-name-matching': 0,\n    // 'func-names': 0,\n    // 'func-style': 0,\n    // 'id-blacklist': 0,\n    // 'id-length': 0,\n    // 'id-match': 0,\n    // 'indent': 0, // TODO(philipwalton): this rule isn't compatible with\n                    // Google's 4-space indent for line continuations.\n    // 'jsx-quotes': 0,\n    'key-spacing': 2,\n    'keyword-spacing': 2,\n    // 'line-comment-position': 0,\n    'linebreak-style': 2,\n    // 'lines-around-comment': 0,\n    // 'max-depth': 0,\n    'max-len': [2, {\n      code: 80,\n      
tabWidth: 2,\n      ignoreUrls: true,\n      ignorePattern: '^goog\\.(module|require)',\n    }],\n    // 'max-lines': 0,\n    // 'max-nested-callbacks': 0,\n    // 'max-params': 0,\n    // 'max-statements': 0,\n    // 'max-statements-per-line': 0,\n    // 'multiline-ternary': 0, // TODO(philipwalton): add a rule to enforce the\n                               // operator appearing at the end of the line.\n    'new-cap': 2,\n    // 'new-parens': 0,\n    // 'newline-per-chained-call': 0,\n    'no-array-constructor': 2,\n    // 'no-bitwise': 0,\n    // 'no-continue': 0,\n    // 'no-inline-comments': 0,\n    // 'no-lonely-if': 0,\n    // 'no-mixed-operators': 0,\n    'no-mixed-spaces-and-tabs': 2, // eslint:recommended\n    // 'no-multi-assign': 0,\n    'no-multiple-empty-lines': [2, {max: 2}],\n    // 'no-negated-condition': 0,\n    // 'no-nested-ternary': 0,\n    'no-new-object': 2,\n    // 'no-plusplus': 0,\n    // 'no-restricted-syntax': 0,\n    'no-tabs': 2,\n    // 'no-ternary': 0,\n    'no-trailing-spaces': 2,\n    // 'no-underscore-dangle': 0,\n    // 'no-unneeded-ternary': 0,\n    // 'no-whitespace-before-property': 0,\n    // 'nonblock-statement-body-position': 0,\n    // 'object-curly-newline': 0,\n    'object-curly-spacing': 2,\n    // 'object-property-newline': 0,\n    'one-var': [2, {\n      var: 'never',\n      let: 'never',\n      const: 'never',\n    }],\n    // 'one-var-declaration-per-line': 0,\n    // 'operator-assignment': 0,\n    // 'operator-linebreak': 0,\n    'padded-blocks': [2, 'never'],\n    // 'padding-line-between-statements': 0,\n    'quote-props': [2, 'consistent'],\n    'quotes': [2, 'single', {allowTemplateLiterals: true}],\n    'require-jsdoc': [2, {\n      require: {\n        FunctionDeclaration: true,\n        MethodDefinition: true,\n        ClassDeclaration: true,\n      },\n    }],\n    'semi': 2,\n    'semi-spacing': 2,\n    // 'semi-style': 0,\n    // 'sort-keys': 0,\n    // 'sort-vars': 0,\n    'space-before-blocks': 2,\n    
'space-before-function-paren': [2, {\n      asyncArrow: 'always',\n      anonymous: 'never',\n      named: 'never',\n    }],\n    // 'space-in-parens': 0,\n    // 'space-infix-ops': 0,\n    // 'space-unary-ops': 0,\n    'spaced-comment': [2, 'always'],\n    // 'switch-colon-spacing': 2,\n    // 'template-tag-spacing': 0,\n    // 'unicode-bom': 0,\n    // 'wrap-regex': 0,\n\n    // ECMAScript 6\n    // http://eslint.org/docs/rules/#ecmascript-6\n    // ------------------------------------------\n    // 'arrow-body-style': 0,\n    'arrow-parens': [2, 'always'], // TODO(philipwalton): technically arrow\n                                   // parens are optional but recommended.\n                                   // ESLint doesn't support a *consistent*\n                                   // setting so \"always\" is used.\n    // 'arrow-spacing': 0,\n    'constructor-super': 2, // eslint:recommended\n    'generator-star-spacing': [2, 'after'],\n    // 'no-class-assign': 0,\n    // 'no-confusing-arrow': 0,\n    // 'no-const-assign': 0, // eslint:recommended\n    // 'no-dupe-class-members': 0, // eslint:recommended\n    // 'no-duplicate-imports': 0,\n    'no-new-symbol': 2, // eslint:recommended\n    // 'no-restricted-imports': 0,\n    'no-this-before-super': 2, // eslint:recommended\n    // 'no-useless-computed-key': 0,\n    // 'no-useless-constructor': 0,\n    // 'no-useless-rename': 0,\n    'no-var': 2,\n    // 'object-shorthand': 0,\n    // 'prefer-arrow-callback': 0,\n    // 'prefer-const': 0,\n    // 'prefer-destructuring': 0,\n    // 'prefer-numeric-literals': 0,\n    'prefer-rest-params': 2,\n    'prefer-spread': 2,\n    // 'prefer-template': 0,\n    // 'require-yield': 2, // eslint:recommended\n    'rest-spread-spacing': 2,\n    // 'sort-imports': 0,\n    // 'symbol-description': 0,\n    // 'template-curly-spacing': 0,\n    'yield-star-spacing': [2, 'after'],\n  },\n};\n"
  },
  {
    "path": ".eslintrc.js",
    "content": "module.exports = {\n    extends: './.eslintrc.google.js',\n    parserOptions: {\n        \"ecmaVersion\": 2017,\n        \"sourceType\": \"module\",\n    },\n    env: {\n        es6: true,\n    },\n    rules: {\n        'arrow-parens': 0,\n        'block-spacing': 0,\n        'brace-style': 0,\n        'camelcase': 0,\n        'comma-dangle': 0,\n        'comma-style': [2, 'last'],\n        'curly': 0,\n        'indent': [0, 4],\n        'key-spacing': 0,\n        'linebreak-style': 2,\n        'max-len': 0,\n        'new-cap': 0,\n        'no-invalid-this': 0,\n        'no-multi-spaces': 0,\n        'no-undef': 2,\n        'no-unused-vars': 1,\n        'object-curly-spacing': 0,\n        'padded-blocks': [0, 'never'],\n        'quote-props': 0,\n        'quotes': 0,\n        'require-jsdoc': 0,\n        'semi': [1, 'always'],\n        'space-before-function-paren': [0, {\"anonymous\": \"never\"}],\n        'valid-jsdoc': 0,\n    },\n    globals: {\n        // $: true,\n        _: true,\n        rdfstore: true,\n        FormData: true,\n        Backbone: true,\n        document: true,\n        require: true,\n        define: true,\n        console: true,\n        window: true,\n        process: true,\n        module: true,\n        Image: true,\n        exports: true,\n        parent: true,\n        setTimeout: true,\n        setInterval: true,\n        clearTimeout: true,\n        clearInterval: true,\n        __dirname: true,\n        GM_registerMenuCommand: true,\n        __filename: true,\n        Buffer: true,\n        fetch: true,\n    },\n}\n"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: Continuous Integration\n\n# Continuous integration test for ocr-fileformat.\n\non:\n  # pull_request:\n  # push:\n  # schedule:\n  #   - cron: 0 20 * * *\n  workflow_dispatch:\n\njobs:\n  linux:\n    runs-on: ubuntu-22.04\n\n    steps:\n    - uses: actions/checkout@v3\n      with:\n        submodules: recursive\n\n    - name: Install tesseract and other dependencies\n      run: |\n           sudo apt-get update\n           sudo make -C example deps\n\n    - name: Run make all\n      run: |\n           make all PREFIX=$HOME\n\n    - name: Run tests\n      run: |\n           make -C example roundtrip diff\n"
  },
  {
    "path": ".github/workflows/codeql.yml",
    "content": "name: \"CodeQL\"\n\non:\n  push:\n    branches: [ \"master\" ]\n  pull_request:\n    branches: [ \"master\" ]\n  schedule:\n    - cron: \"46 17 * * 3\"\n\njobs:\n  analyze:\n    name: Analyze\n    runs-on: ubuntu-latest\n    permissions:\n      actions: read\n      contents: read\n      security-events: write\n\n    strategy:\n      fail-fast: false\n      matrix:\n        language: [ javascript ]\n\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v3\n\n      - name: Initialize CodeQL\n        uses: github/codeql-action/init@v2\n        with:\n          languages: ${{ matrix.language }}\n          queries: +security-and-quality\n\n      - name: Autobuild\n        uses: github/codeql-action/autobuild@v2\n\n      - name: Perform CodeQL Analysis\n        uses: github/codeql-action/analyze@v2\n        with:\n          category: \"/language:${{ matrix.language }}\"\n"
  },
  {
    "path": ".gitignore",
    "content": "/Saxon*\n*.jar\n/*.alto\nvendor/*\n!vendor/Makefile\nocr-fileformat_*\n*~\n"
  },
  {
    "path": ".gitmodules",
    "content": "[submodule \"vendor/alto-schema\"]\n\tpath = vendor/alto-schema\n\turl = https://github.com/altoxml/schema.git\n[submodule \"vendor/format-converters\"]\n\tpath = vendor/format-converters\n\turl = https://github.com/OCR-D/format-converters.git\n[submodule \"vendor/gcv2hocr\"]\n\tpath = vendor/gcv2hocr\n\turl = https://github.com/dinosauria123/gcv2hocr.git\n[submodule \"vendor/hocr-spec-python\"]\n\tpath = vendor/hocr-spec-python\n\turl = https://github.com/kba/hocr-spec-python.git\n[submodule \"vendor/hOCR-to-ALTO\"]\n\tpath = vendor/hOCR-to-ALTO\n\turl = https://github.com/filak/hOCR-to-ALTO.git\n[submodule \"vendor/im2alto\"]\n\tpath = vendor/im2alto\n\turl = https://github.com/karkraeg/im2alto.git\n[submodule \"vendor/page-to-alto\"]\n\tpath = vendor/page-to-alto\n\turl = https://github.com/kba/page-to-alto.git\n[submodule \"vendor/xsd-validator\"]\n\tpath = vendor/xsd-validator\n\turl = https://github.com/kba/xsd-validator.git\n[submodule \"vendor/textract2page\"]\n\tpath = vendor/textract2page\n\turl = https://github.com/slub/textract2page.git\n"
  },
  {
    "path": ".zipignore",
    "content": ".git\n.zipignore\n.gitignore\nexample\nocr-fileformat_*\n*.pdf\n*.zip\n"
  },
  {
    "path": "CITATION.cff",
    "content": "# This CITATION.cff file was generated with cffinit.\n# Visit https://bit.ly/cffinit to generate yours today!\n\ncff-version: 1.2.0\ntitle: ocr-fileformat\nmessage: >-\n  You may cite this software using the metadata from this file.\ntype: software\nauthors:\n  - name: Universitätsbibliothek Mannheim\n    country: DE\n    city: Mannheim\n    website: 'https://www.bib.uni-mannheim.de/'\n  - given-names: Konstantin\n    family-names: Baierer\n    orcid: 'https://orcid.org/0000-0003-2397-242X'\n  - given-names: Stefan\n    family-names: Weil\n    affiliation: Universitätsbibliothek Mannheim\n    orcid: 'https://orcid.org/0000-0002-0524-9898'\n  - family-names: Zumstein\n    given-names: Philipp\n    affiliation: Universitätsbibliothek Mannheim\n    orcid: 'https://orcid.org/0000-0002-6485-9434'\n  - given-names: Robert\n    family-names: Sachunsky\n  - given-names: Jörg\n    orcid: 'https://orcid.org/0000-0002-6406-4906'\n    family-names: Mechnich\n    affiliation: Universitätsbibliothek Mannheim\n  - given-names: Uwe\n    family-names: Hartwig\n    orcid: 'https://orcid.org/0000-0001-7164-6376'\n  - given-names: Mike\n    family-names: Gerber\n  - given-names: Clemens\n    orcid: 'https://orcid.org/0000-0001-5293-8322'\n    family-names: Neudecker\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM alpine:edge\n\nEXPOSE 8080\nCOPY . /ocr-fileformat\nWORKDIR /ocr-fileformat\nRUN apk add --no-cache openjdk8-jre php7 php7-json php7-openssl python3 py-lxml py-future git make ca-certificates wget bash gcc libc-dev \\\n    && update-ca-certificates \\\n    && make install \\\n    && cp docker.config.php web/config.local.php \\\n    && sed -i '/^upload_max_filesize/ s/=.*$/= 100M/' /etc/php7/php.ini \\\n    && sed -i 's/;extension=php_openssl.dll/extension=php_openssl.dll/' /etc/php7/php.ini \\\n    && mv web /ocr-fileformat-web \\\n    && rm -rf /ocr-fileformat \\\n    && apk del git make wget gcc libc-dev\n# Disable POST upload limit\nRUN sed -i 's,post_max_size = 8M,post_max_size = 0,' /etc/php7/php.ini\nVOLUME /data\nWORKDIR /data\nCMD php7 -S $(hostname -i):8080 -t /ocr-fileformat-web\n"
  },
  {
    "path": "LICENSE",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2016 Universitätsbibliothek Mannheim\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "Makefile",
    "content": "# Makefile for ocr-fileformat\n\nPKG_NAME = ocr-fileformat\nPKG_VERSION = 0.5.0\nDOCKER_IMAGE = ubma/ocr-fileformat\n\n# Either get the version from Git (if available) or use PKG_VERSION.\nROOTDIR = $(abspath $(dir $(MAKEFILE_LIST)))\nVERSION = $(shell [ -d \"$(ROOTDIR)/.git\" ] && git -C \"$(ROOTDIR)\" describe --tags 2>/dev/null || echo $(PKG_VERSION))\n\nCP = cp -a\nLN = ln -sf\nMV = mv -f\nMKDIR = mkdir -p\nRM = rm -rfv\nZIP = zip\n\nPREFIX = $(DESTDIR)/usr/local\nSHAREDIR = $(PREFIX)/share/$(PKG_NAME)\nBINDIR = $(PREFIX)/bin\nPYTHON = python3\n\nTSHT = ./test/tsht\nTSHT_URL = https://cdn.rawgit.com/kba/tsht/master/tsht\n\n# BEGIN-EVAL makefile-parser --make-help Makefile\n\nhelp:\n\t@echo \"\"\n\t@echo \"  Targets\"\n\t@echo \"\"\n\t@echo \"    all        Download vendor assets, link XSD schemas and XSLT stylesheets\"\n\t@echo \"    vendor     Download all vendor assets\"\n\t@echo \"    xsd        Link all XSD schemas\"\n\t@echo \"    xslt       Link all XSLT stylesheets\"\n\t@echo \"    install    Install ocr-fileformat\"\n\t@echo \"    uninstall  Uninstall ocr-fileformat\"\n\t@echo \"    clean      Remove linked assets\"\n\t@echo \"    realclean  Remove linked assets and vendor files\"\n\t@echo \"    docker     Create the docker image\"\n\t@echo \"    release    Make release tarball / zipball\"\n\t@echo\n\t@echo\n\t@echo \"  Variables\"\n\t@echo\n\t@echo \"    PREFIX     Top-level directory for installation [$(PREFIX)]\"\n\t@echo \"    PYTHON     Python version to use for tools [$(PYTHON)]\"\n\n# END-EVAL\n\n# Download vendor assets, link XSD schemas and XSLT stylesheets\nall: vendor xsd xslt\n\ncheck:\n\t$(MAKE) -C vendor check\n\n.PHONY: vendor\n# Download all vendor assets\nvendor: check\n\t# download the dependencies\n\tgit submodule update --init\n\t# create+activate a Python venv if not already active\n\tif [ -z \"$(VIRTUAL_ENV)\" ]; then \\\n\t$(PYTHON) -m venv $(SHAREDIR)/venv && \\\n\t. 
$(SHAREDIR)/venv/bin/activate && \\\n\tpip install -U pip; \\\n\tfi && $(MAKE) -C vendor all\n\n.PHONY: xsd\n# Link all XSD schemas\nxsd: vendor\n\t$(MKDIR) xsd\n\t# copy Alto XSD\n\tcd xsd && $(LN) ../vendor/alto-schema/*/*.xsd . && \\\n\t\tfor xsd in *.xsd;do \\\n\t\t\ttarget_xsd=`echo $$xsd|sed 's/.//g'|sed 's/-/./'`; \\\n\t\t\tif [ ! -e $$target_xsd ];then \\\n\t\t\t\t$(MV) $$xsd $$target_xsd; \\\n\t\t\tfi; done\n\t# copy PAGE XSD\n\t@cd xsd && $(LN) ../vendor/page-schema/*.xsd .\n\t# copy ABBYY XSD\n\tcd xsd && $(LN) ../vendor/abbyy-schema/*.xsd .\n\n.PHONY: xslt\n# Link all XSLT stylesheets\nxslt: vendor\n\t$(MKDIR) xslt\n\t# symlink hocr<->alto as well as the language codes lookup xml\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto.xsl hocr__alto.xsl\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto2.0.xsl hocr__alto2.0.xsl\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto2.1.xsl hocr__alto2.1.xsl\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto3.xsl hocr__alto3.0.xsl\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__alto4.xsl hocr__alto4.0.xsl\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/alto__hocr.xsl alto__hocr.xsl\n\tcd xslt && $(LN) alto__hocr.xsl alto2.0__hocr.xsl\n\tcd xslt && $(LN) alto__hocr.xsl alto2.1__hocr.xsl\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/hocr__text.xsl hocr__text.xsl\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/alto__text.xsl alto__text.xsl\n\tcd xslt && $(LN) ../vendor/hOCR-to-ALTO/codes_lookup.xml codes_lookup.xml\n\tcd xslt && $(LN) ../vendor/format-converters/page2hocr.xsl page__hocr.xsl\n\tcd xslt && $(LN) ../vendor/format-converters/abbyy2hocr.xsl abbyy__hocr.xsl\n\tcd xslt && $(LN) ../vendor/format-converters/hocr2tei.xsl hocr__tei.xsl\n\tcd xslt && $(LN) alto2.0__alto3.0.xsl alto2.0__alto3.1.xsl\n\tcd xslt && $(LN) alto2.0__alto3.0.xsl alto2.1__alto3.0.xsl\n\tcd xslt && $(LN) alto2.0__alto3.0.xsl alto2.1__alto3.1.xsl\n\tcd xslt && $(LN) ../vendor/im2alto/iw2alto.xsl mybib__alto3.0.xsl\n\n# Install 
ocr-fileformat\ndefine SEDSCRIPT\necho '/^SHAREDIR=/c\\'\necho 'SHAREDIR=\"$(SHAREDIR)\"'\necho 's/VERSION/$(VERSION)/'\nendef\nexport SEDSCRIPT\ninstall: all\n\t$(MKDIR) $(SHAREDIR)\n\t$(CP) script xsd xslt vendor lib.sh $(SHAREDIR)\n\t$(RM) $(SHAREDIR)/vendor/*/.git\n\t$(MKDIR) $(BINDIR)\n\teval \"$$SEDSCRIPT\" | sed -f - bin/ocr-transform.sh > $(BINDIR)/ocr-transform\n\teval \"$$SEDSCRIPT\" | sed -f - bin/ocr-validate.sh  > $(BINDIR)/ocr-validate\n\tchmod a+x $(BINDIR)/ocr-transform $(BINDIR)/ocr-validate\n\tfind $(SHAREDIR) -not -type l -exec chmod u+w {} \\;\n\n# Uninstall ocr-fileformat\nuninstall:\n\t$(RM) $(BINDIR)/ocr-transform\n\t$(RM) $(BINDIR)/ocr-validate\n\t$(RM) $(SHAREDIR)\n\n# Remove linked assets\nclean:\n\t$(RM) xsd/*\n\tfind xslt -type l -delete\n\n# Remove linked assets and vendor files\nrealclean: clean\n\t$(MAKE) -C vendor clean\n\n# Create the docker image\ndocker:\n\tdocker build -t \"$(DOCKER_IMAGE)\" .\n\n# Make release tarball / zipball\nrelease:\n\t$(RM) $(PKG_NAME)_$(PKG_VERSION)\n\t$(MKDIR) $(PKG_NAME)_$(PKG_VERSION)\n\ttar -X .zipignore -cf - . | tar -xf - -C $(PKG_NAME)_$(PKG_VERSION)\n\t# $(CP) LICENSE Makefile README.md bin/ lib.sh vendor/\n\ttar czf $(PKG_NAME)_$(PKG_VERSION).tar.gz $(PKG_NAME)_$(PKG_VERSION)\n\tzip --symlinks -r $(PKG_NAME)_$(PKG_VERSION).zip $(PKG_NAME)_$(PKG_VERSION)\n"
  },
  {
    "path": "README.md",
    "content": "# ocr-fileformat\n\n[![Codacy Badge](https://app.codacy.com/project/badge/Grade/1cd1dc54634249aebbe3e157569ed26f)](https://app.codacy.com/gh/UB-Mannheim/ocr-fileformat/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)\n[![Build Status](https://github.com/UB-Mannheim/ocr-fileformat/actions/workflows/ci.yml/badge.svg)](https://github.com/UB-Mannheim/ocr-fileformat/actions/workflows/ci.yml)\n[![GitHub release](https://img.shields.io/github/release/UB-Mannheim/ocr-fileformat.svg?maxAge=3600)](https://github.com/UB-Mannheim/ocr-fileformat/releases)\n[![ocr-fileformat Docker build](https://img.shields.io/docker/automated/ubma/ocr-fileformat.svg?maxAge=2592000?style=plastic)](https://hub.docker.com/r/ubma/ocr-fileformat)\n\nValidate and transform between OCR file formats (hOCR, ALTO, PAGE, FineReader)\n\n![Screenshot GUI](https://raw.githubusercontent.com/UB-Mannheim/ocr-fileformat/master/screenshot.png)\n\n<!-- BEGIN-MARKDOWN-TOC -->\n* [Installation](#installation)\n  * [Docker](#docker)\n  * [System-wide](#system-wide)\n* [Usage](#usage)\n  * [CLI](#cli)\n  * [GUI](#gui)\n  * [API](#api)\n* [Transformation](#transformation)\n  * [Transformation CLI](#transformation-cli)\n  * [Transformation GUI](#transformation-gui)\n  * [Transformation API](#transformation-api)\n  * [Supported Transformations](#supported-transformations)\n* [Validation](#validation)\n  * [Validation CLI](#validation-cli)\n  * [Validation GUI](#validation-gui)\n  * [Validation API](#validation-api)\n  * [Supported Validation Formats](#supported-validation-formats)\n* [License](#license)\n\n<!-- END-MARKDOWN-TOC -->\n\n## Installation\n\n### Docker\n\nYou can run the [command line scripts](#cli) and [web interface](#gui) as a\n[Docker container](https://hub.docker.com/r/ubma/ocr-fileformat), you only need\nDocker installed.\n\nTo start the web interface on [http://localhost:8080](http://localhost:8080):\n\n```sh\ndocker run --rm -it -p 8080:8080 
ubma/ocr-fileformat\n```\n\nTo run the command line scripts, mount the directory containing your input\nfiles into the container's `/data` directory:\n\n```sh\ndocker run --rm -it -v \"$PWD\":/data ubma/ocr-fileformat ocr-transform alto2.0 hocr somefile.alto\n```\n\n### System-wide\n\nTo install system-wide to `/usr/local`:\n\n```sh\nsudo make install\n```\n\nTo install without `sudo` to your home directory:\n\n```sh\nmake install PREFIX=$HOME/.local\n```\n\nIf `$HOME/.local/bin` is not in your `PATH`, add this to your shell startup file (e.g. `~/.bashrc` or `~/.zshrc`):\n\n```\nexport PATH=\"$HOME/.local/bin:$PATH\"\n```\n\nThe web application has a PHP backend. You can deploy it on any PHP-capable\nserver by copying the [`web`](./web) folder somewhere below the document root\nof your server, e.g. `/var/www/html` for Apache on Debian/Ubuntu:\n\n```\nsudo -u www-data cp -r web /var/www/html/ocr-fileformat\n```\n\nIn this example the GUI would be available under [http://localhost/ocr-fileformat/](http://localhost/ocr-fileformat/).\n\n## Usage\n\nThe project offers two functionalities, which can be accessed via a command line\nscript (CLI), using a web interface (GUI) or in your own tools (API).\n\n### CLI\n\n* [`ocr-transform`](./bin/ocr-transform.sh): Transformation of OCR output between OCR formats\n* [`ocr-validate`](./bin/ocr-validate.sh): Validation of OCR output against OCR format schemas\n\n### GUI\n\nThe web interface is for testing validation and transformations. 
You can upload\na file or select an input file by URL.\n\n### API\n\n* [`$PREFIX/share/ocr-fileformat/xslt`](./xslt) - XSLT stylesheets\n* [`$PREFIX/share/ocr-fileformat/xsd`](./xsd) - XSD schemas\n* [`$PREFIX/share/ocr-fileformat/script/transform`](./script/transform) - Transformation scripts\n* [`$PREFIX/share/ocr-fileformat/script/validate`](./script/validate) - Validation scripts\n\n## Transformation\n\n### Transformation CLI\n\n```\nUsage: ocr-transform [-dl] <input-fmt> <output-fmt> [<input> [<output>]] [-- <saxon_opts>]\n```\n\nFor example, you can transform an ALTO XML to a hOCR file with:\n\n```sh\nocr-transform alto hocr sample.xml sample.hocr\n```\n\nOr convert from ALTO XML (version 2.1) to hOCR with:\n\n```sh\nocr-transform alto2.1 hocr sample.alto sample.hocr\n```\n\nYou can also pass arguments directly to the Saxon CLI by passing them after a double dash (`--`). For example, to set the `foo` parameter to `bar`:\n\n```sh\nocr-transform alto hocr sample.xml sample.hocr -- foo=bar\n```\n\nTry `ocr-transform -h` to get an overview:\n\n<!-- BEGIN-EVAL echo '```';./bin/ocr-transform.sh -h 2>&1;echo '```'  -->\n```\nUsage:\nocr-transform [OPTIONS] <from> <to> [<infile> [<outfile>]] [-- <script-args>]\nocr-transform [OPTIONS] <from> <to> --help-args Show script-args, and exit\nocr-transform [OPTIONS] -h|--help               Show this help, and exit\nocr-transform [OPTIONS] -v|--version            Show version, and exit\nocr-transform [OPTIONS] -L|--list               List available from/to, and exit\n\n    Options:\n        --debug   -d     Increase debug level by 1, can be repeated\n\n    Transformations:\n        abbyy hocr\n        abbyy page\n        alto hocr\n        alto page\n        alto text\n        alto2.0 alto3.0\n        alto2.0 alto3.1\n        alto2.0 hocr\n        alto2.1 alto3.0\n        alto2.1 alto3.1\n        alto2.1 hocr\n        alto4.2 alto2.1\n        gcv alto\n        gcv hocr\n        gcv page\n        hocr alto\n        hocr 
alto2.0\n        hocr alto2.1\n        hocr alto3.0\n        hocr alto4.0\n        hocr page\n        hocr tei\n        hocr text\n        mybib alto3.0\n        page alto\n        page alto_legacy\n        page hocr\n        page page2019\n        page text\n        tei hocr\n        textract page\n```\n\n<!-- END-EVAL -->\n\n### Transformation GUI\n\nSelect the `Transform` menu option. Choose a URL, an input and an output\nformat. Click `Transform`.\n\n### Transformation API\n\nThe stylesheets are installed in `$PREFIX/share/ocr-fileformat/xslt` and can be\nused directly in your scripts and software. You will need to use an XSLT 2.0\ncapable stylesheet transformer.\n\n### Supported Transformations\n\n| From ╲ To           | hOCR | ALTO | PAGEXML | TEI | Text |\n| ---:                | ---  | ---  | ---     | --- | ---  |\n| hOCR                | -    | ✓    | ✓       | ✓   | ✓    |\n| ALTO                | ✓    | ✓    | ✓       | -   | ✓    |\n| PAGEXML             | ✓    | ✓    | ✓       | -   | ✓    |\n| ABBYY FineReader    | ✓    | -    | ✓       | -   | -    |\n| Google Cloud Vision | ✓    | ✓    | ✓       | -   | -    |\n| Amazon AWS Textract | -    | -    | ✓       | -   | -    |\n| TEI                 | ✓    | -    | -       | -   | -    |\n\n## Validation\n\n<!-- BEGIN-EVAL echo '```';./bin/ocr-validate.sh -h 2>&1;echo '```'  -->\n```\nUsage:\nocr-validate [OPTIONS] <schema> <file> [<resultsFile>]\nocr-validate [OPTIONS] -h|--help       Show this help, and exit\nocr-validate [OPTIONS] -v|--version    Show version, and exit\nocr-validate [OPTIONS] -L|--list       List available schemas, and exit\n\n    Options:\n        --debug   -d     Increase debug level by 1, can be repeated\n\n    Schemas:\n        hocr\n        alto-1-0 alto-1-1 alto-1-2 alto-1-3 alto-1-4 alto-2-0 alto-2-1 alto-2-2-draft alto-3-0 alto-3-1 alto-3-2-draft alto-4-0 alto-4-1 alto-4-2 alto-4-3\n        abbyy-6-schema-v1 abbyy-8-schema-v2 abbyy-9-schema-v1 abbyy-10-schema-v1\n        
page-2009-03-16 page-2010-01-12 page-2010-03-19 page-2013-07-15 page-2016-07-15 page-2017-07-15 page-2018-07-15 page-2019-07-15\n```\n\n<!-- END-EVAL -->\n\n### Validation CLI\n\nFor example, to validate an XML file against the ALTO 3.1 schema:\n\n```\nocr-validate alto-3-1 myFile.alto\n```\n\n### Validation GUI\n\nSelect the `Validate` menu option. Choose a URL and a schema. Click `Validate`.\n\n### Validation API\n\nThe XSD files are installed under `$PREFIX/share/ocr-fileformat/xsd`.\n\n### Supported Validation Formats\n\n|            | hOCR | ALTO | PAGEXML | FineReader | Google Cloud Vision | Amazon AWS Textract |\n| ---:       | ---  | ---  | ---     | ---        | ---                 | ---                 |\n| Validation | ✓    | ✓    | ✓       | ✓          | -                   | -                   |\n\n\n## License\n\nThis is free software. You may use it under the terms of the [MIT License](LICENSE).\n\nDuring the installation process several projects are included (in [`./vendor`](./vendor)). 
These projects have different licenses:\n\n* [Saxon HE 9.7](http://saxon.sourceforge.net/#F9.7HE), [`MPL`](https://www.mozilla.org/MPL/).\n* [ALTOXML schema](https://github.com/altoxml/schema), [\"Open Source\"](https://github.com/altoxml/schema/issues/37#issuecomment-218730230) for ALTO <= 3.1, [`CC BY SA 4.0`](https://creativecommons.org/licenses/by-sa/4.0/legalcode) since ALTO 4.0\n* [PAGE schemas](http://www.primaresearch.org/schema/PAGE/gts/pagecontent/), `?`\n* [xsd-validator](https://github.com/kba/xsd-validator) by Adrian Mouat [@amouat](https://github.com/amouat), `Apache 2.0`\n* ABBYY FineReader XSD, `?`\n* [hOCR-to-ALTO](https://github.com/filak/hOCR-to-ALTO) by Filip Kriz [@filak](https://github.com/filak), [`MIT`](https://github.com/filak/hOCR-to-ALTO/blob/master/LICENSE.txt)\n* [hocr-spec](https://github.com/kba/hocr-spec-python) by Konstantin Baierer [@kba](https://github.com/kba), [`MIT`](https://github.com/kba/hocr-spec-python/blob/master/LICENSE)\n* [gcv2hocr](https://github.com/dinosauria123/gcv2hocr) by Endo Michiaki, [`CC BY 4.0`](https://creativecommons.org/licenses/by/4.0/legalcode)\n* [format-converters](https://github.com/OCR-D/format-converters) by OCR-D, [`Apache 2.0`](https://github.com/OCR-D/format-converters/blob/master/LICENSE)\n* [prima-page-converter](https://github.com/PRImA-Research-Lab/prima-page-converter/) by PRImA Research Lab , [`Apache 2.0`](https://github.com/PRImA-Research-Lab/prima-page-converter/blob/master/LICENSE)\n* [page-to-alto](https://github.com/kba/page-to-alto/) by Konstantin Baierer @kba, [`Apache 2.0`](https://github.com/kba/page-to-alto/blob/master/LICENSE)\n* [textract2page](https://github.com/slub/textract2page/) by Arne Rümmler @rue-a, [`Apache 2.0`](https://github.com/slub/textract2page/blob/master/LICENSE)\n"
  },
  {
    "path": "bin/ocr-transform.sh",
    "content": "#!/usr/bin/env bash\n\n# Default to the parent dir of this script. Overwritten by `make install`\nSHAREDIR=\"$(readlink -f \"$(dirname \"$(readlink -f \"$0\")\")/..\")\"\nsource \"$SHAREDIR/lib.sh\"\n\n#{{{ show_usage ()\nshow_usage () {\n    [[ \"$#\" -gt 0 ]] && logerr \"$@\"\n\n    echo >&2 \"Usage:\n${0##*/} [OPTIONS] <from> <to> [<infile> [<outfile>]] [-- <script-args>]\n${0##*/} [OPTIONS] <from> <to> --help-args Show script-args, and exit\n${0##*/} [OPTIONS] -h|--help               Show this help, and exit\n${0##*/} [OPTIONS] -v|--version            Show version, and exit\n${0##*/} [OPTIONS] -L|--list               List available from/to, and exit\n\n    Options:\n        --debug   -d     Increase debug level by 1, can be repeated\n\n\"\n    echo >&2 -e \"\\n${INDENT}Transformations:\"\n    show_transformations|sed \"s/^/${INDENT}${INDENT}/\"\n\n    [[ \"$#\" -gt 0 ]] && exit 1\n}\n#}}}\n\n#{{{ show_version ()\nshow_version () {\n    echo \"${0##*/} VERSION\"\n}\n#}}}\n\n#{{{ main ()\nmain () {\n    # debug option -d -d to print all commands to the terminal\n    if (( DEBUG > 1 ));then\n        set -x\n    fi\n\n    local from=\"$1\" to=\"$2\" infile='-' outfile='-' transformer\n    shift 2\n\n    # Validate parameters\n    if [[ -z \"$from\" ]];then\n        show_usage \"Must set 'from' parameter\"\n    elif [[ -z \"$to\" ]];then\n        show_usage \"Must set 'to' parameter\"\n    elif [[ -z \"${OCR_TRANSFORMATIONS[$from]}\" ]];then\n        show_usage \"No mapping from '$from'\"\n    else\n        declare -a possible=(${OCR_TRANSFORMATIONS[$from]})\n        if ! 
in_array \"$to\" \"${possible[@]}\";then\n            show_usage \"No mapping from '$from' to '$to'\"\n        fi\n    fi\n    transformer=${OCR_TRANSFORMERS[${from}__${to}]}\n\n    if [[ \"$1\" == '--help-args' ]];then\n        if [[ \"$transformer\" = */gcv__hocr ]];then\n            echo >&2 -e \"${INDENT}Extra arguments: <width> <height>\"\n        elif [[ \"$transformer\" = */page__alto ]];then\n            echo >&2 -e \"${INDENT}page-to-alto options:\"\n            page-to-alto --help|sed '1,/^Options:/d;/--output-file/,$d' >&2\n        elif [[ \"$transformer\" = */textract__page ]];then\n            echo >&2 -e \"${INDENT}textract2page arguments: <image-file>\"\n            echo >&2 -e \"${INDENT}textract2page options:\"\n        else\n            # xsl and other transformers both take arbitrary Saxon options\n            show_saxon_options|sed \"s/^/${INDENT}${INDENT}/\"\n        fi\n        exit 0\n    fi\n\n    declare -a script_args\n\n    # <infile>\n    if [[ \"$1\" == '--' ]];then\n        script_args+=(\"${@:2}\")\n        set --\n    elif [[ -n \"$1\" ]];then\n        infile=\"$1\"\n    fi\n    shift\n\n    # <outfile>\n    if [[ \"$1\" == '--' ]];then\n        script_args+=(\"${@:2}\")\n        set --\n    elif [[ -n \"$1\" ]];then\n        outfile=\"$1\"\n    fi\n    shift;\n\n    # <script-args>\n    if [[ \"$1\" == '--' ]];then\n        script_args+=(\"${@:2}\")\n    fi\n\n    if (( DEBUG > 0 ));then\n        [[ \"$infile\" = '-' ]] && logdebug \"Reading from STDIN\"\n        [[ \"$outfile\" = '-' ]] && logdebug \"Writing to STDOUT\"\n    fi\n\n    # Run it\n    optstate=$(set +o)\n    set -o errexit\n    if [[ \"$transformer\" = *.xsl ]];then\n        script_args=(\"${script_args[@]}\" \"-xsl:$transformer\")\n        script_args=(\"${script_args[@]}\" \"-s:$infile\")\n        [[ \"$outfile\" != '-' ]] &&  script_args=(\"${script_args[@]}\" \"-o:$outfile\")\n        exec_saxon \"${script_args[@]}\"\n    else\n        script_args=(\"$infile\" 
\"$outfile\" \"${script_args[@]}\")\n        source \"$transformer\" \"${script_args[@]}\"\n    fi\n    eval \"$optstate\"\n}\n#}}}\n\nwhile [[ \"$1\" = -* ]]; do\n    case \"$1\" in\n        -d|--debug) let DEBUG+=1 ;;\n        -L|--list) show_transformations ; exit 0 ;;\n        -h|--help) show_usage ; exit 0 ;;\n        -v|--version) show_version ; exit 0 ;;\n        *) logerr \"Unknown option '$1'\" && show_usage && exit 1 ;;\n    esac\n    shift\ndone\n\nif [[ -d \"$SHAREDIR/venv\" ]];then\n    . \"$SHAREDIR/venv/bin/activate\"\nfi\n\nmain \"$@\"\n"
  },
  {
    "path": "bin/ocr-validate.sh",
    "content": "#!/usr/bin/env bash\n\n# Default to the parent dir of this script. Overwritten by `make install`\nSHAREDIR=\"$(readlink \"$(dirname \"$(readlink \"$0\")\")/..\")\"\nsource \"$SHAREDIR/lib.sh\"\n\n#{{{ show_usage ()\nshow_usage () {\n    [[ \"$#\" -gt 0 ]] && logerr \"$@\"\n\n    echo >&2 \"Usage:\n${0##*/} [OPTIONS] <schema> <file> [<resultsFile>]\n${0##*/} [OPTIONS] -h|--help       Show this help, and exit\n${0##*/} [OPTIONS] -v|--version    Show version, and exit\n${0##*/} [OPTIONS] -L|--list       List available schemas, and exit\n\n    Options:\n        --debug   -d     Increase debug level by 1, can be repeated\n\n\"\n    echo >&2 -e \"\\n${INDENT}Schemas:\"\n    show_schemas|sed \"s/^/${INDENT}${INDENT}/\"\n    echo\n\n    [[ \"$#\" -gt 0 ]] && exit 1\n}\n#}}}\n\n#{{{ show_version ()\nshow_version () {\n    echo \"${0##*/} VERSION\"\n}\n#}}}\n\n#{{{ main ()\nmain () {\n    # debug option -d -d to print all commands to the terminal\n    if (( DEBUG > 1 ));then\n        set -x\n    fi\n\n    local schema=\"$1\" file=\"$2\"\n    shift 2\n\n    if [[ -z \"$schema\" ]];then\n        show_usage \"Must set 'schema'\"\n    elif [[ -z \"${OCR_VALIDATORS[$schema]}\" ]];then\n        show_usage \"No such schema '$schema'\"\n    fi\n\n    if [[ -z \"$file\" ]];then\n        show_usage \"Must set 'file'\"\n    fi\n\n    if [[ \"$file\" == \"-\" ]];then\n        ((DEBUG > 1)) && loginfo \"Reading from STDIN\"\n    else \n        file=$(readlink \"$file\")\n        if [[ ! 
-e \"$file\" ]];then\n            show_usage \"No such file: '$file'\"\n        fi\n    fi\n\n    if [[ \"${OCR_VALIDATORS[$schema]}\" = *.xsd ]];then\n        \"exec_xsdv\" \"$schema\" \"$file\"\n    else\n        source \"${OCR_VALIDATORS[$schema]}\" \"$file\"\n    fi\n}\n#}}}\n\nwhile [[ \"$1\" = -* ]]; do\n    case \"$1\" in\n        --debug|-d) let DEBUG+=1 ;;\n        --list|-L) show_schemas|sed -e 's/\\s*$//' -e 's/ \\+/\\n/g' ; exit 0 ;;\n        --help|-h) show_usage ; exit 0 ;;\n        --version|-v) show_version ; exit 0 ;;\n        *) logerr \"Unknown option '$1'\" && show_usage && exit 1 ;;\n    esac\n    shift\ndone\n\nif [[ -d \"$SHAREDIR/venv\" ]];then\n    . \"$SHAREDIR/venv/bin/activate\"\nfi\n\nmain \"$@\"\n"
  },
  {
    "path": "docker.config.php",
    "content": "<?php\n$config['ocr-validate'] = '/usr/local/bin/ocr-validate';\n$config['ocr-transform'] = '/usr/local/bin/ocr-transform';\n"
  },
  {
    "path": "example/.gitignore",
    "content": "wetzel_reisebegleiter_1901_0021*.alto\nwetzel_reisebegleiter_1901_0021*.hocr\nwetzel_reisebegleiter_1901_0021*.page\n/out\n"
  },
  {
    "path": "example/Makefile",
    "content": "# https://media.dwds.de/dta/images/wetzel_reisebegleiter_1901/wetzel_reisebegleiter_1901_0021_800px.jpg\nBOOK=wetzel_reisebegleiter_1901\nPAGE=0021\nBASENAME=$(BOOK)_$(PAGE)\n\nDEBIAN_PACKAGES = libxml2-utils tesseract-ocr tesseract-ocr-script-frak wget dwdiff\nXMLLINT = xmllint --format\nOCR_TRANSFORM = ../bin/ocr-transform.sh\nifdef HOMEBREW_PREFIX\nTESSERACT = $(HOMEBREW_PREFIX)/bin/tesseract -l Fraktur\nelse\nTESSERACT = /usr/bin/tesseract -l Fraktur\nendif\nWGET = wget\nRM = rm -f\nDWDIFF = dwdiff -p -l -c\nPAGER = less -R\nAPT_GET = sudo apt-get -y\n\n.PHONY: roundtrip\nroundtrip: $(BASENAME).roundtrip.hocr $(BASENAME).alto.page\n\n.PHONY: deps\ndeps:\n\t$(APT_GET) install $(DEBIAN_PACKAGES)\n\n.PHONY: diff\ndiff: $(BASENAME).roundtrip.hocr $(BASENAME).hocr\n\t$(DWDIFF) $^ || exit 0\n\n.PHONY: idiff\nidiff: $(BASENAME).roundtrip.hocr $(BASENAME).hocr\n\t$(DWDIFF) $^ | $(PAGER)\n\n$(BASENAME)_800px.jpg:\n\t$(WGET) http://media.dwds.de/dta/images/$(BOOK)/$(BASENAME)_800px.jpg\n\n$(BASENAME).hocr : $(BASENAME)_800px.jpg\n\t$(TESSERACT) $< stdout hocr | $(XMLLINT) - > $@\n\n\n$(BASENAME).alto : $(BASENAME).hocr\n\t$(OCR_TRANSFORM) hocr alto2.0 $< | $(XMLLINT) - > $@\n\n$(BASENAME).alto.page : $(BASENAME).alto\n\t$(OCR_TRANSFORM) alto page $< | $(XMLLINT) - > $@\n\n$(BASENAME).alto.page.alto : $(BASENAME).alto.page\n\t$(OCR_TRANSFORM) page alto $< | $(XMLLINT) - > $@\n\n$(BASENAME).roundtrip.hocr : $(BASENAME).alto\n\t$(OCR_TRANSFORM) alto hocr $< | $(XMLLINT) - > $@\n\nclean:\n\t$(RM) $(BASENAME)*.hocr $(BASENAME)*.alto\n"
  },
  {
    "path": "example/README.md",
    "content": "# Testing transformations\n\nInstall dependencies. For Debian/Ubuntu:\n\n    make deps\n\nRun a roundtrip example:\n\n    make roundtrip\n\nThis will:\n\n* download image (`-> x.jpg`)\n* OCR the image (`-> x.hocr`)\n* hOCR -> ALTO 2.0 (`-> x.alto`)\n* ALTO 2.0 -> hOCR (`-> x.roundtrip.hocr`)\n\nTo see the information lost/added:\n\n    make diff\n\nThis will compare `x.hocr` to `x.roundtrip.hocr` using `dwdiff` and open the result in a pager.\n\n## License\n\nThe example data is from the [Deutsches Textarchiv](https://www.deutschestextarchiv.de/book/show/wetzel_reisebegleiter_1901) project, data is licensed CC BY-NC 3.0.\n"
  },
  {
    "path": "lib.sh",
    "content": "#!/usr/bin/env bash\n\n#{{{ Logging\nif [[ -n \"$COLORTERM\" || \"$TERM\" = *color* || \"$TERM\" = xterm* ]];then\n    COLOR_ERROR=\"\\033[1;31m\"\n    COLOR_INFO=\"\\033[1;32m\"\n    COLOR_DEBUG=\"\\033[1;34m\"\n    COLOR_DEFAULT=\"\\033[0m\"\nfi\n# shellcheck disable=SC2048\nlogerr () {\n    local IFS=$'\\n'\n    for line in $*;do\n        echo -e \"${COLOR_DEFAULT}[${COLOR_ERROR}ERROR${COLOR_DEFAULT}] $line\" >&2\n    done\n}\nloginfo () { echo -e \"${COLOR_DEFAULT}[${COLOR_INFO}INFO${COLOR_DEFAULT}] $*\" >&2; }\nlogdebug () { echo -e \"${COLOR_DEFAULT}[${COLOR_DEBUG}DEBUG${COLOR_DEFAULT}] $*\" >&2; }\n#}}}\n\nif [[ -z \"$SHAREDIR\" || ! -d \"$SHAREDIR\" ]];then\n    logerr \"Set \\$SHAREDIR before sourcing $0\"\n    exit 1\nfi\n\n#{{{ utils (in_array)\n# utility function to find the first pos param in the rest pos params\nin_array () {\n    local e\n    for e in \"${@:2}\"; do [[ \"$e\" == \"$1\" ]] && return 0; done\n    return 1\n}\n#}}}\n\n#{{{ Global vars\nexport DEBUG=0\nexport INDENT=\"    \"\n# Mapping 'fmt' -> 'fmt2 fmt3 fmt4'\ndeclare -Ax OCR_TRANSFORMATIONS=()\n# Mapping 'fmt' -> '/path-to-xslt-or-transform-script'\ndeclare -Ax OCR_TRANSFORMERS=()\n# Mapping 'fmt' -> '/path-to-xsd-or-validate-script'\ndeclare -Ax OCR_VALIDATORS=()\n#}}}\n\n#{{{ Set up validation and transformation formats\n# setup_transformations ()\nsetup_transformations () {\n    declare -a transformers=($(\n        find -L \"$SHAREDIR/xslt\" \"$SHAREDIR/script/transform\" \\\n            ! 
-type d \\( -name '*.xsl' -or -perm -005 \\) \\\n        ))\n    local in_fmt out_fmt\n    for path in \"${transformers[@]}\";do\n        fmt=${path##*/}\n        fmt=${fmt%.*}\n        OCR_TRANSFORMERS[$fmt]=\"$path\"\n        in_fmt=${fmt%%__*}\n        out_fmt=${fmt##*__}\n        if [[ -z \"${OCR_TRANSFORMATIONS[$in_fmt]}\" ]];then\n            OCR_TRANSFORMATIONS[$in_fmt]=\"$out_fmt\"\n        else\n            OCR_TRANSFORMATIONS[$in_fmt]+=\" $out_fmt\"\n        fi\n    done\n}\n\n# setup_validations ()\nsetup_validations () {\n    declare -a validators=($(\n        find -L \"$SHAREDIR/xsd\" \"$SHAREDIR/script/validate\" \\\n            ! -type d \\( -name '*.xsd' -or -perm -005 \\) \\\n            |sort))\n    local path fmt\n    for path in \"${validators[@]}\";do\n        fmt=${path##*/}\n        fmt=${fmt%.*}\n        OCR_VALIDATORS[$fmt]=\"$path\"\n    done\n}\n\nsetup () {\n    setup_transformations\n    setup_validations\n}\nsetup\n#}}}\n\n#{{{ List transformations, validations, saxon options\n# show_schemas ()\nshow_schemas() {\n    local schema schemagroup\n    declare -a sorted=($(IFS=$'\\n'; echo \"${!OCR_VALIDATORS[*]}\"|sort -t- -nk2  -k1))\n    for schema in \"${sorted[@]}\";do\n        [[ -n \"$schemagroup\" && \"$schemagroup\" != ${schema%%-*} ]] && echo\n        echo -n \"$schema \"\n        schemagroup=${schema%%-*}\n    done\n}\n\n# show_transformations ()\nshow_transformations() {\n    local in_fmt out_fmt\n    for in_fmt in \"${!OCR_TRANSFORMATIONS[@]}\";do\n        declare -a out_fmts=(${OCR_TRANSFORMATIONS[$in_fmt]})\n        for out_fmt in \"${out_fmts[@]}\";do\n            echo \"${in_fmt} ${out_fmt}\";\n        done\n    done|sort\n}\n\n# show_saxon_options ()\nshow_saxon_options () {\n    exec_saxon -t 2>&1|sed -e '0,/No source file/ d' -e '/Format:/ d'\n}\n#}}}\n\n#{{{ run saxon / xsd-validator (xsdv.sh)\n# exec_saxon ()\nexec_saxon() {\n    (( DEBUG > 0 )) && loginfo Executing \"java -jar $SHAREDIR/vendor/saxon.jar\" \"$@\"\n    
(( DEBUG > 1 )) && SAXON_ARGS+=('-t')\n    java -jar \"$SHAREDIR/vendor/saxon.jar\" \"$@\"\n}\n\n# exec_xsdv ()\nexec_xsdv() {\n    local schema=\"$1\" file=\"$2\"\n    cd \"$SHAREDIR/vendor/xsd-validator\"\n    if ((DEBUG > 0));then\n        loginfo \"PWD: '$PWD'\"\n        loginfo \"./xsdv.sh '$SHAREDIR/xsd/${schema}.xsd' '$file'\"\n    fi\n    ./xsdv.sh \"$SHAREDIR/xsd/${schema}.xsd\" \"$file\"\n}\n#}}}\n"
  },
  {
    "path": "script/transform/README.md",
    "content": "Scripts should be named `<from>__<to>`, e.g. `hocr-1.0__abbby-10`.\n\nWill be called as\n\n```\n/script/transform/<from>__<to> <infile> <outfile> <additional-args>\n```\n\nBoth `<infile>` and `<outfile>` can be `-`, in which case input should be read\nfrom STDIN or written to STDOUT.\n"
  },
  {
    "path": "script/transform/alto__page",
    "content": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nJAR=\"$VENDORDIR/JPageConverter/PageConverter.jar\"\nINFILE=\"$1\"\nOUTFILE=\"$2\"\nARGUMENT=\"$3\"\n\nif [[ \"$1\" = \"-\" ]]; then\n    INFILE=\"$(mktemp)\"\n    cat >\"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    OUTFILE=\"$(mktemp)\"\nfi\n\njava -jar \"$JAR\" -neg-coords toZero -source-xml \"$INFILE\" -target-xml \"$OUTFILE\" -convert-to LATEST 2>&1\n\nif [[ \"$1\" = \"-\" ]]; then\n    rm \"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    if [[ -z \"$ARGUMENT\" ]]; then\n        cat \"$OUTFILE\"\n    else\n        java -cp \"$VENDORDIR/saxon.jar\" net.sf.saxon.Query -s:\"$OUTFILE\" -qs:/ \"$ARGUMENT\"\n    fi\n    rm \"$OUTFILE\"\nfi\n"
  },
  {
    "path": "script/transform/gcv__alto",
    "content": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nJAR=\"$VENDORDIR/JPageConverter/PageConverter.jar\"\nINFILE=\"$1\"\nOUTFILE=\"$2\"\nARGUMENT=\"$3\"\n\nif [[ \"$1\" = \"-\" ]]; then\n    INFILE=\"$(mktemp)\"\n    cat >\"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    OUTFILE=\"$(mktemp)\"\nfi\n\njava -jar \"$JAR\" -neg-coords toZero -source-json \"$INFILE\" -target-xml \"$OUTFILE\" -convert-to ALTO 2>&1\n\nif [[ \"$1\" = \"-\" ]]; then\n    rm \"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    if [[ -z \"$ARGUMENT\" ]]; then\n        cat \"$OUTFILE\"\n    else\n        java -cp \"$VENDORDIR/saxon.jar\" net.sf.saxon.Query -s:\"$OUTFILE\" -qs:/ \"$ARGUMENT\"\n    fi\n    rm \"$OUTFILE\"\nfi\n"
  },
  {
    "path": "script/transform/gcv__hocr",
    "content": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nVENDORSCRIPT=\"$VENDORDIR/gcv2hocr/gcv2hocr\"\nINFILE=\"$1\"\nOUTFILE=\"$2\"\n#TODO\nWIDTH=${3:-2000}\nHEIGHT=${4:-2000}\n\nif [[ \"$1\" = \"-\" ]]; then\n    INFILE=\"$(mktemp)\"\n    cat >\"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    OUTFILE=\"$(mktemp)\"\nfi\n\n\"$VENDORSCRIPT\" \"$INFILE\" \"$OUTFILE\" \"$WIDTH\" \"$HEIGHT\"\n\nif [[ \"$1\" = \"-\" ]]; then\n    rm \"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    cat \"$OUTFILE\"\n    rm \"$OUTFILE\"\nfi\n\nrm preout1.txt preout2.txt\n"
  },
  {
    "path": "script/transform/gcv__page",
    "content": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nJAR=\"$VENDORDIR/JPageConverter/PageConverter.jar\"\nINFILE=\"$1\"\nOUTFILE=\"$2\"\nARGUMENT=\"$3\"\n\nif [[ \"$1\" = \"-\" ]]; then\n    INFILE=\"$(mktemp)\"\n    cat >\"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    OUTFILE=\"$(mktemp)\"\nfi\n\njava -jar \"$JAR\" -neg-coords toZero -source-json \"$INFILE\" -target-xml \"$OUTFILE\" -convert-to LATEST 2>&1\n\nif [[ \"$1\" = \"-\" ]]; then\n    rm \"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    if [[ -z \"$ARGUMENT\" ]]; then\n        cat \"$OUTFILE\"\n    else\n        java -cp \"$VENDORDIR/saxon.jar\" net.sf.saxon.Query -s:\"$OUTFILE\" -qs:/ \"$ARGUMENT\"\n    fi\n    rm \"$OUTFILE\"\nfi\n"
  },
  {
    "path": "script/transform/page__alto",
    "content": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nINFILE=\"$1\"\nOUTFILE=\"$2\"\nARGUMENTS=(\"${@:3}\")\n\nif [[ \"$1\" = \"-\" ]]; then\n    INFILE=\"$(mktemp)\"\n    cat >\"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    OUTFILE=\"$(mktemp)\"\nfi\n\npage-to-alto \"${ARGUMENTS[@]}\" -O \"$OUTFILE\" \"$INFILE\" ; retval=\"$?\"\n\nif [[ \"$1\" = \"-\" ]]; then\n    rm \"$INFILE\"\nfi\n\nif (( retval > 0 )); then\n    rm \"$OUTFILE\"\n    exit $retval\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    cat \"$OUTFILE\"\n    rm \"$OUTFILE\"\nfi\n"
  },
  {
    "path": "script/transform/page__alto_legacy",
    "content": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nJAR=\"$VENDORDIR/JPageConverter/PageConverter.jar\"\nINFILE=\"$1\"\nOUTFILE=\"$2\"\nARGUMENT=\"$3\"\n\nif [[ \"$1\" = \"-\" ]]; then\n    INFILE=\"$(mktemp)\"\n    cat >\"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    OUTFILE=\"$(mktemp)\"\nfi\n\njava -jar \"$JAR\" -neg-coords toZero -source-xml \"$INFILE\" -target-xml \"$OUTFILE\" -convert-to ALTO 2>&1\n\nif [[ \"$1\" = \"-\" ]]; then\n    rm \"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    if [[ -z \"$ARGUMENT\" ]]; then\n        cat \"$OUTFILE\"\n    else\n        java -cp \"$VENDORDIR/saxon.jar\" net.sf.saxon.Query -s:\"$OUTFILE\" -qs:/ \"$ARGUMENT\"\n    fi\n    rm \"$OUTFILE\"\nfi\n"
  },
  {
    "path": "script/transform/textract__page",
    "content": "#!/bin/bash\n\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENDORDIR=\"$(cd $SCRIPTDIR/../../vendor/; pwd)\"\nINFILE=\"$1\"\nOUTFILE=\"$2\"\nARGUMENTS=(\"${@:3}\")\n\nif [[ \"$1\" = \"-\" ]]; then\n    INFILE=\"$(mktemp)\"\n    cat >\"$INFILE\"\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    OUTFILE=\"$(mktemp)\"\nfi\n\ntextract2page \"${ARGUMENTS[@]:1}\" -O \"$OUTFILE\" \"$INFILE\" \"${ARGUMENTS[0]}\"; retval=\"$?\"\n\nif [[ \"$1\" = \"-\" ]]; then\n    rm \"$INFILE\"\nfi\n\nif (( retval > 0 ));then\n    rm \"$OUTFILE\"\n    exit $retval\nfi\n\nif [[ \"$2\" = \"-\" ]]; then\n    cat \"$OUTFILE\"\n    rm \"$OUTFILE\"\nfi\n"
  },
  {
    "path": "script/validate/README.md",
    "content": "Scripts here will be called by `ocr-validate`.\n\nName should be the format and version, lowercase letters, numbers and dash only.\n"
  },
  {
    "path": "script/validate/hocr",
    "content": "#!/bin/bash\nSCRIPTDIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nHOCR_SPEC=\"$SCRIPTDIR/../../vendor/hocr-spec-python/hocr-spec\"\n\nformat=\"xml\"\nif [[ \"$TERM\" = *\"color\"* ]];then\n    format=\"ansi\"\nfi\n\npython3 \"$HOCR_SPEC\" -f \"$format\" -p relaxed --filename \"STDIN\" \"$1\"\n"
  },
  {
    "path": "vendor/Makefile",
    "content": "MKDIR = mkdir -p\nRM = rm -rfv\nUNZIP = unzip -o\nWGET = wget --progress=bar:force --no-verbose\nPIP = pip3\n\nSAXON_HE_VERSION_MAJOR = 11\nSAXON_HE_VERSION_MINOR = 2\nSAXON_HE_ZIP = SaxonHE$(SAXON_HE_VERSION_MAJOR)-$(SAXON_HE_VERSION_MINOR)J.zip\nSAXON_HE_URL = https://netcologne.dl.sourceforge.net/project/saxon/Saxon-HE/$(SAXON_HE_VERSION_MAJOR)/Java/$(SAXON_HE_ZIP)\nSAXON_HE_JAR = saxon-he-$(SAXON_HE_VERSION_MAJOR).$(SAXON_HE_VERSION_MINOR).jar\n\nPAGE_SCHEMA_REPO = page-schema\nPAGE_SCHEMA_VERSIONS = 2009-03-16 2010-01-12 2010-03-19 2013-07-15 2016-07-15 2017-07-15 2018-07-15 2019-07-15\nPAGE_SCHEMA_BASE_URL = https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/PAGE-release/gts/pagecontent\n\nABBYY_SCHEMA_REPO = abbyy-schema\nABBYY_SCHEMA_BASE_URL = https://fr7.abbyy.com/FineReader_xml/FineReader\nABBYY_SCHEMA_VERSIONS = 6-schema-v1 8-schema-v2 9-schema-v1 10-schema-v1\n\nALTO2PAGE_VERSION_MAJOR_MINOR = 1.5\nALTO2PAGE_VERSION = $(ALTO2PAGE_VERSION_MAJOR_MINOR).06\nALTO2PAGE_ZIP = JPageConverter.zip\nALTO2PAGE_URL = https://github.com/UB-Mannheim/prima-page-converter/releases/download/$(ALTO2PAGE_VERSION)/JPageConverter_$(ALTO2PAGE_VERSION).zip\nALTO2PAGE_DIR = JPageConverter\n\n# {{{\n# SAXON_BROWSER_VERSION = 1.1\n# SAXON_BROWSER_ZIP = Saxon-CE_$(SAXON_BROWSER_VERSION).zip\n# SAXON_BROWSER_JS =  TODO\n# SAXON_BROWSER_URL = http://www.saxonica.com/ce/download/$(SAXON_BROWSER_ZIP)\n\n# $(SAXON_BROWSER_JS): $(SAXON_BROWSER_ZIP)\n\n# $(SAXON_BROWSER_ZIP):\n#     wget -O '$@' '$(SAXON_BROWSER_URL)'\n#}}}\n\n.PHONY: all check $(PAGE_SCHEMA_REPO) $(ABBYY_SCHEMA_REPO) gcv2hocr page-to-alto textract2page format-converters\n\nall:\\\n\t$(PAGE_SCHEMA_REPO)\\\n\t$(ABBYY_SCHEMA_REPO)\\\n\tgcv2hocr \\\n\tsaxon.jar \\\n\t$(ALTO2PAGE_DIR) \\\n\tpage-to-alto \\\n\ttextract2page \\\n\tformat-converters\n\nclean:\n\t$(RM) $(SAXON_HE_JAR) saxon.jar\n\t$(RM) $(SAXON_HE_ZIP)\n\t$(RM) $(PAGE_SCHEMA_REPO)\n\t$(RM) $(ALTO2PAGE_DIR)\n\t$(RM) 
$(ALTO2PAGE_ZIP)\n\ncheck:\n\t@which wget >/dev/null || (echo \"Missing wget. Please install package wget.\" && exit 1)\n\t@which unzip >/dev/null || (echo \"Missing unzip. Please install package unzip.\" && exit 1)\n\n$(ABBYY_SCHEMA_REPO):\n\t@$(MKDIR) \"$@\" && cd \"$@\" && \\\n\t\tfor version in $(ABBYY_SCHEMA_VERSIONS);do \\\n\t\t\txsd=abbyy-$$version.xsd; if [ ! -e $$xsd ];then \\\n\t\t\t\t$(WGET) -O $$xsd $(ABBYY_SCHEMA_BASE_URL)$$version.xml; \\\n\t\t\tfi; \\\n\t\tdone;\n\n$(PAGE_SCHEMA_REPO):\n\t@$(MKDIR) \"$@\" && cd \"$@\" && \\\n\t\tfor version in $(PAGE_SCHEMA_VERSIONS);do \\\n\t\t\txsd=page-$$version.xsd; if [ ! -e $$xsd ];then \\\n\t\t\t\t$(WGET) -O $$xsd $(PAGE_SCHEMA_BASE_URL)/$$version/pagecontent.xsd; \\\n\t\t\tfi; \\\n\t\tdone;\n\nsaxon.jar: $(SAXON_HE_JAR)\n\tln -sf \"$<\" \"$@\"\n\n$(SAXON_HE_JAR): $(SAXON_HE_ZIP)\n\t$(UNZIP) \"$<\"\n\n$(SAXON_HE_ZIP):\n\t$(WGET) -O \"$@\" \"$(SAXON_HE_URL)\"\n\ngcv2hocr:\n\t$(MAKE) -C $@\n\n$(ALTO2PAGE_ZIP):\n\t$(WGET) -O \"$@\" \"$(ALTO2PAGE_URL)\"\n\n$(ALTO2PAGE_DIR): $(ALTO2PAGE_ZIP)\n\t$(UNZIP) \"$<\"\n\trm -rf \"$@\"\n\tmv \"JPageConverter $(ALTO2PAGE_VERSION)\" \"$@\"\n\npage-to-alto:\n\tcd \"$@\"; $(PIP) install .\n\ntextract2page:\n\tcd \"$@\"; $(PIP) install .\n\nformat-converters:\n\tcd \"$@\"; $(PIP) install .\n"
  },
  {
    "path": "web/config.php",
    "content": "<?php\nif (!defined('IncludingScript')) {\n    die('Direct access not permitted');\n}\n\n// We don't want ANSI coloring.\nputenv('TERM=dumb');\n\n$config = [\n    'ocr-validate' => dirname(__FILE__) . '/../bin/ocr-validate.sh',\n    'ocr-transform' => dirname(__FILE__) . '/../bin/ocr-transform.sh',\n    'formats' => [\n        'transform' => [],\n        'validate' => [],\n    ],\n];\n\n$local_settings = dirname(__FILE__) . '/config.local.php';\nif (file_exists($local_settings) === TRUE) {\n    include $local_settings;\n}\n\n/**\n * List of installed transform from-to-tuples.\n * List of installed schemas.\n */\nfunction buildFormatList()\n{\n    global $config;\n    $lines = [];\n    exec($config['ocr-transform'] . ' -L', $lines);\n    foreach ($lines as $line) {\n        $fromto = preg_split(\"/\\s+/\", $line);\n        $from = $fromto[0];\n        $to = $fromto[1];\n        // echo $from, \"\\t\", $to, \"\\n\";\n        if (! array_key_exists($from, $config['formats']['transform'])) {\n            $config['formats']['transform'][$from] = [$to];\n        } else {\n            array_push($config['formats']['transform'][$from], $to);\n        }\n    }\n    exec($config['ocr-validate'] . ' -L', $config['formats']['validate']);\n}\n\nbuildFormatList();\n\nreturn $config;\n"
  },
  {
    "path": "web/index.html",
    "content": "<!doctype HTML>\n<html lang=\"en\">\n  <head>\n    <link href=\"https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css\" rel=\"stylesheet\"/>\n    <link rel=\"stylesheet\" href=\"https://cdnjs.cloudflare.com/ajax/libs/notie/3.2.0/notie.css\"/>\n    <link rel=\"stylesheet\" href=\"https://cdnjs.cloudflare.com/ajax/libs/prism/1.9.0/themes/prism.min.css\">\n    <link rel=\"stylesheet\" href=\"https://cdnjs.cloudflare.com/ajax/libs/prism/1.9.0/themes/prism-coy.css\">\n    <link rel=\"stylesheet\" href=\"https://cdnjs.cloudflare.com/ajax/libs/github-fork-ribbon-css/0.2.0/gh-fork-ribbon.min.css\" />\n    <link rel=\"stylesheet\" href=\"ocr-fileformat.css\"/>\n    <link rel=\"icon\" type=\"image/png\" sizes=\"32x32\" href=\"favicon.png\">\n    <title>OCR Fileformat</title>\n  </head>\n  <body>\n\n    <!-- Static navbar -->\n    <nav class=\"navbar navbar-inverse\">\n      <div class=\"container-fluid\">\n        <div class=\"navbar-header\">\n          <button type=\"button\" class=\"navbar-toggle collapsed\" data-toggle=\"collapse\" data-target=\"#navbar\">\n            <span class=\"sr-only\">Toggle navigation</span>\n            <span class=\"icon-bar\"></span>\n            <span class=\"icon-bar\"></span>\n            <span class=\"icon-bar\"></span>\n          </button>\n          <a class=\"navbar-brand\" href=\"#\">OCR Fileformat</a>\n        </div>\n        <div id=\"navbar\" class=\"navbar-collapse collapse\">\n          <ul class=\"nav navbar-nav\">\n            <li class=\"active\"><a data-toggle=\"tab\" href=\"#transform\">Transform</a></li>\n            <li class=\"\"><a data-toggle=\"tab\" href=\"#validate\">Validate</a></li>\n            <li><a data-toggle=\"tab\" href=\"#help\">Help</a></li>\n          </ul>\n        </div><!--/.nav-collapse -->\n      </div><!--/.container-fluid -->\n    </nav>\n\n    <div class=\"container\">\n\n      <section class=\"tab-content\">\n\n        <div class=\"tab-pane active\" 
id=\"transform\">\n\n          <div class=\"row\">\n\n              <div class=\"col-xs-3\">\n\n                <!-- Nav tabs -->\n                <ul class=\"nav nav-tabs nav-justified\" role=\"tablist\">\n                  <li role=\"presentation\" class=\"active\">\n                    <a role=\"tab\" data-toggle=\"tab\" href=\"#transform-url-tab\" tabindex=-1>URL</a>\n                  </li>\n                  <li role=\"presentation\">\n                    <a role=\"tab\" data-toggle=\"tab\" href=\"#transform-file-tab\" tabindex=-1>Upload</a>\n                  </li>\n                </ul>\n\n              </div>\n          </div>\n\n          <div class=\"row\">\n\n              <div class=\"col-xs-6\">\n\n                <!-- Tab panes -->\n                <div class=\"tab-content input\">\n                  <div role=\"tabpanel\" class=\"tab-pane active\" id=\"transform-url-tab\">\n                    <input id=\"transform-url\"\n                      class=\"form-control\"\n                      type=\"url\"\n                      placeholder=\"http://example.org/xml\"\n                      style=\"width:100%\" />\n                  </div>\n                  <div role=\"tabpanel\" class=\"tab-pane\" id=\"transform-file-tab\">\n                    <input id=\"transform-file\"\n                      class=\"form-control\"\n                      type=\"file\"\n                      style=\"width:100%\" />\n                  </div>\n                </div>\n\n              </div>\n\n              <div class=\"col-xs-6\">\n\n                <div class=\"form-inline formats\" role=\"form\">\n                  <select id=\"transform-from\" class=\"form-control\" disabled>\n                    <option disabled selected value> -- input -- </option>\n                  </select>\n                  <select id=\"transform-to\" class=\"form-control\" disabled>\n                    <option disabled selected value> -- output -- </option>\n                  </select>\n     
             <button id=\"transform-submit\" class=\"btn btn-success\" disabled>\n                    Transform\n                    <span class=\"hidden glyphicon glyphicon-refresh spinning\"></span>\n                  </button>\n                </div>\n\n              </div>\n\n          </div>\n\n          <div class=\"row\">\n            <div class=\"col-xs-12\">\n              <div id=\"transform-result\" class=\"result hidden\">\n                <div class=\"btn-group btn-group-sm\" role=\"group\">\n                  <a class=\"btn btn-default btn-primary download\">\n                    <span class=\"glyphicon glyphicon-download\">&nbsp;</span>Download\n                  </a>\n                </div>\n                <pre>\n                  <code class=\"language-markup\">\n                    <code></code>\n                  </code>\n                </pre>\n              </div>\n            </div>\n          </div>\n\n        </div>\n\n        <div class=\"tab-pane\" id=\"validate\">\n\n          <div class=\"row\">\n\n            <div class=\"col-xs-6\">\n\n              <!-- Tab panes -->\n              <div class=\"tab-content input\">\n                <div role=\"tabpanel\" class=\"tab-pane active\" id=\"validate-url-tab\">\n                  <input id=\"validate-url\"\n                  class=\"form-control\"\n                  type=\"url\"\n                  placeholder=\"http://example.org/xml\"\n                  style=\"width:100%\" />\n                </div>\n                <div role=\"tabpanel\" class=\"tab-pane\" id=\"validate-file-tab\">\n                  <input id=\"validate-file\"\n                  class=\"form-control\"\n                  type=\"file\"\n                  style=\"width:100%\" />\n                </div>\n              </div>\n\n              <!-- Nav tabs -->\n              <ul class=\"nav nav-tabs nav-justified\" role=\"tablist\">\n                <li role=\"presentation\" class=\"active\">\n                  <a 
role=\"tab\" data-toggle=\"tab\" href=\"#validate-url-tab\" tabindex=-1>URL</a>\n                </li>\n                <li role=\"presentation\">\n                  <a role=\"tab\" data-toggle=\"tab\" href=\"#validate-file-tab\" tabindex=-1>Upload</a>\n                </li>\n              </ul>\n\n            </div>\n\n            <div class=\"col-xs-6\">\n\n              <div class=\"form-inline formats\" role=\"form\">\n                <select id=\"validate-format\" class=\"form-control\">\n                  <option disabled selected value> -- format -- </option>\n                </select>\n                <button id=\"validate-submit\" class=\"btn btn-success\" disabled>\n                  Validate\n                  <span class=\"hidden glyphicon glyphicon-refresh spinning\"></span>\n                </button>\n              </div>\n\n            </div>\n\n          </div>\n        <div class=\"row\">\n          <div class=\"col-xs-12\">\n            <div class=\"result hidden\" id=\"validate-result\">\n              <pre>\n                <code class=\"language-markup\">\n                  <code></code>\n                </code>\n              </pre>\n            </div>\n          </div>\n        </div>\n      </div>\n\n      <div class=\"tab-pane\" id=\"help\">\n        <h3>Examples</h3>\n\n        <h4>ABBYY FineReader</h4>\n        <ul>\n          <li><a href=\"https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/Testseiten/abbyy/417576986_0031.xml\">https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/Testseiten/abbyy/417576986_0031.xml</a></li>\n        </ul>\n\n        <h4>hOCR</h4>\n        <ul>\n          <li><a href=\"https://cdn.rawgit.com/kba/ocr-fileformat-samples/master/samples/hocr/1.1/wetzel_reisebegleiter_1901_0021.hocr\">https://cdn.rawgit.com/kba/ocr-fileformat-samples/master/samples/hocr/1.1/wetzel_reisebegleiter_1901_0021.hocr</a></li>\n          <li><a 
href=\"https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/0001-tesseract.hocr\">https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/0001-tesseract.hocr</a></li>\n        </ul>\n\n        <h4>ALTO</h4>\n        <ul>\n          <li><a href=\"http://chroniclingamerica.loc.gov/lccn/sn86069133/1910-10-31/ed-1/seq-1/ocr.xml\">http://chroniclingamerica.loc.gov/lccn/sn86069133/1910-10-31/ed-1/seq-1/ocr.xml</a></li>\n          <li><a href=\"https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/Testseiten/alto/417576986_0031.xml\">https://digi.bib.uni-mannheim.de/~stweil/ocr-praxis/Testseiten/alto/417576986_0031.xml</a></li>\n          <li><a href=\"https://rawgit.com/kba/ocr-fileformat-samples/master/samples/alto/2.0/wetzel_reisebegleiter_1901_0021.alto\">https://rawgit.com/kba/ocr-fileformat-samples/master/samples/alto/2.0/wetzel_reisebegleiter_1901_0021.alto</a></li>\n        </ul>\n\n        <h4>PAGE XML</h4>\n        <ul>\n          <li><a href=\"https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/documentation/example/SimplePage.xml\">https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/documentation/example/SimplePage.xml</a></li>\n          <li><a href=\"https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/pagecontent/examples/aletheiaexamplepage.xml\">https://raw.githubusercontent.com/PRImA-Research-Lab/PAGE-XML/master/pagecontent/examples/aletheiaexamplepage.xml</a></li>\n        </ul>\n\n        <h3>Source Code</h3>\n\n        <a href=\"https://github.com/UB-Mannheim/ocr-fileformat\">https://github.com/UB-Mannheim/ocr-fileformat</a>\n\n      </div>\n\n      </section>\n    </div>\n\n<a class=\"github-fork-ribbon left-bottom\"\n    href=\"https://github.com/UB-Mannheim/ocr-fileformat\"\n    title=\"Fork me on GitHub\">Fork me on GitHub</a>\n\n<script src=\"https://code.jquery.com/jquery-2.2.4.js\" integrity=\"sha256-iT6Q9iMJYuQiMWNd9lDyBUStIq/8PuOW33aOqmvFpqI=\" crossorigin=\"anonymous\"></script>\n<script 
src=\"https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js\"></script>\n<!-- <script src=\"https://cdn.rawgit.com/rndme/download/master/download.min.js\"></script> -->\n<script src=\"https://rawgit.com/notifyjs/notifyjs/master/dist/notify.js\"></script>\n<script src=\"ocr-fileformat.js\"></script>\n<script src=\"https://cdnjs.cloudflare.com/ajax/libs/prism/1.9.0/prism.min.js\" integrity=\"sha512-KnX1xdVSdEHliREuSgUX9kgmit/Wk63n4X3cjoWfISEVsi2Qi2NW88dYyKXCCS8YcFMgzHywK3BIafTfhK2Tig==\" crossorigin=\"anonymous\" referrerpolicy=\"no-referrer\"></script>\n    </body>\n  </html>\n"
  },
  {
    "path": "web/ocr-fileformat.css",
    "content": ".glyphicon.spinning {\n    animation: spin 1s infinite linear;\n    -webkit-animation: spin2 1s infinite linear;\n}\n\n@keyframes spin {\n    from { transform: scale(1) rotate(0deg); }\n    to { transform: scale(1) rotate(360deg); }\n}\n\n@-webkit-keyframes spin2 {\n    from { -webkit-transform: rotate(0deg); }\n    to { -webkit-transform: rotate(360deg); }\n}\n\n.result {\n    max-height: 75vh;\n}\n\n.github-fork-ribbon {\n  position: fixed;\n}\n.github-fork-ribbon.left-bottom:before {\n    background-color: #080;\n}\n"
  },
  {
    "path": "web/ocr-fileformat.js",
    "content": "/* globals $ */\n/* globals Blob */\n/* global Prism */\n\nlet OcrFileformatAPI = function OcrFileformatAPI(endpoint) {\n    this.endpoint = endpoint;\n};\n\nOcrFileformatAPI.prototype.urlFor = function urlFor(action, params) {\n    params || (params = {});\n    let url = this.endpoint + '?do=' + action;\n    for (let paramName of Object.keys(params)) {\n        url += '&' + paramName + '=' + params[paramName];\n    }\n    return url;\n};\n\nOcrFileformatAPI.prototype.updateFormats = function updateFormats(cb) {\n    let self = this;\n    this.request('list', null, null, function(err, formats) {\n        self.formats = formats;\n        cb(err);\n    });\n};\n\nOcrFileformatAPI.prototype.request = function request(endpoint, query, formData, cb) {\n    let ajaxCall = {\n        type: 'GET',\n        url: window.api.urlFor(endpoint, query),\n        success: function(data) {\n            cb(null, data);\n        },\n        error: function(xhr) {\n            cb(xhr.responseText);\n        },\n    };\n    if (formData) {\n        ajaxCall.type = 'POST';\n        ajaxCall.data = formData;\n        ajaxCall.processData =  false;\n        ajaxCall.contentType = false;\n    }\n    $.ajax(ajaxCall);\n};\n\nfunction escapeHTML(str) {\n    return str.\n        replace(/&/g, '&amp;').\n        replace(/</g, '&lt;').\n        replace(/\"/g, '&quot;').\n        replace(/'/g, '&#39;').\n        replace(/\\//g, '&#x2F;').\n        replace(/>/g, '&gt;');\n}\n\nfunction onChangeFormat() {\n    if ($(\"#transform-from option\").length == 1) {\n        Object.keys(window.api.formats.transform).forEach(function(from) {\n            $(\"#transform-from\").append($(\"<option>\").append(from));\n        });\n        $(\"#transform-from\").removeAttr('disabled');\n    }\n    let selectedFrom = $(\"#transform-from\").val();\n    $(\"#transform-to\").attr('disabled', selectedFrom === null);\n    if (selectedFrom) {\n        $(\"#transform-to option\").slice(1).remove();\n   
     window.api.formats.transform[selectedFrom].forEach(function(to) {\n            $(\"#transform-to\").append($(\"<option>\").append(to));\n        });\n    }\n    if ($(\"#validate-format option\").length == 1) {\n        window.api.formats.validate.forEach(function(format) {\n            $(\"#validate-format\").append($(\"<option>\").append(format));\n        });\n    }\n}\n\nfunction submit(tabName, params) {\n    let pane = $(\"#\" + tabName);\n    let input = pane.find(\".input .active input\");\n    let formData;\n    const isFileUpload = input.attr('type') === 'file';\n    if (isFileUpload) {\n        formData = new FormData();\n        formData.append('file', input.prop('files')[0]);\n    } else {\n        params.url = input.val();\n    }\n    $(\"button .spinning\", pane).removeClass('hidden');\n    window.api.request(tabName, params, formData, function(err, data) {\n        pane.find(\"button .spinning\").addClass('hidden');\n        if (err) {\n            return $.notify(err, 'error');\n        }\n    pane.find(\".result a.download\").off('click').on('click', ev => {\n            const outputFormat = $(\"#transform-to\").val();\n            const basename = input.val()\n                .replace(/^.*\\\\/, '')  // C:\\fakepath\\foo.hocr -> foo.hocr\n                .replace(/^.*\\//, '')  // http://bla/foo.hocr?raw=true -> foo.hocr?raw=true\n                .replace(/\\?.*$$/, '') // foo.hocr?raw=true -> foo.hocr\n                ;\n            const extension = outputFormat === 'text' ? 'text'\n                : outputFormat === 'hocr' ? 'html'\n                : outputFormat + '.xml';\n            const type = outputFormat === 'text' ? 'text/plain'\n                : outputFormat === 'hocr' ? 
'text/html'\n                : 'text/xml';\n            const downloadUrl = window.URL.createObjectURL(new Blob([data], {type}));\n            const filename = `${basename}.${extension}`;\n            const dummyLink = document.createElement('a');\n            dummyLink.setAttribute('download', filename);\n            dummyLink.href = downloadUrl;\n            dummyLink.style.display = 'none';\n            document.body.appendChild(dummyLink);\n            dummyLink.click();\n            document.body.removeChild(dummyLink);\n            window.URL.revokeObjectURL(downloadUrl);\n        });\n        pane.find('.result pre code').html(escapeHTML(data));\n        pane.find(\".result\").removeClass('hidden');\n        Prism.highlightAll();\n    });\n}\n\nfunction maybeEnableSubmit() {\n    let el = $(\".tab-pane.active\");\n    let inputSet = !!$(\".input .active input\", el).val();\n    let selects = $(\".formats select\", el);\n    let formatsSet = selects.length == selects.map(function() {return $(this).val();}).length;\n    $(\"button\", el).attr('disabled', !(inputSet && formatsSet));\n}\n\nfunction hashRoute() {\n    let hash = window.location.hash;\n    let pageTab = hash.replace(/-.*/, '');\n    $(\"a[data-toggle='tab'][href='\" + pageTab + \"']\").tab('show');\n    $(\"a[data-toggle='tab'][href='\" + hash + \"']\").tab('show');\n}\n\n$(function() {\n    $.notify.defaults({position: 'bottom right'});\n    const api = window.api = new OcrFileformatAPI('ocr-fileformat.php');\n    $.notify(\"Loading formats\", 'info');\n    api.updateFormats(function(err) {\n        if (err) {\n            $.notify(\"Error loading formats\", \"error\");\n            return;\n        }\n        $.notify(\"Loaded formats\", 'success');\n\n        $(\"#transform-from\").on('change', onChangeFormat);\n\n        $(\"a\").on('shown.bs.tab', maybeEnableSubmit);\n        $(\":input\").on('input change', maybeEnableSubmit);\n        $(\".tab-pane\").on('shown.bs.tab', 
maybeEnableSubmit);\n\n        $(\"#validate-submit\").on('click', function() {\n            submit('validate', {format: $(\"#validate-format\").val()});\n        });\n        $(\"#transform-submit\").on('click', function() {\n            submit('transform', {from: $(\"#transform-from\").val(), to: $(\"#transform-to\").val()});\n        });\n\n        $(\"a[data-toggle='tab']\").on('click tap', function() {window.location.hash = $(this).attr('href');});\n\n        $(window).on('hashchange', hashRoute);\n\n        onChangeFormat();\n        hashRoute();\n    });\n});\n\n/* vim: set sw=4 : */\n"
  },
  {
    "path": "web/ocr-fileformat.php",
    "content": "<?php\n\n// To hide the config\ndefine('IncludingScript', TRUE);\n\n$config = include('config.php');\n\n\n/**\n * Send a Malformed Request error.\n */\nfunction send400($msg)\n{\n  http_response_code(400);\n  header(\"Content-Type: text/plain\");\n  echo $msg;\n}\n\n/**\n * Send a JSON response\n */\nfunction sendJSON($data)\n{\n  header(\"Content-Type: application/json\");\n  echo json_encode($data);\n}\n\n/**\n * Open a bidirectional child process, write data into it and return its stdout and stderr.\n */\nfunction pipeToCommand($cmd, $xml)\n{\n  $descriptorspec = array(\n    0 => array(\"pipe\", \"r\"),\n    1 => array(\"pipe\", \"w\"),\n    2 => array(\"pipe\", \"w\"),\n  );\n  $process = proc_open(\"TERM=dumb \" . $cmd, $descriptorspec, $pipes);\n  $ret = array();\n  if (is_resource($process)) {\n    fwrite($pipes[0], $xml);\n    fclose($pipes[0]);\n    $ret['stdout'] = stream_get_contents($pipes[1]);\n    $ret['stderr'] = stream_get_contents($pipes[2]);\n    fclose($pipes[1]);\n    fclose($pipes[2]);\n    proc_close($process);\n    return $ret;\n  }\n}\n\n/**\n * Transform from one format to another, fetching the data by URL\n */\nfunction transform($url, $from, $to)\n{\n  global $config;\n  if (!array_key_exists($from, $config['formats']['transform'])\n    || !in_array($to, $config['formats']['transform'][$from])) {\n    send400(\"No such transformation '$from -> $to'\");\n    return;\n  }\n  $xml = file_get_contents($url);\n  if (!$xml) {\n    send400(\"Could not retrieve URL '$url'\");\n    return;\n  }\n  header(\"Content-Type: \" . $to === \"html\" ? \"text/html\" : \"application/xml\"); \n  $res = pipeToCommand($config['ocr-transform'] . 
\" -d '$from' '$to' - -- '!indent=yes'\", $xml);\n  echo $res['stdout'];\n}\n\n/**\n * Validate against a schema, data retrieved via HTTP GET.\n */\nfunction validate($url, $format)\n{\n  global $config;\n  if (!in_array($format, $config['formats']['validate'])) {\n    return send400(\"No validator for '$format'\");\n  }\n  header(\"Content-Type: text/plain\");\n  $xml = file_get_contents($url);\n  if (!$xml) {\n    return send400(\"Could not retrieve URL '$url'\");\n  }\n  header(\"Content-Type: text/plain\");\n  $res = pipeToCommand($config['ocr-validate'] . \" \" . $format . \" -\", $xml);\n  echo $res['stdout'];\n  echo $res['stderr'];\n}\n\n/**\n * Handle request\n */\nif (array_key_exists('file', $_FILES)) {\n    $_GET['url'] = $_FILES[\"file\"]['tmp_name'];\n}\n\nswitch ($_GET['do']) {\n  case 'list':\n    sendJSON($config['formats']);\n    break;\n  case 'transform':\n    if (!array_key_exists('url', $_GET)) {\n      return send400(\"Must be either POST with file field 'file' or GET with param 'url'.\");\n    }\n    transform($_GET[\"url\"], $_GET[\"from\"], $_GET[\"to\"]);\n    break;\n  case 'validate':\n    if (!array_key_exists('url', $_GET)) {\n      return send400(\"Must be either POST with file field 'file' or GET with param 'url'.\");\n    }\n    validate($_GET[\"url\"], $_GET[\"format\"]);\n    break;\n  default:\n    send400(\"Unknown/missing action, set 'do' parameter to either 'validate' or 'transform'\");\n    break;\n}\n"
  },
  {
    "path": "xsd/.gitignore",
    "content": "*.xsd\n"
  },
  {
    "path": "xslt/.gitignore",
    "content": "*.xml\n*.xsl\n!alto2.0__alto3.0.xsl\n!page__text.xsl\n!tei__hocr.xsl\n"
  },
  {
    "path": "xslt/alto2.0__alto3.0.xsl",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!-- https://github.com/altoxml/documentation/issues/1#issuecomment-219671094 -->\n<xsl:stylesheet \n   xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n   xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" version=\"1.0\"\n   xmlns:v2=\"http://www.loc.gov/standards/alto/ns-v2#\"\n   xmlns:v3=\"http://www.loc.gov/standards/alto/ns-v3#\">\n   <xsl:template match=\"@* | node()\">\n      <xsl:copy>\n         <xsl:apply-templates select=\"@* | node()\"/>\n      </xsl:copy>\n   </xsl:template>\n\n   <!-- replace xsi:schemaLocation attribute -->\n   <xsl:template match=\"@xsi:schemaLocation\">\n      <xsl:attribute name=\"xsi:schemaLocation\">http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/standards/alto/v3/alto-3-0.xsd</xsl:attribute>\n   </xsl:template>\n\n   <!-- replace namespace  -->\n   <xsl:template match=\"v2:*\">\n      <xsl:element name=\"{local-name()}\" namespace=\"http://www.loc.gov/standards/alto/ns-v3#\">\n         <xsl:apply-templates select=\"@* | node()\"/>\n      </xsl:element>\n   </xsl:template>\n\n</xsl:stylesheet>\n"
  },
  {
    "path": "xslt/page__text.xsl",
    "content": "<xsl:stylesheet\n    version=\"1.0\"\n    xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n    xmlns:pc=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15\">\n  <!-- rid of xml syntax: -->\n  <xsl:output\n      method=\"text\"\n      standalone=\"yes\"\n      omit-xml-declaration=\"yes\"/>\n  <!-- copy text element verbatim: -->\n  <xsl:variable name=\"newline\"><xsl:text>\n</xsl:text>\n  </xsl:variable>\n  <!-- paragraph break -->\n  <xsl:param name=\"pb\" select=\"concat($newline,$newline)\"/>\n  <!-- line break -->\n  <xsl:param name=\"lb\" select=\"$newline\"/>\n  <!-- text order: by element or by explicit ReadingOrder (reading-order|document) -->\n  <xsl:param name=\"order\" select=\"'reading-order'\"/>\n  <!-- hierarchy level to extract text annotation from (region|line|word|glyph|highest) -->\n  <xsl:param name=\"level\" select=\"'highest'\"/>\n  <!-- use key mechanism for IDREFs, because XSD does not support id mechanism -->\n  <xsl:key name=\"textRegion\" match=\"pc:TextRegion\" use=\"@id\"/>\n  <xsl:template match=\"pc:PcGts/pc:Page\">\n    <xsl:variable name=\"regions\" select=\"//pc:TextRegion\"/>\n    <xsl:choose>\n      <xsl:when test=\"starts-with($order, 'reading-order') and pc:ReadingOrder//*[@regionRef|@regionRefIndexed]\">\n        <xsl:call-template name=\"getrefs\">\n          <xsl:with-param name=\"group\" select=\"pc:ReadingOrder/*\"/>\n        </xsl:call-template>\n      </xsl:when>\n      <xsl:otherwise>\n        <xsl:for-each select=\"$regions\">\n          <xsl:call-template name=\"getlines\">\n            <xsl:with-param name=\"region\" select=\".\"/>\n          </xsl:call-template>\n          <xsl:value-of select=\"$pb\"/>\n        </xsl:for-each>\n      </xsl:otherwise>\n    </xsl:choose>\n  </xsl:template>\n  <xsl:template name=\"getlines\">\n    <xsl:param name=\"region\"/>\n    <xsl:choose>\n      <xsl:when test=\"$level='region' or $level='highest' and $region/pc:TextEquiv/pc:Unicode\">\n        
<xsl:value-of select=\"$region/pc:TextEquiv[1]/pc:Unicode\" disable-output-escaping=\"yes\"/>\n      </xsl:when>\n      <xsl:otherwise>\n        <xsl:for-each select=\"$region/pc:TextLine\">\n          <xsl:if test=\"position()>1\">\n            <xsl:value-of select=\"$lb\"/>\n          </xsl:if>\n          <xsl:choose>\n            <xsl:when test=\"$level='line' or $level='highest' and pc:TextEquiv/pc:Unicode\">\n              <xsl:value-of select=\"pc:TextEquiv[1]/pc:Unicode\" disable-output-escaping=\"yes\"/>\n            </xsl:when>\n            <xsl:otherwise>\n              <xsl:for-each select=\"pc:Word\">\n                <xsl:if test=\"position()>1\">\n                  <xsl:text> </xsl:text>\n                </xsl:if>\n                <xsl:choose>\n                  <xsl:when test=\"$level='word' or $level='highest' and pc:TextEquiv/pc:Unicode\">\n                    <xsl:value-of select=\"pc:TextEquiv[1]/pc:Unicode\" disable-output-escaping=\"yes\"/>\n                  </xsl:when>\n                  <xsl:otherwise>\n                    <xsl:for-each select=\"pc:Glyph\">\n                      <xsl:value-of select=\"pc:TextEquiv[1]/pc:Unicode\" disable-output-escaping=\"yes\"/>\n                    </xsl:for-each>\n                  </xsl:otherwise>\n                </xsl:choose> <!-- word level? -->\n              </xsl:for-each>\n            </xsl:otherwise>\n          </xsl:choose> <!-- line level? -->\n        </xsl:for-each>\n      </xsl:otherwise>\n    </xsl:choose> <!-- region level? 
-->\n  </xsl:template>\n  <xsl:template name=\"getrefs\">\n    <xsl:param name=\"group\"/>\n    <xsl:for-each select=\"$group/*\">\n      <xsl:sort select=\"@index\" data-type=\"number\"/>\n      <!--<xsl:variable name=\"region\" select=\"id(@regionRef|@regionRefIndexed)\"/>-->\n      <xsl:variable name=\"region\" select=\"key('textRegion', @regionRef|@regionRefIndexed)\"/>\n      <xsl:if test=\"$region\">\n        <xsl:call-template name=\"getlines\">\n          <xsl:with-param name=\"region\" select=\"$region\"/>\n        </xsl:call-template>\n        <xsl:value-of select=\"$pb\"/>\n      </xsl:if>\n      <!-- UnorderedGroup(Indexed) and OrderedGroup(Indexed): recurse -->\n      <xsl:if test=\"contains(local-name(.), 'Group')\">\n        <xsl:call-template name=\"getrefs\">\n          <xsl:with-param name=\"group\" select=\".\"/>\n        </xsl:call-template>\n      </xsl:if>\n    </xsl:for-each>\n  </xsl:template>\n  <!-- override implicit rules copying elements and attributes: -->\n  <xsl:template match=\"text()\"/>\n</xsl:stylesheet>\n"
  },
  {
    "path": "xslt/tei__hocr.xsl",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<xsl:stylesheet\n  xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n  xmlns:util=\"http://example/com/util/namespace\"\n  version=\"2.0\"\n  exclude-result-prefixes=\"xsl util\"\n  xmlns=\"http://www.w3.org/1999/xhtml\">\n  \n  <xsl:output method=\"html\" encoding=\"UTF-8\" indent=\"yes\"\n              omit-xml-declaration=\"yes\" />\n  <xsl:param name=\"docTitle\" select=\"'document_name'\"/>\n  <xsl:param name=\"langs\"    select=\"'de'\"/>\n  <xsl:param name=\"npages\"   select=\"1\"/>\n  <xsl:param name=\"scripts\"  select=\"'Latg'\"/>\n  <xsl:param name=\"system\"   select=\"'unknown'\"/>\n  <xsl:param name=\"left\"     select=\"-1\"/>\n  <xsl:param name=\"top\"      select=\"-1\"/>\n  <xsl:param name=\"width\"    select=\"-1\"/>\n  <xsl:param name=\"height\"   select=\"-1\"/>\n\n  <!-- converts comma-separated to space-separated coordinates -->\n  <xsl:function name=\"util:coords\">\n    <xsl:param name=\"coords\" />\n    <xsl:value-of select=\"replace($coords, ',', ' ')\" />\n  </xsl:function>\n\n  <!-- calculates bounding box of all nodes with attribute 'function' -->\n  <xsl:function name=\"util:get-pagebox\">\n    <xsl:param name=\"nodes\" />\n    <xsl:variable name=\"bbox\">\n      <xsl:choose>\n        <xsl:when test=\"$left = -1\">\n          <xsl:for-each select=\"$nodes\">\n            <xsl:sort select=\"tokenize(./@function, ',')[1]\" data-type=\"number\" order=\"ascending\" />\n            <xsl:if test=\"position() = 1\">\n              <xsl:value-of select=\"tokenize(./@function, ',')[1]\" />\n            </xsl:if>\n            <xsl:text> </xsl:text>\n          </xsl:for-each>\n        </xsl:when>\n        <xsl:otherwise>\n          <xsl:value-of select=\"$left\" />\n          <xsl:text> </xsl:text>\n        </xsl:otherwise>\n      </xsl:choose>\n      <xsl:choose>\n        <xsl:when test=\"$top = -1\">\n          <xsl:for-each select=\"$nodes\">\n            <xsl:sort 
select=\"tokenize(./@function, ',')[2]\" data-type=\"number\" order=\"ascending\" />\n            <xsl:if test=\"position() = 1\">\n              <xsl:value-of select=\"tokenize(./@function, ',')[2]\" />\n            </xsl:if>\n            <xsl:text> </xsl:text>\n          </xsl:for-each>\n        </xsl:when>\n        <xsl:otherwise>\n          <xsl:value-of select=\"$top\" />\n          <xsl:text> </xsl:text>\n        </xsl:otherwise>\n      </xsl:choose>\n      <xsl:choose>\n        <xsl:when test=\"$width = -1\">\n          <xsl:for-each select=\"$nodes\">\n            <xsl:sort select=\"tokenize(./@function, ',')[3]\" data-type=\"number\" order=\"descending\" />\n            <xsl:if test=\"position() = 1\">\n              <xsl:value-of select=\"tokenize(./@function, ',')[3]\" />\n            </xsl:if>\n            <xsl:text> </xsl:text>\n          </xsl:for-each>\n        </xsl:when>\n        <xsl:otherwise>\n          <xsl:value-of select=\"$width\" />\n          <xsl:text> </xsl:text>\n        </xsl:otherwise>\n      </xsl:choose>\n      <xsl:choose>\n        <xsl:when test=\"$height = -1\">\n          <xsl:for-each select=\"$nodes\">\n            <xsl:sort select=\"tokenize(./@function, ',')[4]\" data-type=\"number\" order=\"descending\" />\n            <xsl:if test=\"position() = 1\">\n              <xsl:value-of select=\"tokenize(./@function, ',')[4]\" />\n            </xsl:if>\n            <xsl:text> </xsl:text>\n          </xsl:for-each>\n        </xsl:when>\n        <xsl:otherwise>\n          <xsl:value-of select=\"$height\" />\n          <xsl:text> </xsl:text>\n        </xsl:otherwise>\n      </xsl:choose>\n    </xsl:variable>\n    <xsl:value-of select=\"normalize-space($bbox)\" />\n  </xsl:function>\n\n  <!-- Start of transformation -->\n  <xsl:template match=\"/\">\n    <html>\n      <head>\n        <title>\n          <xsl:value-of select=\"$docTitle\" />\n        </title>\n        <meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" 
/>\n        <meta name=\"ocr-system\" content=\"{$system}\" />\n        <meta name=\"ocr-capabilities\" content=\"ocr_page ocr_carea ocr_par ocr_line ocrx_word\" />\n        <meta name=\"ocr-langs\" content=\"{$langs}\" />\n        <meta name=\"ocr-number-of-pages\" content=\"{$npages}\" />\n        <meta name=\"ocr-scripts\" content=\"{$scripts}\" />\n      </head>\n      <xsl:apply-templates select=\".//text\" />\n    </html>\n  </xsl:template>\n\n  <xsl:template match=\"text\">\n    <body>\n        <xsl:apply-templates select=\".//milestone\" />\n    </body>\n  </xsl:template>\n\n  <!-- Page -->\n  <xsl:template match=\"milestone[@type='page']\">\n    <xsl:variable name=\"pageno\" select=\"@n\" />\n    <xsl:variable name=\"pagenodes\" select=\"//*[@function]\" />\n    <xsl:variable name=\"pagebox\" select=\"util:get-pagebox($pagenodes)\" />\n    \n    <div class=\"ocr_page\" id=\"page_{$pageno}\" title=\"image &quot;{$docTitle}&quot;; bbox {$pagebox}; ppageno {$pageno - 1}\">\n      <div class=\"ocr_carea\" id=\"block_{$pageno}\" title=\"bbox {$pagebox}\">\n        <xsl:apply-templates select=\"//p|//figure\" />\n      </div>\n    </div>\n  </xsl:template>\n\n  <!-- Paragraph -->\n  <xsl:template match=\"p\">\n    <xsl:variable name=\"pid\" select=\"@id\" />\n    <p class=\"ocr_par\" id=\"{$pid}\">\n      <xsl:apply-templates select=\"./w\" />\n    </p>\n  </xsl:template>\n\n  <!-- Word -->\n  <xsl:template match=\"w\">\n    <xsl:variable name=\"bbox\" select=\"util:coords(@function)\" />\n    <span class=\"ocrx_word\" title=\"bbox {$bbox}\">\n      <xsl:value-of select=\"text()\" />\n    </span>\n  </xsl:template>\n  \n  <!-- Figure -->\n  <xsl:template match=\"figure\">\n    <xsl:variable name=\"bbox\" select=\"util:coords(@function)\" />\n    <div class=\"ocr_float\" title=\"bbox {$bbox}\" />\n  </xsl:template>\n\n  <!-- Unmatched Elements -->\n   <xsl:template match=\"*\">\n    <xsl:message terminate=\"no\">\n      WARNING: Unmatched element: <xsl:value-of 
select=\"name()\"/>\n    </xsl:message>\n    <xsl:apply-templates/>\n  </xsl:template>\n\n</xsl:stylesheet>\n"
  }
]